In [1]:
import json
import pandas as pd
import glob
import geoip2.database
import ipaddress

In [2]:
# Function to read and process log files
def process_log_file(file_path):
    """Load a JSON-lines log file into a DataFrame.

    Every non-blank line is parsed with ``json.loads``. Lines that are not
    valid JSON, or that decode to something other than a JSON object, are
    reported on stdout and skipped so that one bad record does not abort the
    whole load.

    Parameters
    ----------
    file_path : str or path-like
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per parsed JSON object, in file order. The frame is empty
        when no line could be parsed.
    """
    logs = []
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline) carry no record; skip them
            # quietly instead of reporting them as decode errors.
            if not line.strip():
                continue
            try:
                log = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError: {e}. Skipping line: {line.strip()}")
                continue
            # Only JSON objects map onto DataFrame rows; a bare number, string
            # or list mixed in with the records would corrupt the row list.
            if not isinstance(log, dict):
                print(f"Skipping non-object JSON line: {line.strip()}")
                continue
            logs.append(log)

    return pd.DataFrame(logs)


In [3]:
# Function to get the country from an IP address
def get_country(ip):
    """Return the English country name for an IP address.

    Relies on a module-level ``reader`` (the GeoIP database reader) that must
    be open when a public address is looked up.

    Parameters
    ----------
    ip : str
        IPv4 or IPv6 address to resolve.

    Returns
    -------
    str
        ``"Private"`` for addresses in a private range, the English country
        name when the lookup succeeds, and ``"Unknown"`` when the value is
        not a valid IP address, the lookup fails, or no English name is
        available.
    """
    # Validate first so malformed input is reported as such instead of being
    # mislabelled as "not in the database".
    try:
        address = ipaddress.ip_address(ip)
    except ValueError as e:
        print(f"Error: {e}")
        return "Unknown"

    # Private ranges are never in a GeoIP database; skip the lookup.
    if address.is_private:
        return "Private"

    try:
        response = reader.city(ip)
    except Exception as e:
        # Best-effort lookup: keep going on any failure, but print the real
        # reason rather than a fixed (and possibly wrong) message.
        print(f"Error: {e}")
        return "Unknown"

    if response is None:
        return "Unknown"
    # A record may have no English country name; treat that as unknown
    # instead of relying on a KeyError being swallowed.
    return response.country.names.get('en') or "Unknown"


In [4]:
# Get all log file names.
# glob returns matches in arbitrary order, so sort them to load the files in
# the same order on every run.
log_files = sorted(glob.glob('cowrie/cowrie.json.*'))

In [5]:
# Holds one DataFrame per log file
all_logs = []

In [6]:
# Process all log files into one DataFrame per file.
# The list is rebuilt here rather than appended to, so re-running this cell
# no longer duplicates every log record.
all_logs = [process_log_file(log_file) for log_file in log_files]

In [7]:
# Stack the per-file frames row-wise into one frame with a fresh index
df = pd.concat(objs=all_logs, axis=0, ignore_index=True)

In [8]:
# Split the combined log into login attempts and entered commands
login_events = ['cowrie.login.success', 'cowrie.login.failed']
auth_logs = df[df['eventid'].isin(login_events)]
cmd_logs = df[df['eventid'].eq('cowrie.command.input')]

In [9]:
# Strip surrounding whitespace from the text fields of the login records
auth_text_columns = ('src_ip', 'username', 'password')
auth_logs = auth_logs.assign(
    **{column: auth_logs[column].str.strip() for column in auth_text_columns}
)

In [10]:
# Strip surrounding whitespace from the text fields of the command records
cmd_text_columns = ('src_ip', 'input')
cmd_logs = cmd_logs.assign(
    **{column: cmd_logs[column].str.strip() for column in cmd_text_columns}
)

In [11]:
# Left-join every login record to the commands with the same source IP and
# session; columns present on both sides get the '_auth' / '_cmd' suffixes
merged_logs = auth_logs.merge(
    cmd_logs,
    how='left',
    on=['src_ip', 'session'],
    suffixes=('_auth', '_cmd'),
)


In [12]:
# Function to label attacks as severe or non-severe
def label_attack(row):
    """Label one merged log row as severe (1) or non-severe (0).

    A row is severe only when the login succeeded and the associated command
    contains at least one whitespace-separated token.

    Parameters
    ----------
    row : mapping
        Row exposing ``'eventid_auth'`` and ``'input_cmd'``.

    Returns
    -------
    int
        ``1`` for a severe attack, ``0`` otherwise.
    """
    # Failed logins are never severe; the command is not inspected at all.
    if row['eventid_auth'] != 'cowrie.login.success':
        return 0

    command = row['input_cmd']
    # No command recorded for this login.
    if pd.isna(command):
        return 0

    tokens = command.split()
    return 1 if len(tokens) > 0 else 0

In [13]:
# Compute the severity label row by row and store it as a new column
severity_labels = merged_logs.apply(label_attack, axis='columns')
merged_logs['severity'] = severity_labels

In [14]:
# Keep only the columns that go into the machine learning dataset
ml_columns = [
    'src_ip',
    'username_auth',
    'password_auth',
    'input_cmd',
    'timestamp_auth',
    'severity',
]
ml_dataset = merged_logs[ml_columns]

In [15]:
# Remove duplicate rows and shuffle the dataset.
# A fixed random_state makes the shuffle reproducible: the same input frame
# always yields the same row order.
ml_dataset = (
    ml_dataset
    .drop_duplicates()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [16]:
# Open the GeoIP database used for the IP-to-country lookups
geoip_db_path = 'GeoLite2-City.mmdb'  # Replace this with the path to your .mmdb file
reader = geoip2.database.Reader(geoip_db_path)


In [17]:
# Resolve each source IP to a country and store the result as a new column
country_by_row = ml_dataset['src_ip'].map(get_country)
ml_dataset['country'] = country_by_row


In [18]:
# Close the geoip2 reader now that the country lookups are done.
# get_country uses this reader, so it must not be called after this point
# without opening the reader again.
reader.close()

In [19]:
# Write the dataset to a CSV file, leaving out the row index
output_csv_path = 'cowrie_honeypot_ml_dataset2.csv'
ml_dataset.to_csv(output_csv_path, index=False)

In [20]:
# Print the resulting dataset as plain text (the output below is abbreviated
# with "..." for the rows that are not shown)
print(ml_dataset)

                src_ip username_auth password_auth  \
0         223.13.31.85        666666        666666   
1         218.92.0.192          root    Voiture1@3   
2         223.13.31.85         admin  7ujMko0admin   
3       123.185.32.183          root       cat1029   
4      222.246.126.216          root          ikwb   
...                ...           ...           ...   
17228     218.92.0.192          root       dreamer   
17229    152.37.72.238          root          1234   
17230     60.53.206.52          tech          tech   
17231   110.182.96.227          root        Zte521   
17232   103.154.65.189       aaliyah         barby   

                                               input_cmd  \
0                                                    NaN   
1                                                    NaN   
2      cd /dev/shm; cat .s || cp /bin/echo .s; /bin/b...   
3                                            rm .s; exit   
4                                     /bin/busybox 