<a href="https://colab.research.google.com/github/Hacxmr/log-analysis/blob/main/Supervised_kyoto2006%2B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()

Saving 2006.zip to 2006.zip


In [2]:
!pip install pandas numpy scikit-learn matplotlib seaborn







In [3]:
from zipfile import ZipFile

zip_path = '/content/2006.zip'  # update this if filename is different

with ZipFile(zip_path, 'r') as zip_ref:
    file_list = zip_ref.namelist()
    print(file_list[:10])  # show first 10 files


['2006/', '2006/Kyoto2016/', '2006/Kyoto2016/2006/', '2006/Kyoto2016/2006/11/', '2006/Kyoto2016/2006/11/20061101.txt', '2006/Kyoto2016/2006/11/20061102.txt', '2006/Kyoto2016/2006/11/20061103.txt', '2006/Kyoto2016/2006/11/20061104.txt', '2006/Kyoto2016/2006/11/20061105.txt', '2006/Kyoto2016/2006/11/20061106.txt']


In [7]:
import glob
import os
log_files = sorted(glob.glob(os.path.join(extract_dir, '**', '*.txt'), recursive=True))
print(log_files[:5])  # show first few txt files found in nested folders


['/content/kyoto_logs/2006/Kyoto2016/2006/11/20061101.txt', '/content/kyoto_logs/2006/Kyoto2016/2006/11/20061102.txt', '/content/kyoto_logs/2006/Kyoto2016/2006/11/20061103.txt', '/content/kyoto_logs/2006/Kyoto2016/2006/11/20061104.txt', '/content/kyoto_logs/2006/Kyoto2016/2006/11/20061105.txt']


In [12]:
print("Files loaded:", len(log_files))
print("First few files:", log_files[:3])


Files loaded: 61
First few files: ['/content/kyoto_logs/2006/Kyoto2016/2006/11/20061101.txt', '/content/kyoto_logs/2006/Kyoto2016/2006/11/20061102.txt', '/content/kyoto_logs/2006/Kyoto2016/2006/11/20061103.txt']


In [13]:
import pandas as pd

# Initialize empty list to hold all data
all_data = []

# Loop through the first N files (e.g., 10 to avoid memory issues during testing)
for file in log_files[:10]:  # use log_files for all files if confirmed working
    try:
        df = pd.read_csv(file, sep='\t', header=None, engine='python')
        all_data.append(df)
    except Exception as e:
        print(f"Error reading {file}:", e)

# Concatenate all data
df = pd.concat(all_data, ignore_index=True)

print("Shape:", df.shape)
print(df.head())


Shape: (370942, 24)
             0      1       2      3   4    5    6    7   8   9   ...  14  15  \
0     27.561208   smtp    3179    175   0  0.0  0.0  0.0   0   0  ...   0   0   
1      0.000000  other       0      0   0  0.0  0.0  0.0   0   0  ...   0   0   
2  86366.249616  other  244776      0   0  0.0  0.0  0.0   0   0  ...   0   0   
3   2994.374758  other   15744  18154   0  0.0  0.0  0.0   0   0  ...   0   0   
4      4.749378   smtp    7895    244   0  0.0  0.0  0.0   0   0  ...   0   0   

   16 17                                       18     19  \
0   0  1  fda2:69aa:1f1a:84b0:130d:2736:3fa0:42da   2161   
1   0 -1  fda2:69aa:1f1a:0104:3fff:571a:ff2c:00a5    138   
2   0 -1  fda2:69aa:1f1a:540c:7d80:2750:07a6:28a5  32770   
3   0 -1  fda2:69aa:1f1a:3aef:7af3:3027:3045:7ff2   1400   
4   0  1  fda2:69aa:1f1a:381e:25aa:0bff:12e8:0365   1806   

                                        20    21        22   23  
0  fda2:69aa:1f1a:61a4:7dc5:27f2:0713:0f0e    25  00:00:09  tcp  


In [14]:
df.columns = [
    'date', 'time', 'duration', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port',
    'flag', 'tos', 'pkt_size', 'ttl', 'src_mac', 'dst_mac', 'label',
    'service', 'src_country', 'dst_country', 'unknown1', 'unknown2', 'unknown3',
    'is_attack', 'label_detail', 'extra'
]


In [15]:
print(df.shape[1])  # total number of columns


24


In [16]:
# Drop irrelevant columns
columns_to_keep = ['duration', 'protocol', 'src_port', 'dst_port', 'flag', 'ttl', 'label']
df = df[columns_to_keep]

# Remove rows with NaNs in important features
df = df.dropna()

# Encode categorical values
df['protocol'] = df['protocol'].astype('category').cat.codes
df['flag'] = df['flag'].astype('category').cat.codes

# Convert all other columns to numeric (if needed)
for col in ['duration', 'src_port', 'dst_port', 'ttl', 'label']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Final clean
df = df.dropna()


In [24]:
df['target'] = df['label'].apply(lambda x: 1 if x == 1 else 0)  # 1 = attack, 0 = normal



In [25]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['label', 'target'])
y = df['target']

# Final sanity check
print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)


X shape: (350357, 6)
y distribution:
 target
0    350357
Name: count, dtype: int64


In [26]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0
Confusion Matrix:
 [[105108]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    105108

    accuracy                           1.00    105108
   macro avg       1.00      1.00      1.00    105108
weighted avg       1.00      1.00      1.00    105108



