In [10]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

import seaborn as sns
from scipy import stats

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [11]:
import pandas as pd

internet_data = pd.read_csv('preprocessed_internet_data.csv')

internet_data

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Bytes Sent,Bytes Received,Elapsed Time (sec),Packets Sent,Packets Received,Action
0,57222,53,54587,53,4.553877,4.430817,30,0.693147,0.693147,0
1,56258,3389,56258,3389,7.378384,8.061171,17,2.397895,2.302585,0
2,6881,50321,43265,50321,4.779123,4.795791,1199,0.693147,0.693147,0
3,50553,3389,50553,3389,7.271704,7.544332,17,2.197225,2.079442,0
4,50002,443,45848,443,8.821585,9.829895,16,2.639057,2.944439,0
...,...,...,...,...,...,...,...,...,...,...
65527,63691,80,13237,80,5.262690,4.812184,15,1.609438,1.098612,0
65528,50964,80,13485,80,11.117109,15.344482,77,6.893656,8.213653,0
65529,54871,445,0,0,4.262680,0.000000,0,0.693147,0.000000,2
65530,54870,445,0,0,4.262680,0.000000,0,0.693147,0.000000,2


Implement Random Forest on Full Feature Set

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import pickle

Prepare features and target variable

In [13]:
X = internet_data.drop(columns=['Action'])
y = internet_data['Action']

Split the data into training and testing sets

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Scale Features

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Train the Random Forest Model

In [16]:
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
model.fit(X_train_scaled, y_train)

Evaluate the Model (Not needed for this assignment)

In [17]:
y_pred = model.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11330
           1       0.99      1.00      1.00      4485
           2       1.00      1.00      1.00      3830
           3       0.80      0.27      0.40        15

    accuracy                           1.00     19660
   macro avg       0.95      0.82      0.85     19660
weighted avg       1.00      1.00      1.00     19660

Confusion Matrix:
[[11326     4     0     0]
 [    0  4475     9     1]
 [    0    14  3816     0]
 [    0    11     0     4]]


Save the model using Pickle

In [18]:
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(model, f)

Load the model and make new predictions

In [20]:
with open('random_forest_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

new_predictions = loaded_model.predict(X_test_scaled)

[1 0 0 ... 2 0 0]
