In [17]:
import csv
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle

In [18]:
df_train = pd.read_csv(r'..\train_dump_v2.csv')

In [19]:
with open(r"..\PickleFiles\trained_ohe.pkl", "rb") as ohe_file:
    trained_ohe = pickle.load(ohe_file)

ohetransform = trained_ohe.transform(df_train[['geo_loc', 'tld']]).astype('int8')
df_train = pd.concat([df_train,ohetransform], axis=1).drop(columns=['geo_loc', 'tld'])

In [20]:
def transformIP(ip_add):
    if ip_add == '0':
        return 0
    else:
        parts = ip_add.split('.')
        return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])

df_train['ip_add'] = df_train.ip_add.apply(transformIP)

In [21]:
column = df_train.pop('label')
df_train[column.name] = column

In [22]:
df_test = pd.read_csv(r'..\newTest.csv')

In [23]:
df_test.ip_add.fillna('0', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test.ip_add.fillna('0', inplace=True)


In [24]:
ohetransform = trained_ohe.transform(df_test[['geo_loc', 'tld']]).astype('int8')
df_test = pd.concat([df_test,ohetransform], axis=1).drop(columns=['geo_loc', 'tld'])

In [25]:
df_test['ip_add'] = df_test.ip_add.apply(transformIP)

In [26]:
column = df_test.pop('label')
df_test[column.name] = column

In [27]:
# remove the url column thats read from the csv
df_train = df_train.iloc[:, 1:]
df_test = df_test.iloc[:, 1:]

In [28]:
x_train = df_train[df_train.columns[:-1]]
y_train = df_train['label']

x_test = df_test[df_test.columns[:-1]]
y_test = df_test['label']

In [44]:
model = RandomForestClassifier(class_weight={0: 20, 1: 1}, max_depth=30)
model.fit(x_train, y_train)

In [45]:
Imp = model.feature_importances_
feature_names = x_train.columns

sorted_idx = Imp.argsort()[::-1]
for f in range(x_train.shape[1]):
    print("%d. %s (%f)" % (f + 1, feature_names[sorted_idx[f]], Imp[sorted_idx[f]]))

1. js_len (0.532317)
2. who_is (0.318527)
3. tld_org (0.021814)
4. tld_edu (0.019246)
5. url_numOf_digits (0.017722)
6. tld_de (0.016752)
7. url_len (0.012156)
8. tld_com (0.009366)
9. url_entropy (0.008028)
10. ip_add (0.004127)
11. tld_jp (0.003846)
12. tld_co.uk (0.002484)
13. tld_ca (0.002039)
14. tld_net (0.001961)
15. tld_co.jp (0.001942)
16. tld_pl (0.001895)
17. tld_gov (0.001643)
18. tld_org.uk (0.001459)
19. tld_ru (0.001450)
20. tld_ac.uk (0.001413)
21. tld_tv (0.001384)
22. tld_dk (0.001161)
23. tld_fr (0.001038)
24. tld_at (0.000956)
25. tld_com.br (0.000933)
26. tld_ne.jp (0.000752)
27. tld_biz (0.000714)
28. tld_ws (0.000692)
29. tld_ch (0.000597)
30. tld_nl (0.000519)
31. tld_ie (0.000428)
32. tld_com.au (0.000369)
33. tld_edu.au (0.000326)
34. tld_cz (0.000310)
35. tld_org.au (0.000280)
36. geo_loc_United States (0.000247)
37. tld_net.au (0.000246)
38. tld_blogspot.com (0.000219)
39. tld_gr.jp (0.000208)
40. geo_loc_China (0.000203)
41. geo_loc_Japan (0.000195)
42. tld

In [46]:
y_pred = model.predict(x_test)

In [47]:
classification = classification_report(y_test, y_pred, zero_division=1)
print("\nClassification Report:\n", classification)


Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.55      0.54        20
           1       0.53      0.50      0.51        20

    accuracy                           0.53        40
   macro avg       0.53      0.53      0.52        40
weighted avg       0.53      0.53      0.52        40



In [48]:
with open(r'..\PickleFiles\Model.pkl', 'wb') as model_pkl:
    pickle.dump(model, model_pkl)