## Model Training

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.linear_model import LogisticRegressionCV ,SGDClassifier
from sklearn.metrics import accuracy_score ,classification_report,confusion_matrix,precision_score
from sklearn.model_selection import RandomizedSearchCV ,StratifiedKFold,StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from datetime import datetime

In [6]:
train_data = pd.read_csv(r'/content/drive/MyDrive/data/train_data.csv')
test_data = pd.read_csv(r'/content/drive/MyDrive/data/test_data.csv')
y_test=test_data
x_test = y_test['IncidentGrade']
y_test.drop(columns=['IncidentGrade'],axis=1,inplace=True)
y_train = train_data['IncidentGrade']
train_data.drop(columns=['IncidentGrade'],axis=1,inplace=True)
x_train = train_data

In [7]:
x_train = x_train.astype(np.float32)    #DownSampling

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.8, random_state=42)

for train_index, test_index in sss.split(x_train, y_train):
    X_sample, y_sample = x_train.iloc[train_index], y_train.iloc[train_index]

In [8]:
X_sample.drop(columns=['Unnamed: 0'],axis=1,inplace=True)

X_sample

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_sample.drop(columns=['Unnamed: 0'],axis=1,inplace=True)


Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,EntityType,EvidenceRole,...,ApplicationName,OAuthApplicationId,FileName,FolderPath,ResourceIdName,OSFamily,OSVersion,CountryCode,State,City
2407538,0.829383,-0.288485,-0.538143,-0.849940,-0.329841,-0.252060,-0.257062,0.293712,1.246643,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
7741501,0.915931,1.280889,-0.314734,0.860782,-1.225249,5.569540,0.418417,0.817514,-1.075726,0.909463,...,0.153251,0.015841,-3.131810,-3.163470,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
3030199,-1.334329,-0.443613,-0.584379,-0.884586,1.460974,-0.226779,-0.252612,0.031811,-0.611252,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
393707,-1.680523,-0.384148,0.098939,-0.543027,-1.225249,-0.252060,-0.257062,0.293712,1.246643,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
3882447,1.504461,-0.355707,-0.572487,-0.854817,-0.329841,-0.139444,-0.254008,-0.491992,0.085459,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2471046,1.712177,1.477384,1.284505,0.046695,-0.329841,0.989023,-0.214834,0.293712,-0.959607,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
9449036,-0.866968,-0.311755,0.429170,0.381953,-1.225249,-0.242867,-0.254444,1.865119,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
4172426,-1.265091,-0.469468,-0.584064,-0.209171,1.460974,-0.254359,-0.257149,0.293712,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
5601540,1.660248,-0.244533,-0.577907,0.813508,-1.225249,-0.249762,-0.256975,-2.063399,0.782169,0.909463,...,0.153251,0.015841,-3.216761,-3.341250,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155


In [7]:
y_sample

Unnamed: 0,IncidentGrade
2407538,0
7741501,2
3030199,2
393707,0
3882447,0
...,...
2471046,2
9449036,0
4172426,2
5601540,1


In [9]:
y_sample.value_counts() / len(y_sample) * 100

Unnamed: 0_level_0,count
IncidentGrade,Unnamed: 1_level_1
0,43.734655
2,34.914076
1,21.351269


In [8]:
model  = LogisticRegressionCV(multi_class='ovr')
model

In [9]:
kfold = StratifiedKFold(n_splits=5 , random_state=42 ,shuffle=True)


In [10]:
params ={

    # Regularization strength
    'penalty': ['elasticnet'],       # Regularization type
    'max_iter': [100,200,500],
        'solver':['saga']  ,
          'l1_ratios': [[0.1], [0.5], [0.9]]   # Maximum iterations

}

In [11]:
grid =RandomizedSearchCV(param_distributions=params , cv=kfold ,estimator=model,n_jobs=1,error_score="raise")
grid

In [11]:
X_sample =X_sample.sample(frac=0.2, random_state=42)

In [12]:
y_sample =y_sample.sample(frac=0.2, random_state=42)

In [11]:
X_sample

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,EntityType,EvidenceRole,...,ApplicationName,OAuthApplicationId,FileName,FolderPath,ResourceIdName,OSFamily,OSVersion,CountryCode,State,City
3238050,0.431260,-0.068721,-0.580737,1.806842,0.565566,-0.254359,-0.257149,0.293712,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
3170122,-1.697833,8.419344,-0.407545,1.422037,0.565566,-0.210691,-0.255840,0.293712,-0.611252,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,-3.195587,-3.679656,-3.788222
2405068,0.119685,-0.469468,-0.583096,-0.170914,0.565566,-0.238271,-0.256626,0.293712,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
1059080,1.123648,2.754601,0.361210,-0.122552,-0.329841,0.230583,-0.240572,-0.753893,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
2214276,-0.797729,-0.353122,0.310630,-0.052325,0.565566,-0.245165,-0.256887,0.293712,-0.611252,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,-3.529997,0.266429,0.266155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6134966,-0.070722,0.960292,-0.474851,-0.738804,-0.329841,-0.252060,-0.257062,0.293712,0.201577,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
6426667,1.642938,-0.469468,-0.583833,0.199024,-1.225249,-0.254359,-0.257149,0.293712,-1.656318,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
8941647,0.846692,-0.469468,-0.584188,1.277660,0.565566,-0.254359,-0.257149,0.293712,-1.656318,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
1209280,-0.503464,-0.461711,0.169886,-0.260645,1.460974,-0.249762,-0.256975,-2.063399,1.246643,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155


In [15]:
#sgd_clf = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42,class_weight='balanced')
#sgd_clf.fit(X_sample, y_sample)

In [13]:
y_test.drop(columns=['Usage'],axis=1,inplace=True)
y_test

Unnamed: 0.1,Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,EntityType,...,ApplicationName,OAuthApplicationId,FileName,FolderPath,ResourceIdName,OSFamily,OSVersion,CountryCode,State,City
0,0,0.811065,1.297831,-0.475394,-0.703252,-0.321938,1.036414,-0.188338,0.541679,1.434189,...,0.153044,0.015194,0.337131,0.314873,0.02729,0.143503,0.143616,0.304288,0.276908,0.276735
1,1,1.125117,-0.465577,0.198132,0.494561,-1.222142,-0.242079,-0.242067,-2.082191,-0.256344,...,0.153044,0.015194,0.337131,0.314873,0.02729,-6.998030,-6.976374,0.304288,0.276908,0.276735
2,2,0.880854,-0.082696,-0.301649,-0.605418,1.478471,6.934134,0.792758,0.541679,0.783984,...,0.153044,0.015194,-3.190448,-3.293518,0.02729,0.143503,0.143616,0.304288,0.276908,0.276735
3,3,-1.596668,0.124922,-0.445473,1.121763,-1.222142,-0.246977,-0.242259,0.279292,-1.296673,...,0.153044,0.015194,0.337131,0.314873,0.02729,0.143503,0.143616,0.304288,0.276908,0.276735
4,4,-0.671959,0.505106,-0.510619,-0.881811,-1.222142,-0.180848,-0.240535,-1.032643,1.434189,...,0.153044,0.015194,0.337131,0.314873,0.02729,0.143503,0.143616,0.304288,0.276908,0.276735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4147987,4147987,-0.619617,0.232776,-0.549626,-0.395693,1.478471,0.093465,-0.230766,0.279292,1.434189,...,0.153044,0.015194,0.337131,0.314873,0.02729,0.143503,0.143616,0.304288,0.276908,0.276735
4147988,4147988,-0.828985,3.093594,-0.312220,-0.225027,-0.321938,0.289402,-0.223487,0.279292,1.434189,...,0.153044,0.015194,0.337131,0.314873,0.02729,0.143503,0.143616,0.304288,0.276908,0.276735
4147989,4147989,0.915749,-0.346938,-0.544231,-0.878957,1.478471,-0.107371,-0.239481,-2.082191,-0.646468,...,0.153044,0.015194,0.337131,0.314873,0.02729,0.143503,0.143616,0.304288,0.276908,0.276735
4147990,4147990,-0.445144,-0.284922,-0.539226,-0.852808,-1.222142,-0.244528,-0.242163,0.279292,0.133779,...,0.153044,0.015194,0.337131,0.314873,0.02729,0.143503,0.143616,0.304288,0.276908,0.276735


In [14]:
#x_train = x_train.astype(np.float32)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.8, random_state=42)

for train_index, test_index in sss.split(x_train, y_train):
    x_test, y_test = x_train.iloc[train_index], y_train.iloc[train_index]

In [14]:
y_test

Unnamed: 0,IncidentGrade
2407538,0
7741501,2
3030199,2
393707,0
3882447,0
...,...
2471046,2
9449036,0
4172426,2
5601540,1


In [15]:
x_test_sample =x_test.sample(frac=0.25, random_state=42)
y_test_sample =y_test.sample(frac=0.25, random_state=42)

In [16]:
y_test_sample

Unnamed: 0,IncidentGrade
3238050,2
3170122,1
2405068,2
1059080,1
2214276,0
...,...
6683008,1
9338683,0
4273247,2
9038101,1


In [16]:
x_test_sample
x_test_sample.drop(columns=['Unnamed: 0'],axis=1,inplace=True)
x_test_sample

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,EntityType,EvidenceRole,...,ApplicationName,OAuthApplicationId,FileName,FolderPath,ResourceIdName,OSFamily,OSVersion,CountryCode,State,City
3238050,0.431260,-0.068721,-0.580737,1.806842,0.565566,-0.254359,-0.257149,0.293712,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
3170122,-1.697833,8.419344,-0.407545,1.422037,0.565566,-0.210691,-0.255840,0.293712,-0.611252,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,-3.195587,-3.679656,-3.788222
2405068,0.119685,-0.469468,-0.583096,-0.170914,0.565566,-0.238271,-0.256626,0.293712,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
1059080,1.123648,2.754601,0.361210,-0.122552,-0.329841,0.230583,-0.240572,-0.753893,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
2214276,-0.797729,-0.353122,0.310630,-0.052325,0.565566,-0.245165,-0.256887,0.293712,-0.611252,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,-3.529997,0.266429,0.266155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6683008,-1.680523,-0.373806,-0.583212,-0.865846,0.565566,-0.031423,0.006603,-0.491992,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
9338683,1.383293,-0.249704,2.522248,0.056671,0.565566,-0.095776,4.202475,-0.491992,-0.611252,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
4273247,1.262125,0.712088,-0.508723,-0.756116,-1.225249,-0.146338,-0.254532,0.293712,0.085459,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
9038101,0.846692,0.083821,-0.450090,-0.823286,1.460974,-0.240569,-0.256713,0.293712,0.085459,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155


In [22]:
#pred = sgd_clf.predict(x_test_sample)

In [23]:
#acc =accuracy_score(y_test_sample , pred)
#print(acc)

In [24]:
#con =confusion_matrix(y_test_sample , pred)
#print(con)

In [25]:
#pre = precision_score(y_test_sample , pred,average='micro')
#print(pre)

In [26]:
#pred

In [18]:
y_test_sample

Unnamed: 0,IncidentGrade
3238050,2
3170122,1
2405068,2
1059080,1
2214276,0
...,...
6683008,1
9338683,0
4273247,2
9038101,1


In [17]:
#hyper parameter tuning
from scipy.stats import uniform

sgd = SGDClassifier(random_state=42)


In [29]:
param_distributions = {
    'loss': ['hinge', 'log_loss', 'squared_hinge', 'modified_huber'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': uniform(1e-4, 1e-3),  # regularization strength
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': uniform(0.001, 0.1),  # initial learning rate for 'constant', 'invscaling', and 'adaptive'
    'max_iter': [500,1000,2000],  # number of iterations
    'tol': [1e-3, 1e-4, 1e-5]  ,
    'class_weight': ['balanced'],
    'early_stopping': [True, False],
    'validation_fraction': [0.1, 0.2, 0.3],

    # tolerance for stopping criteria
}

# Initialize RandomizedSearchCV


In [30]:
random_search = RandomizedSearchCV(
    sgd,
    param_distributions=param_distributions,
    n_iter=15,  # number of parameter settings to sample
    scoring='accuracy',  # scoring metric
    n_jobs=-1,  # use all available cores
    cv=5,       # 3-fold cross-validation
    random_state=42,
    verbose=1,
    error_score='raise'
)

In [31]:
random_search

In [32]:
X_sample

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,EntityType,EvidenceRole,...,ApplicationName,OAuthApplicationId,FileName,FolderPath,ResourceIdName,OSFamily,OSVersion,CountryCode,State,City
3238050,0.431260,-0.068721,-0.580737,1.806842,0.565566,-0.254359,-0.257149,0.293712,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
3170122,-1.697833,8.419344,-0.407545,1.422037,0.565566,-0.210691,-0.255840,0.293712,-0.611252,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,-3.195587,-3.679656,-3.788222
2405068,0.119685,-0.469468,-0.583096,-0.170914,0.565566,-0.238271,-0.256626,0.293712,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
1059080,1.123648,2.754601,0.361210,-0.122552,-0.329841,0.230583,-0.240572,-0.753893,1.362762,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
2214276,-0.797729,-0.353122,0.310630,-0.052325,0.565566,-0.245165,-0.256887,0.293712,-0.611252,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,-3.529997,0.266429,0.266155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6134966,-0.070722,0.960292,-0.474851,-0.738804,-0.329841,-0.252060,-0.257062,0.293712,0.201577,-1.099549,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
6426667,1.642938,-0.469468,-0.583833,0.199024,-1.225249,-0.254359,-0.257149,0.293712,-1.656318,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
8941647,0.846692,-0.469468,-0.584188,1.277660,0.565566,-0.254359,-0.257149,0.293712,-1.656318,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155
1209280,-0.503464,-0.461711,0.169886,-0.260645,1.460974,-0.249762,-0.256975,-2.063399,1.246643,0.909463,...,0.153251,0.015841,0.334982,0.312057,0.027967,0.144225,0.14433,0.291824,0.266429,0.266155


In [18]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_sample_pca = pca.fit_transform(X_sample)
x_test_sample_pca = pca.transform(x_test_sample)

In [19]:
X_sample_pca=pd.DataFrame(X_sample_pca)
x_test_sample_pca=pd.DataFrame(x_test_sample_pca)
X_sample_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-4.126362,-1.286923,-0.256406,0.265437,1.172869,-0.166798,-0.446147,-0.481322,0.332310,0.159991
1,3.754377,-4.456260,-1.932801,0.348454,2.861685,-0.735802,2.087575,-0.743465,1.585493,4.343661
2,-3.999810,-1.167874,-0.076307,0.587774,0.465413,0.017320,-1.157351,0.062173,-0.049283,-0.371969
3,-4.008446,-0.859818,-0.086748,0.291836,0.826306,-0.236007,0.388034,-0.645175,0.209943,1.201560
4,2.101686,-1.461847,-0.484907,0.083200,0.033153,0.038395,-0.248673,-0.148900,0.368216,-0.237360
...,...,...,...,...,...,...,...,...,...,...
380668,-3.716493,-1.043907,-0.020804,0.316559,0.450692,0.042747,-0.995789,0.334075,0.369021,0.417676
380669,1.286681,0.246351,0.254474,-0.522494,-0.863280,0.217260,-0.245169,-0.002158,0.656319,0.403089
380670,1.307359,0.136672,0.130028,-0.730914,-0.518566,0.118284,-0.186186,-0.359122,1.106209,0.090367
380671,0.550725,0.439001,0.354396,0.477469,-2.150570,0.134780,0.416002,-3.368715,-2.547694,-0.900547


In [33]:
#random_search.fit(X_sample,y_sample)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [45]:
pca_model = random_search.fit(X_sample_pca,y_sample)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [46]:
#pca_model

In [37]:
predict = random_search.predict(x_test_sample)

In [38]:
acc =accuracy_score(y_test_sample , predict)
print(acc)

0.5895255147717099


In [39]:
pre = precision_score(y_test_sample , predict,average='micro')
print(pre)

0.5895255147717099


In [48]:
p=pca_model.predict(X_sample_pca)

In [50]:
a=accuracy_score(y_sample , p)
print(a)

0.5709257026371716


### Naive bayers

In [21]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [21]:
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler


In [23]:
nb = GaussianNB()

In [22]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_sample, y_sample)


In [40]:
param_dist = {
    'var_smoothing': np.logspace(-9, -4, 100)  # Explore a wide range for var_smoothing
}

# Set up RandomizedSearchCV
random_search =RandomizedSearchCV(estimator=nb, param_distributions=param_dist,
                                   n_iter=30, scoring='f1', cv=5, verbose=1, n_jobs=-1, random_state=42)


In [25]:
nb.fit(X_resampled, y_resampled)

In [41]:
random_search.fit(X_sample, y_sample)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


 nan nan nan nan nan nan nan nan nan nan nan nan]


In [28]:
val=nb.predict(x_test_sample_pca)
ac = accuracy_score(y_test_sample, val)
print(ac)

0.49660181320690483


In [45]:
val=random_search.predict(x_test_sample)
ac = accuracy_score(y_test_sample, val)
print(ac)

0.5285073616872827


In [44]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test_sample, val, average='weighted')
print(f1)

0.4843207934368134


### Decision Tree classifier

In [34]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
#from sklearn.model_selection import RandomizedSearchCV
dt_classifier = DecisionTreeClassifier(random_state=42)

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'max_depth': [int(x) for x in np.linspace(5, 50, num=10)],  # Range of tree depths
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy'],
      'max_features': ['sqrt', 'log2', None]
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=dt_classifier, param_distributions=param_dist,
                                   n_iter=50, scoring='f1', cv=5, verbose=1, n_jobs=-1, random_state=42)

# Fit the model with hyperparameter tuning
random_search.fit(X_resampled, y_resampled)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


In [29]:
best_params = random_search.best_params_
print("Best hyperparameters found: ", best_params)

# Evaluate the model with the best parameters on the test set
best_dt_classifier = random_search.best_estimator_
y_pred = best_dt_classifier.predict(x_test_sample)
accuracy = accuracy_score(y_test_sample, y_pred)
print("Accuracy on test set: ", accuracy)

Best hyperparameters found:  {'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 50, 'criterion': 'entropy'}
Accuracy on test set:  0.8764779065320001


In [32]:

clf = classification_report(y_test_sample, y_pred)
print(clf)

              precision    recall  f1-score   support

           0       0.87      0.89      0.88    208142
           1       0.80      0.86      0.83    101667
           2       0.93      0.87      0.90    166033

    accuracy                           0.88    475842
   macro avg       0.87      0.87      0.87    475842
weighted avg       0.88      0.88      0.88    475842



In [33]:
import pickle
with open('best_decision_tree_model.pkl', 'wb') as f:
    pickle.dump(best_dt_classifier, f)

print("Model saved as 'best_decision_tree_model.pkl'")

Model saved as 'best_decision_tree_model.pkl'
