In [102]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [101]:
import pandas as pd
import numpy as np
import joblib

from imblearn.combine import SMOTEENN

In [50]:
data="task-2/data/"
train_data=pd.read_csv(data+"cleaned_data.csv")

### Virus presence is target, rest all parameters are features

In [51]:
y = train_data["WnvPresent"]
x = train_data.drop(['WnvPresent'], axis = 1)
y.value_counts()

0    19910
1     1102
Name: WnvPresent, dtype: int64

### Oversample and clean using SMOTEENN

In [52]:
smoteenn=SMOTEENN()
x_aug ,y_aug = smoteenn.fit_resample(x, y)
y_aug.value_counts()

0    18425
1    18080
Name: WnvPresent, dtype: int64

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
x_train_aug, x_test_aug, y_train_aug, y_test_aug = train_test_split(x_aug, y_aug, test_size = 0.2, random_state = 42)

### Use grid search CV to find best parameters for Random Forest Classifier

In [70]:
parameters_rf = { 
    'n_estimators': [150,200,250,300],
    'max_depth' : [6,7,8],
    'criterion' :['gini', 'entropy']
}

In [71]:
CV_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=6), param_grid=parameters_rf, n_jobs=-1, cv= 4)
CV_rf.fit(x_train, y_train)

In [81]:
y_pred=CV_rf.predict(x_test)
score = classification_report(y_test, y_pred)

In [82]:
print(score)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3983
           1       1.00      0.01      0.02       220

    accuracy                           0.95      4203
   macro avg       0.97      0.50      0.50      4203
weighted avg       0.95      0.95      0.92      4203



In [78]:
CV_rf_aug = GridSearchCV(estimator=RandomForestClassifier(random_state=6), param_grid=parameters_rf, n_jobs=-1, cv= 4)
CV_rf_aug.fit(x_train_aug, y_train_aug)

In [79]:
CV_rf_aug.best_params_

{'criterion': 'gini', 'max_depth': 8, 'n_estimators': 200}

In [83]:
y_pred_aug = CV_rf_aug.predict(x_test_aug)
score_aug = classification_report(y_test_aug, y_pred_aug)

In [84]:
print(score_aug)

              precision    recall  f1-score   support

           0       0.95      0.82      0.88      4031
           1       0.84      0.95      0.89      3885

    accuracy                           0.89      7916
   macro avg       0.89      0.89      0.89      7916
weighted avg       0.89      0.89      0.89      7916



In [99]:
roc_auc_score(y_test_aug,y_pred_aug)

0.8868686278510143

### Use grid search CV to find best parameters for SVM

In [90]:
parameters_svm = {'C': [10, 100, 1000],  
              'gamma': [0.01, 0.001, 0.0001]
              }

In [91]:
CV_svm = GridSearchCV(estimator=SVC(), param_grid=parameters_svm)
CV_svm.fit(x_train, y_train)

In [92]:
y_pred_svm=CV_svm.predict(x_test)
score_svm = classification_report(y_test, y_pred)
print(score_svm)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3983
           1       1.00      0.01      0.02       220

    accuracy                           0.95      4203
   macro avg       0.97      0.50      0.50      4203
weighted avg       0.95      0.95      0.92      4203



In [95]:
CV_svm_aug = GridSearchCV(estimator=SVC(), n_jobs=-1,param_grid=parameters_svm)
CV_svm_aug.fit(x_train_aug, y_train_aug)

In [103]:
CV_svm_aug.best_params_

{'C': 100, 'gamma': 0.01}

In [96]:
y_pred_svm_aug=CV_svm_aug.predict(x_test_aug)
score_svm_aug = classification_report(y_test_aug, y_pred_aug)
print(score_svm_aug)

              precision    recall  f1-score   support

           0       0.95      0.82      0.88      4031
           1       0.84      0.95      0.89      3885

    accuracy                           0.89      7916
   macro avg       0.89      0.89      0.89      7916
weighted avg       0.89      0.89      0.89      7916



In [100]:
roc_auc_score(y_test_aug,y_pred_svm_aug)

0.9782494547565249

In [108]:
filename = 'task-2/saved_models/rf_aug_model.sav'
joblib.dump(CV_rf_aug, filename)

['task-2/saved_models/rf_aug_model.sav']