In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as impipe
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score, plot_roc_curve, recall_score

# Generate and plot a synthetic imbalanced classification dataset
from collections import Counter
from numpy import where

# Do not display future warnings
import warnings
warnings.simplefilter(action='ignore')

from IPython.display import display
pd.set_option('display.max_colwidth', None)

In [2]:
# Record the installed imbalanced-learn version next to the results.
print(imblearn.__version__)

0.8.1


In [6]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('asset/full_train.csv')

In [7]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,species,trap,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent,Station,Tmax,...,Cool,Sunrise,Sunset,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2007-05-29,CULEX PIPIENS/RESTUANS,T048,41.867108,-87.654224,8,1,0,1.0,88.0,...,9.0,421.0,1917.0,0.0,0.0,29.39,30.11,5.8,18.0,6.5
1,2007-05-29,CULEX RESTUANS,T048,41.867108,-87.654224,8,2,0,1.0,88.0,...,9.0,421.0,1917.0,0.0,0.0,29.39,30.11,5.8,18.0,6.5
2,2007-05-29,CULEX RESTUANS,T091,41.862292,-87.64886,8,1,0,1.0,88.0,...,9.0,421.0,1917.0,0.0,0.0,29.39,30.11,5.8,18.0,6.5
3,2007-05-29,CULEX RESTUANS,T049,41.896282,-87.655232,8,1,0,1.0,88.0,...,9.0,421.0,1917.0,0.0,0.0,29.39,30.11,5.8,18.0,6.5
4,2007-05-29,CULEX RESTUANS,T153,41.907645,-87.760886,8,1,0,1.0,88.0,...,9.0,421.0,1917.0,0.0,0.0,29.39,30.11,5.8,18.0,6.5


In [None]:
# Drop the 'Unnamed: 0' column. Reassigning instead of using inplace=True, together
# with errors='ignore', keeps this cell idempotent: re-running it no longer raises
# KeyError because the column is already gone.
# NOTE(review): df.head() also lists an 'Unnamed: 0.1' column that looks like a saved
# row index -- confirm whether it should be dropped as well before modelling.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
df['wnvpresent'].value_counts(normalize = True)  # the data is heavily imbalanced

### SMOTE

In [None]:
# Separate the feature matrix (every column except the target) from the
# target column 'wnvpresent'.
x = df.drop(columns = 'wnvpresent')
y = df['wnvpresent']

In [None]:
# Show the feature-matrix shape and the target shape, one per line.
print(x.shape, y.shape, sep='\n')

In [None]:
#counter = Counter(y)
#print(counter)

In [None]:
#for label, _ in counter.items():
#    row_ix = where(x == label)[0]
#    plt.scatter(y[row_ix:, 0], y[row_ix:, 1], label=str(label))
#plt.legend()
#plt.show()

In [None]:
# Hold out a test split with a fixed seed; no test_size is given, so the library default applies.
# NOTE(review): the split is not stratified although the target is imbalanced --
# consider stratify=y (and keep it consistent with the split inside run_model).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)

In [None]:
#sc = MinMaxScaler()
#z_train = sc.fit_transform(x_train)
#z_test = sc.transform(x_test)

In [None]:
# Report the split sizes. The labels now name the variables that are actually
# printed (x_train / x_test); the earlier 'z_*' labels did not match them.
print(f'x_train shape is: {x_train.shape}')
print(f'y_train shape is: {y_train.shape}')
print(f'x_test shape is: {x_test.shape}')
print(f'y_test shape is: {y_test.shape}')

In [None]:
#counter0 = Counter(y_train)
#print(counter)

In [None]:
#for label, _ in counter0.items():
#    row_ix = where(y_train == label)[0]
#    plt.scatter(z_train[:, 0], z_train[:,1], label=str(label))
#plt.legend()
#plt.show()

# Class counts in the training split before oversampling.
counter = Counter(y_train)
print('Before', counter)

# z_train was referenced here without ever being defined (the scaling cell is
# commented out), which raised NameError. Build the scaled training matrix here,
# fitting the scaler on the training split only, the same way run_model does.
# NOTE(review): StandardScaler is used because it is the scaler this notebook imports
# (the commented-out cell used MinMaxScaler); it requires numeric features, so confirm
# that x contains no string columns.
z_train = StandardScaler().fit_transform(x_train)

# Fixed seed so the synthetic minority samples are reproducible between runs.
smt = SMOTE(random_state=42)
z_train_sm, y_train_sm = smt.fit_resample(z_train, y_train)

# Class counts after oversampling.
counter = Counter(y_train_sm)
print('After', counter)

In [None]:
#for label, _ in counter.items():
#    row_ix = where(y_train_sm == label)[0]
#    plt.scatter(z_train_sm[row_ix, 0], z_train_sm[row_ix,1], label = str(label))
#plt.legend()
#plt.show()

In [None]:
# Class proportions of the full dataset; the SMOTE step above writes its result to
# z_train_sm / y_train_sm and does not modify df.
df['wnvpresent'].value_counts(normalize = True) 

In [None]:
def plot_model(model, x_test, y_test):
    """Plot the ROC curve and the confusion matrix of a fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator
        Passed to ``plot_roc_curve`` and queried through ``model.predict``.
    x_test : array-like
        Features of the evaluation split.
    y_test : array-like
        True labels encoded as 0 (WNV not present) / 1 (WNV present).
        Any array-like is accepted; a pandas Series is no longer required.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    class_names = ['WNV Not Present', 'WNV Present']
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

    # Plot ROC curve
    ax1.set_title('ROC Curve')
    plot_roc_curve(model, x_test, y_test, ax=ax1)
    ax1.plot([0, 1], [0, 1], label='baseline', linestyle='--')
    ax1.legend()

    # Plot confusion matrix.
    # labels=[0, 1] pins the row/column order and keeps the matrix 2x2 even when a
    # class is missing from y_test or from the predictions, so the tick labels
    # below always describe the right cells.
    cm = confusion_matrix(y_test, model.predict(x_test), labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='g', ax=ax2, cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    ax2.set_title('Confusion Matrix')
    ax2.set_xlabel('Predicted labels')
    ax2.set_ylabel('True labels')
    plt.show()

In [None]:
def recall(model, X_train, y_train, X_test, y_test):
    """Print the train, test and cross-validated recall of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Used for ``predict`` on both splits and passed to ``cross_val_score``.
    X_train, y_train : array-like
        Training features and labels for the train-recall line.
    X_test, y_test : array-like
        Held-out features and labels for the test-recall and cross-validation lines.

    Returns
    -------
    None
        The three scores are printed, each rounded to 5 decimals.
    """
    train_recall = recall_score(y_train, model.predict(X_train))
    print('Train Recall: ', round(train_recall, 5))

    test_recall = recall_score(y_test, model.predict(X_test))
    print('Test Recall: ', round(test_recall, 5))

    # NOTE(review): cross-validation is run on the test split, not on the training
    # data -- confirm this is intended before comparing it with the other two scores.
    fold_scores = cross_val_score(model, X_test, y_test, scoring='recall')
    print('Cross Val Recall:', round(fold_scores.mean(), 5))

In [None]:
def run_model(name, gs_model):
    """Tune, refit and evaluate a model on the module-level ``x`` / ``y`` data.

    Steps: split ``x`` / ``y`` (seed 42), standardise the features with a scaler
    fitted on the training split, run the grid search on the scaled training data,
    refit the best estimator on a SMOTE-resampled training set, then print recall
    and test ROC AUC and draw the ROC curve and confusion matrix.

    Parameters
    ----------
    name : str
        Label for the model. Currently unused; it previously only built a local
        string for a commented-out ``add_model(...)`` call. Kept so that existing
        callers continue to work.
    gs_model : GridSearchCV-like
        Unfitted search object exposing ``fit``, ``best_estimator_`` and
        ``best_params_``; its best estimator must implement ``predict_proba``.

    Returns
    -------
    None
        Results are printed and plotted through ``recall`` and ``plot_model``.

    Notes
    -----
    NOTE(review): when the searched estimator already contains a SMOTE step (as
    ``lr_pipe`` does), the manual resample before the final refit looks redundant.
    The scaler is also fitted before cross-validation, so each validation fold has
    influenced the scaling statistics. Both are left unchanged here; confirm intent.
    """
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
    print('x_train', x_train.shape)
    print('x_test', x_test.shape)

    # Scale: fit on the training split only, then apply to the test split.
    ss = StandardScaler()
    xs_train = ss.fit_transform(x_train)
    xs_test = ss.transform(x_test)

    # SMOTE: oversample the minority class of the scaled training data.
    smt = SMOTE(random_state=42)
    xsm_train, ysm_train = smt.fit_resample(xs_train, y_train)

    # Grid search on the scaled (not manually resampled) training data.
    gs_model.fit(xs_train, y_train)
    best_model = gs_model.best_estimator_
    print('best parameters: ', gs_model.best_params_)

    # Refit the selected estimator on the resampled training set and evaluate it.
    best_model.fit(xsm_train, ysm_train)
    recall(best_model, xsm_train, ysm_train, xs_test, y_test)

    # The original called the search object itself (`gs_model(xs_test)`), which
    # raises TypeError, and discarded the score. Use the positive-class
    # probabilities of the refitted model and report the result.
    test_auc = roc_auc_score(y_test, best_model.predict_proba(xs_test)[:, 1])
    print('Test ROC AUC: ', round(test_auc, 5))

    plot_model(best_model, xs_test, y_test)

In [None]:
# Set up the pipeline: SMOTE oversampling followed by logistic regression, built with
# imblearn's Pipeline (imported as impipe). The step names 'sampling' and 'lr' are
# the prefixes used by the parameter grid.
lr_pipe = impipe([
        ('sampling', SMOTE(random_state = 10)),
        ('lr', LogisticRegression(random_state = 42))])

In [None]:
# Parameter grid for the 'lr' step of the pipeline.
# The liblinear solver does not support multi_class='multinomial', so crossing every
# solver with every multi_class value produced fits that always fail. liblinear gets
# its own sub-grid; every other (C, solver, multi_class) combination is kept.
params_logreg = [
    {
        #'lr__penalty': ['l1', 'l2'],
        'lr__C': [0.02, 0.003, 0.01],
        'lr__solver': ['newton-cg', 'saga', 'sag', 'lbfgs'],
        'lr__multi_class': ['multinomial', 'ovr', 'auto'],
    },
    {
        'lr__C': [0.02, 0.003, 0.01],
        'lr__solver': ['liblinear'],
        'lr__multi_class': ['ovr', 'auto'],
    },
]

In [None]:
# Instantiate a 5-fold GridSearchCV over the pipeline, selecting on ROC AUC
# and using all available cores (n_jobs=-1).
gs_lr = GridSearchCV(lr_pipe, param_grid = params_logreg, cv=5,
                     scoring='roc_auc',n_jobs=-1,
                     verbose=1)

In [None]:
# Run the grid search for logistic regression and evaluate the best model.
run_model('log_reg', gs_lr)

In [None]:
# Coefficients of the 'lr' step of the best pipeline found by the search.
gs_lr.best_estimator_.named_steps.lr.coef_

In [None]:
# Keep the coefficient array for the feature ranking built in the following cells.
y1 = gs_lr.best_estimator_.named_steps.lr.coef_

In [None]:
# Feature names of the training split.
# NOTE(review): x1 is not read by the cells visible after this one, which use
# x.columns directly -- confirm whether it is still needed.
x1 = x_train.columns

In [None]:
# Check the coefficient array's shape before flattening it.
y1.shape

In [None]:
# Flatten the coefficients to one dimension so they can be zipped with column names.
y1 = np.reshape(y1,-1)

In [None]:
# Pair every coefficient (rounded to 4 decimals) with its column name, order the
# pairs from largest to smallest and keep the first ten.
lr_coefficient = sorted(
    ((round(coef, 4), column) for coef, column in zip(y1, x.columns)),
    reverse=True,
)[:10]

print(lr_coefficient)

In [None]:
# Tabulate the selected (coefficient, feature) pairs for display and plotting.
lr_feature = pd.DataFrame(data = lr_coefficient, columns = ['Values','features'])
lr_feature

In [None]:
# Shape of the coefficient array after it was flattened with np.reshape(y1, -1).
y1.shape

In [None]:
# Horizontal bar chart of the selected coefficients, one bar per feature.
sns.barplot(data=lr_feature, x='Values', y='features')