In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
# !pip install pandas-profiling
# import pandas_profiling as pdp

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LassoCV, Lasso
from sklearn.metrics import (confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, 
accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve, average_precision_score)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.svm import SVC

!pip install xgboost
import xgboost as xgb
from xgboost import XGBClassifier

#!pip install imblearn
import imblearn
from imblearn.over_sampling import SMOTE

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

from math import radians, cos, sin, asin, sqrt, atan2



In [2]:
#load the datasets

spray = pd.read_csv('../assets/clean_data/spray_clean(LM).csv')
weather = pd.read_csv('../assets/clean_data/weather_clean(LM).csv')
train = pd.read_csv('../assets/clean_data/train_clean(LM).csv')
test = pd.read_csv('../assets/clean_data/test_clean(LM).csv')
merged = pd.read_csv('../assets/clean_data/mergeddf(LM).csv')

In [3]:
col_drop = ['date', 
            'address', 
            'species', 
            'block', 
            'street', 
            'trap', 
            'addressnumberandstreet', 
            'wnvpresent', 
            'nummosquitos', 
            'codesum', 
            'addressaccuracy', 
            'resultdir',
            'wetbulb', 
            'cool',
            'resultspeed',
            'stnpressure',
           'tmax',
           'tmin',
           'station',
            'year']

In [4]:
X = merged.drop(columns = col_drop)

In [5]:
y = merged['wnvpresent']

In [6]:
X.columns

Index(['latitude', 'longitude', 'month', 'week', 'dayofweek', 'tavg',
       'dewpoint', 'heat', 'sunrise', 'sunset', 'preciptotal', 'sealevel',
       'avgspeed'],
      dtype='object')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42, test_size = 0.3)

In [8]:
def score_model(model,X_test,y_test):
    preds = model.predict_proba(X_test)
    pred_list =[]
    
    for x in preds:
        pred_list.append(x[1])
        
    roc_score = roc_auc_score(y_test, pred_list)
    return roc_score

In [9]:
lr = LogisticRegression()

In [10]:
ss = StandardScaler()

ss.fit(X_train) 

X_train_ss = ss.transform(X_train)

X_test_ss = ss.transform(X_test)

In [11]:
lr.fit(X_train_ss, y_train)

LogisticRegression()

In [12]:
y_pred = lr.predict(X_test_ss)

In [13]:
print('Log reg AUC train score', score_model(lr, X_train_ss, y_train) )

Log reg AUC train score 0.7521683896968635


In [14]:
print('Log reg AUC test score', score_model(lr, X_test_ss, y_test) )

Log reg AUC test score 0.744779201556541


In [15]:
coefs=lr.coef_[0]

In [16]:
coef = pd.DataFrame(lr.coef_[0], index = X.columns, columns = ['Coefficients']).sort_values(by = ['Coefficients'], ascending = False)

coef

Unnamed: 0,Coefficients
sunset,1.933358
week,1.93266
sunrise,1.002423
dewpoint,0.478473
dayofweek,0.267039
sealevel,0.232647
tavg,0.170429
heat,0.075597
preciptotal,0.059039
avgspeed,0.020241


In [17]:
RF = RandomForestClassifier(n_estimators = 1000, 
                            bootstrap=True, 
                            max_depth=5,
                            max_features='auto',
                            min_samples_leaf= 1, 
                            min_samples_split= 2)

In [18]:
RF_model= RF.fit(X_train_ss, y_train)
print('RF AUC score', score_model(RF_model,X_test_ss,y_test) )


RF AUC score 0.839101228879565


In [19]:
model = XGBClassifier()
xgm = model.fit(X_train_ss, y_train)
score_model(xgm,X_test_ss,y_test)






0.8407350597728454

In [21]:
poly = PolynomialFeatures(include_bias=False, degree=2)
X_poly = poly.fit_transform(X)
X_poly.shape

(8610, 104)

In [23]:
# Adds appropriate feature names to all polynomial features
X_poly = pd.DataFrame(X_poly,columns=poly.get_feature_names(X.columns))

# Generates list of poly feature correlations
X_poly_corrs = X_poly.corrwith(y)

# Shows features most highly correlated (positively) with target
X_poly_corrs.sort_values(ascending=False).head(10)

week dewpoint       0.142365
dewpoint sunrise    0.141852
month dewpoint      0.140553
tavg sunrise        0.130722
week tavg           0.129006
month tavg          0.125870
sunrise sunset      0.120361
week sunset         0.113828
month sunset        0.108562
latitude week       0.099821
dtype: float64

In [22]:
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y, stratify = y, random_state = 42, test_size = 0.3)

In [24]:
ss = StandardScaler()

ss.fit(X_train_poly) 

X_train_poly_ss = ss.transform(X_train_poly)

X_test_poly_ss = ss.transform(X_test_poly)

In [25]:
lr_poly = LogisticRegression()

In [26]:
lr_poly.fit(X_train_poly_ss, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [27]:
print('Log reg poly AUC train score', score_model(lr_poly, X_train_poly_ss, y_train) )

Log reg poly AUC train score 0.8195302917469774


In [28]:
print('Log reg poly AUC test score', score_model(lr_poly, X_test_poly_ss, y_test) )

Log reg poly AUC test score 0.8219497347076413


In [29]:
coefpoly = pd.DataFrame(lr_poly.coef_[0], index = X_poly.columns, columns = ['Coefficients']).sort_values(by = ['Coefficients'], ascending = False)

coefpoly

Unnamed: 0,Coefficients
week sunset,1.512644
sunrise sunset,1.484922
month sunset,1.431406
tavg avgspeed,1.32201
tavg preciptotal,1.127289
dayofweek avgspeed,0.965236
dewpoint preciptotal,0.831405
month dayofweek,0.706353
week dewpoint,0.700615
dewpoint sunrise,0.591295
