In [70]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

from myfunctions import set_importer

pd.set_option('display.max_columns', None)

sns.set_style("whitegrid")
%matplotlib inline

# setting font size for all plots
font = {'size'   : 16}

plt.rc('font', **font)

In [80]:
# Load the prepared train/test feature sets and their targets via the
# project helper `set_importer` (targets are requested with y=True).
X_train, X_test = (
    set_importer(csv_path)
    for csv_path in (
        '../analysis-dfs/X_train_prepared.csv',
        '../analysis-dfs/X_test_prepared.csv',
    )
)

y_train, y_test = (
    set_importer(csv_path, y=True)
    for csv_path in (
        '../analysis-dfs/y_train_prepared.csv',
        '../analysis-dfs/y_test_prepared.csv',
    )
)

# Quick sanity check: dimensions of the training features, then a preview
display(X_train.shape)
X_train.head(10)

(29852, 47)

Unnamed: 0,gps_height,longitude,latitude,region_code,district_code,population,permit,construction_year,basin_lake nyasa,basin_lake rukwa,basin_lake tanganyika,basin_lake victoria,basin_pangani,basin_rufiji,basin_ruvuma / southern coast,basin_wami / ruvu,extraction_type_class_handpump,extraction_type_class_motorpump,extraction_type_class_other,extraction_type_class_rope pump,extraction_type_class_submersible,extraction_type_class_wind-powered,management_group_other,management_group_parastatal,management_group_unknown,management_group_user-group,payment_other,payment_pay annually,payment_pay monthly,payment_pay per bucket,payment_pay when scheme fails,payment_unknown,quality_group_fluoride,quality_group_good,quality_group_milky,quality_group_salty,quality_group_unknown,quantity_group_enough,quantity_group_insufficient,quantity_group_seasonal,quantity_group_unknown,source_class_surface,source_class_unknown,waterpoint_type_group_communal standpipe,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,-0.368039,1.842721,-1.698549,-0.386675,-0.148733,1.302765,0.69453,0.835788,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
1,-0.896969,-0.232595,-1.09746,-0.217496,0.1674,-0.687158,0.69453,-1.180012,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0
2,-0.896969,-0.150773,-1.068236,-0.217496,0.1674,-0.687158,0.69453,-1.180012,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0
3,-0.705621,1.21568,-0.570298,-0.612247,-0.359489,0.970002,0.69453,0.851044,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1
4,0.983844,-0.420747,1.107311,0.233648,-0.148733,1.302765,-1.439823,0.862232,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
5,0.923173,-0.683206,1.171351,0.177255,-0.464866,2.633817,0.69453,0.850027,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
6,-0.896969,-1.234213,1.325048,0.120862,-0.359489,-0.687158,0.69453,-1.180012,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
7,-0.896969,0.044576,1.812261,0.064469,-0.464866,-0.687158,-1.439823,-1.180012,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0
8,0.725602,0.150197,-1.762897,-0.330282,-0.254111,-0.36105,0.69453,0.843925,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
9,2.396399,-0.289534,-1.153023,-0.273889,-0.254111,-0.687158,-1.439823,0.827652,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0


In [84]:
# Baseline ("vanilla") logistic regression fitted on all prepared features.
# saga is used because this is a very big data set; the solver shuffles the
# data internally, so random_state is pinned to keep the fit (and the score
# reported below) reproducible across kernel restarts.
vanilla_logreg = LogisticRegression(solver='saga', random_state=42)
vanilla_logreg.fit(X_train, y_train)

# Class predictions for the test features, scored in the next cell
y_pred = vanilla_logreg.predict(X_test)

In [85]:
# Accuracy of the baseline model on the test split: y_pred was predicted from
# X_test and is compared against y_test, so this is a test score, not a
# training score.
test_accuracy_score = accuracy_score(y_test, y_pred)

# Backward-compatible alias for the original (misleading) variable name, kept
# so that any later cell still referencing it continues to work.
train_accuracy_score = test_accuracy_score

test_accuracy_score

0.7387883556254917

# Feature Selection

Using RFECV, which performs recursive feature elimination (RFE) in a cross-validation loop to find the best features.

In [None]:
# Recursive feature elimination with cross-validation.
# Plain RFE keeps a fixed share of the features (half of them by default) and
# performs no cross-validation, so it cannot tell us how many features are
# actually useful. RFECV scores every candidate feature count with
# cross-validation and keeps the best-scoring subset.
logreg = LogisticRegression()

# 5-fold CV scored on accuracy, the same metric used for the baseline model
rfecv = RFECV(estimator=logreg, cv=5, scoring='accuracy')
rfecv.fit(X_train, y_train)

# n_features_ is the size of the selected subset; support_ is the boolean mask
# over the input columns marking which features were kept.
print(f"Number of features selected by RFECV: {rfecv.n_features_}")
print(f"Selected features: {list(X_train.columns[rfecv.support_])}")