In [62]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md
from matplotlib import pyplot as plt
from IPython.display import clear_output, display_html 
from collections import Counter

In [63]:
class color:
    """ANSI escape codes used to style text written with ``print``.

    Concatenate a code before the text and ``END`` after it, e.g.
    ``color.BOLD + "title" + color.END``.
    """

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Resets every attribute/colour set by the codes above
    END = '\033[0m'

In [64]:
# Directory (relative to the notebook) that holds the three CSV files.
DATA_DIR = "tanzanian_water_wells"

# Integer code assigned to each `status_group` label.
STATUS_CODES = {'functional': 2, 'functional needs repair': 0, 'non functional': 1}

# Feature tables.
testing = pd.read_csv(f"{DATA_DIR}/X_test.csv")
X = pd.read_csv(f"{DATA_DIR}/X_train.csv")

# Training labels, encoded as integers through STATUS_CODES.
labels_raw = pd.read_csv(f"{DATA_DIR}/y_train.csv")
y = labels_raw['status_group'].map(STATUS_CODES)

In [65]:
# Preprocessing: clean the raw features, one-hot encode the categoricals,
# split into train/test, standardize the numeric columns, oversample the
# training split with SMOTE and cut the result into class-balanced samples.
# Expects `X`, `y` and `testing` exactly as loaded from the CSV files.

SEED = 42          # fixes the split, SMOTE and the shuffles so reruns give the same data
CHUNK_SIZE = 1000  # approximate number of records per status group in each sample

DROPPED_COLUMNS = ['id', 'wpt_name', 'subvillage', 'installer', 'funder',
                   'scheme_name', 'ward', 'date_recorded', 'recorded_by']
NUMERIC_DTYPES = ['float64', 'int64']

# Dropping unnecessary columns
X = X.drop(DROPPED_COLUMNS, axis=1)
# NOTE(review): `testing` only has columns dropped here; none of the cleaning,
# encoding or scaling below is applied to it -- confirm it is handled elsewhere.
testing = testing.drop(DROPPED_COLUMNS, axis=1)

# Eliminating null values from X. Plain assignment is used instead of
# `X.col.fillna(..., inplace=True)`, which acts on an intermediate object and
# does not reliably update the parent frame under pandas copy-on-write.
X['scheme_management'] = X['scheme_management'].fillna("None")
X['permit'] = X['permit'].fillna('Unknown')
X['public_meeting'] = X['public_meeting'].fillna('Unknown')

# Turning certain dtypes into others
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X = X.astype({
    'gps_height': 'float64',
    'population': 'float64',
    'construction_year': 'int64',
    'region_code': 'str',      # codes are identifiers, so treat them as categories
    'district_code': 'str',
})

# Defining X_cat: every non-numeric column, one-hot encoded
X_cat = pd.get_dummies(X.select_dtypes(exclude=NUMERIC_DTYPES).astype('str'))

# Numeric columns are kept unscaled until after the split (see below)
numeric_columns = X.select_dtypes(NUMERIC_DTYPES).columns
X_unscaled = pd.concat([X[numeric_columns], X_cat], axis=1)

# Creating a train-test-split for X and y
X_train, X_test, y_train, y_test = train_test_split(X_unscaled, y, random_state=SEED)

# Fit the scaler on the training rows only, so that the test rows do not
# influence the mean/variance used for standardization (no data leakage).
scaler = StandardScaler()
scaler.fit(X_train[numeric_columns])


def scale_numeric(frame):
    """Return `frame` with its numeric columns standardized by the train-fitted scaler.

    Numeric columns come first, followed by the remaining (dummy) columns.
    """
    scaled = pd.DataFrame(scaler.transform(frame[numeric_columns]),
                          columns=numeric_columns, index=frame.index)
    return pd.concat([scaled, frame.drop(columns=numeric_columns)], axis=1)


X_train = scale_numeric(X_train).reset_index(drop=True)
X_test = scale_numeric(X_test).reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Defining X, X_numeric and df (full data set, scaled with the same scaler)
X = scale_numeric(X_unscaled)
X_numeric = X[numeric_columns]
df = pd.concat([X, y], axis=1)

# Resampling dataframes for model creation
smote = SMOTE(random_state=SEED)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
X_train_resampled = X_train_resampled.reset_index(drop=True)
y_train_resampled = y_train_resampled.reset_index(drop=True)

df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)


def shuffled_status_group(code):
    """Return the shuffled rows of `df_resampled` whose status_group equals `code`."""
    subset = df_resampled[df_resampled.status_group == code].reset_index(drop=True)
    return subset.sample(frac=1, random_state=SEED + code)


def split_into_chunks(frame, chunk_size=CHUNK_SIZE):
    """Split `frame` row-wise into chunks of roughly `chunk_size` rows.

    Always yields at least one chunk, so a frame shorter than `chunk_size`
    does not request zero sections from `np.array_split`.
    """
    n_chunks = max(1, len(frame) // chunk_size)
    return [frame.iloc[rows] for rows in np.array_split(np.arange(len(frame)), n_chunks)]


# Separating the resampled data into each status group. The codes follow the
# label mapping applied to `y`: functional = 2, non functional = 1,
# functional needs repair = 0.
f = shuffled_status_group(2)
nf = shuffled_status_group(1)
fnr = shuffled_status_group(0)

# Splitting each status group into frames of approximately 1,000 records
fs = split_into_chunks(f)
nfs = split_into_chunks(nf)
fnrs = split_into_chunks(fnr)

# Creating our individual samples for models to train on: one chunk of every
# status group per sample. `zip` stops at the shortest list, so unequal group
# sizes cannot raise an IndexError.
resamples = [pd.concat(chunks) for chunks in zip(fs, nfs, fnrs)]

In [2]:
# Candidate classifiers, keyed by the display name used in every report.
# Every estimator that has a source of randomness is seeded with the same
# value so that repeated runs of the notebook produce the same scores.
estimators = {
    'Logistic Regression': LogisticRegression(solver='liblinear', fit_intercept=False, random_state=42),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=3),
    'Bagging Classifier': BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50,
                                            max_features=50, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XG-Boost': XGBClassifier(random_state=42),
    'Adaboost': AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42),
    'Gradient Boosted Trees': GradientBoostingClassifier(random_state=42, n_estimators=200, max_features=50),
    'Extra Randomized Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
}

NameError: name 'LogisticRegression' is not defined

# Approach #1 – Single Model with Original Data

Here we fit each estimator, one at a time, on the original (un-resampled) training split and evaluate its predictions on the held-out test split.

In [10]:
# Per estimator name: [test-set predictions, confusion matrix, classification report]
ap1_dict = {}

for name, estimator in estimators.items():
    # Fit on the training split, then predict the held-out test split.
    predictions = estimator.fit(X_train, y_train).predict(X_test)

    # Both evaluation artefacts are stored as DataFrames.
    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report = pd.DataFrame(
        classification_report(y_test, predictions, output_dict=True)
    )

    ap1_dict[name] = [predictions, matrix, report]
    print(f"Estimator {name} completed.")

Estimator Logistic Regression completed.
Estimator Decision Tree Classifier completed.
Estimator K-Nearest Neighbors completed.
Estimator Bagging Classifier completed.
Estimator Random Forest completed.
Estimator XG-Boost completed.
Estimator Adaboost completed.
Estimator Gradient Boosted Trees completed.
Estimator Extra Randomized Trees completed.
Estimator Stacking Classifier completed.


# Approach #2 – Single Model with Resampled Data

Here we fit each estimator, one at a time, on the SMOTE-resampled training split and evaluate its predictions on the same held-out test split.

In [28]:
# Default SMOTE strategy for Approach #2. The resampling is performed here
# (rather than only instantiating the sampler) so that this approach does not
# depend on whichever cell last assigned X_train_resampled / y_train_resampled;
# Approaches #3-#5 reassign those names with different sampling strategies.
smote = SMOTE(random_state=42)  # seeded so reruns generate the same synthetic rows
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [11]:
# Per estimator name: [test-set predictions, confusion matrix, classification report]
ap2_dict = dict()

for name in estimators:
    estimator = estimators[name]

    # Train on the resampled training data; evaluation uses the untouched test split.
    estimator.fit(X_train_resampled, y_train_resampled)
    predictions = estimator.predict(X_test)

    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report = pd.DataFrame(classification_report(y_test, predictions, output_dict=True))
    ap2_dict[name] = [predictions, matrix, report]

    print(f"Estimator {name} completed.")

Estimator Logistic Regression completed.
Estimator Decision Tree Classifier completed.
Estimator K-Nearest Neighbors completed.
Estimator Bagging Classifier completed.
Estimator Random Forest completed.
Estimator XG-Boost completed.
Estimator Adaboost completed.
Estimator Gradient Boosted Trees completed.
Estimator Extra Randomized Trees completed.
Estimator Stacking Classifier completed.


# Approach #3 – Single Model with Resampled Data (Class 0 Oversampled to 10,000 Records)

In [49]:
# Oversample only class 0 ('functional needs repair') up to 10,000 training
# records; classes missing from the dict are left at their original size.
strategy = {0: 10000}
smote = SMOTE(sampling_strategy=strategy, random_state=42)  # seeded for reproducible synthetic rows
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [50]:
# Per estimator name: [test-set predictions, confusion matrix, classification report]
ap3_dict = {}

for name, estimator in estimators.items():
    # Refit on the current resampled training data and predict the test split.
    predictions = estimator.fit(X_train_resampled, y_train_resampled).predict(X_test)

    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report_as_dict = classification_report(y_test, predictions, output_dict=True)
    report = pd.DataFrame(report_as_dict)

    ap3_dict.update({name: [predictions, matrix, report]})
    print(f"Estimator {name} completed.")

Estimator Logistic Regression completed.
Estimator Decision Tree Classifier completed.
Estimator K-Nearest Neighbors completed.
Estimator Bagging Classifier completed.
Estimator Random Forest completed.
Estimator XG-Boost completed.
Estimator Adaboost completed.
Estimator Gradient Boosted Trees completed.
Estimator Extra Randomized Trees completed.
Estimator Stacking Classifier completed.


# Approach #4 – Single Model with Resampled Data (Class 0 Oversampled to 15,000 Records)

In [51]:
# Oversample only class 0 ('functional needs repair') up to 15,000 training
# records; classes missing from the dict are left at their original size.
strategy = {0: 15000}
smote = SMOTE(sampling_strategy=strategy, random_state=42)  # seeded for reproducible synthetic rows
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [52]:
# Per estimator name: [test-set predictions, confusion matrix, classification report]
ap4_dict = {}

for name, estimator in estimators.items():
    # Step 1: train on the resampled training data.
    estimator.fit(X_train_resampled, y_train_resampled)

    # Step 2: score the held-out test split.
    predictions = estimator.predict(X_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report = pd.DataFrame(classification_report(y_test, predictions, output_dict=True))

    # Step 3: record the results and report progress.
    ap4_dict[name] = [predictions, matrix, report]
    print(f"Estimator {name} completed.")

Estimator Logistic Regression completed.
Estimator Decision Tree Classifier completed.
Estimator K-Nearest Neighbors completed.
Estimator Bagging Classifier completed.
Estimator Random Forest completed.
Estimator XG-Boost completed.
Estimator Adaboost completed.
Estimator Gradient Boosted Trees completed.
Estimator Extra Randomized Trees completed.
Estimator Stacking Classifier completed.


# Approach #5 – Single Model with Resampled Data (Class 0 Oversampled to 20,000 Records)

In [53]:
# Oversample only class 0 ('functional needs repair') up to 20,000 training
# records; classes missing from the dict are left at their original size.
strategy = {0: 20000}
smote = SMOTE(sampling_strategy=strategy, random_state=42)  # seeded for reproducible synthetic rows
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [54]:
# Per estimator name: [test-set predictions, confusion matrix, classification report]
ap5_dict = dict()

for name in estimators.keys():
    estimator = estimators[name]

    # Train on the resampled data, predict the test split in one chained call.
    predictions = estimator.fit(X_train_resampled, y_train_resampled).predict(X_test)

    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report = pd.DataFrame(classification_report(y_test, predictions, output_dict=True))

    ap5_dict[name] = [predictions, matrix, report]
    print(f"Estimator {name} completed.")

Estimator Logistic Regression completed.
Estimator Decision Tree Classifier completed.
Estimator K-Nearest Neighbors completed.
Estimator Bagging Classifier completed.
Estimator Random Forest completed.
Estimator XG-Boost completed.
Estimator Adaboost completed.
Estimator Gradient Boosted Trees completed.
Estimator Extra Randomized Trees completed.
Estimator Stacking Classifier completed.


# Approach #6 – Voting Classifiers on Different Data Samples

After resampling the training data, we split it into multiple samples, each containing roughly equal numbers of records from every status group. Each estimator is trained on every sample in turn and predicts the test set once per sample. The per-sample predictions are then combined by majority vote, which acts as a mock voting classifier.

In [12]:
# Results of the sample-voting approach, keyed by estimator name.
ap6_dict = dict()

In [13]:
# For every estimator: train it once per balanced sample, predict the test
# split each time, then combine the per-sample predictions by majority vote.
for name, est in estimators.items():

    # One array of test-set predictions per training sample.
    preds = []

    for i, sample in enumerate(resamples):
        # Sample-local names are used on purpose: assigning to `X` / `y` here
        # would overwrite the full feature matrix and labels built during
        # preprocessing and break any cell that is re-run afterwards.
        X_sample = sample.drop(['status_group'], axis=1)
        y_sample = sample['status_group']

        est.fit(X_sample, y_sample)
        preds.append(est.predict(X_test))

        # Replace the previous progress message instead of appending to it.
        clear_output(wait=True)
        print(f"Sample: #{i}")
        print(f"Estimator: {name}")

    # Tally, for every test record, how many sample-models voted for each class.
    counters = [Counter(votes) for votes in zip(*preds)]

    # Majority vote. Ties on the vote count are broken in favour of the
    # larger class code, because the key compares (count, label).
    counter_modes = [
        max(counter.items(), key=lambda item: (item[1], item[0]))[0]
        for counter in counters
    ]

    matrix = pd.DataFrame(confusion_matrix(y_test, counter_modes))
    report = pd.DataFrame(classification_report(y_test, counter_modes, output_dict=True))

    # Stored as [per-sample predictions, voted predictions, confusion matrix, report].
    ap6_dict[name] = [preds, counter_modes, matrix, report]

Sample: #23
Estimator: Stacking Classifier


# Looking at All of our Approaches

In [1]:
# Result dictionaries of Approaches #1-#5, paired with the caption shown above
# each report. Approach #6 (whose report sits at index 3 of each ap6_dict
# entry) is not included in this comparison.
approach_results = [ap1_dict, ap2_dict, ap3_dict, ap4_dict, ap5_dict]
captions = [
    "original data",
    "resampled data",
    "resampled data (n = 10000)",
    "resampled data (n = 15000)",
    "resampled data (n = 20000)",
]

# For each estimator, collect its classification report (index 2 of the
# stored list) from every approach, in approach order.
master_dict = {
    name: [results_by_name[name][2] for results_by_name in approach_results]
    for name in estimators.keys()
}

print("\n")
for name, results in master_dict.items():
    print(color.BOLD + name + color.END)
    for approach_report, caption in zip(results, captions):
        display(approach_report.style.set_caption(caption))
    print("\n\n")

NameError: name 'estimators' is not defined