In [85]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md
from matplotlib import pyplot as plt
from IPython.display import clear_output

In [86]:
# Read the raw CSVs: training features, the training target column, and the test features.
X = pd.read_csv("tanzanian_water_wells/X_train.csv")
y = pd.read_csv("tanzanian_water_wells/y_train.csv").loc[:, 'status_group']
testing = pd.read_csv("tanzanian_water_wells/X_test.csv")

In [87]:
# Columns removed from both feature tables; one list keeps train and test in sync.
dropped_columns = ['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by']
X = X.drop(columns=dropped_columns)
testing = testing.drop(columns=dropped_columns)

# Preprocessing: fill missing values, encode categorical features, and scale numeric features

In [88]:
# Eliminating null values from X_train
# Assign the filled column back rather than calling fillna(inplace=True) on an
# attribute-accessed Series: that chained in-place form is deprecated and leaves X
# unchanged when pandas copy-on-write is active.
X['scheme_management'] = X['scheme_management'].fillna("None")
X['permit'] = X['permit'].fillna('Unknown')
X['public_meeting'] = X['public_meeting'].fillna('Unknown')

# Relabel permit as 'Yes'/'No'/'Unknown'. public_meeting is not relabelled; it is
# stringified with the other categorical columns below.
# NOTE: Series.map returns NaN for values absent from the dict, so re-running this cell
# on an already-relabelled X turns 'Yes'/'No' into NaN -- reload X before re-running.
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})

# Pin dtypes so the numeric/categorical split below is explicit. The two code columns are
# cast to str so that they land in the categorical group.
X = X.astype({
    'gps_height': 'float64',
    'population': 'float64',
    'construction_year': 'int64',
    'region_code': 'str',
    'district_code': 'str',
})

# Everything that is not float64/int64 is treated as categorical and stringified.
X_cat = X.select_dtypes(exclude=['float64', 'int64']).astype('str')
X_numeric = X.select_dtypes(['float64', 'int64'])

# Ordinal-encode the categorical columns, keeping the original index and column names.
oe = OrdinalEncoder()
X_cat = pd.DataFrame(oe.fit_transform(X_cat), index=X_cat.index, columns=X_cat.columns)

# Rescale the numeric columns with MinMaxScaler, keeping the original index and column names.
mms = MinMaxScaler()
X_numeric = pd.DataFrame(mms.fit_transform(X_numeric), columns=X_numeric.columns, index=X_numeric.index)

# Encode the target labels as integers (same re-run caveat as the permit map above).
y = y.map({'functional': 0, 'functional needs repair': 1, 'non functional': 2})

In [89]:
names = ['chi2', 'mi', 'forest']

# Fit three feature scorers on the ordinal-encoded categorical features.
# mutual_info_classif is stochastic, so it is wrapped to pin random_state; without it the
# 'mi' ranking changes from run to run.
functions = [SelectKBest(score_func=chi2, k='all').fit(X_cat, y),
             SelectKBest(score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
                         k='all').fit(X_cat, y),
             RandomForestClassifier(random_state=42, n_jobs=6, class_weight='balanced').fit(X_cat, y)]

# One table per scorer with columns index / feature / score, sorted best-first.
# reset_index() is called without drop=True on purpose: the resulting 'index' column
# records each feature's original column position.
function_results = {}
for name, function in zip(names, functions):
    # SelectKBest exposes scores_; the forest exposes feature_importances_. Checking the
    # attribute explicitly (instead of a bare except) lets real errors surface.
    if hasattr(function, 'scores_'):
        score = function.scores_
    else:
        score = function.feature_importances_
    function_results[name] = (
        pd.DataFrame({'feature': function.feature_names_in_, 'score': score})
        .sort_values(by=['score'], ascending=False)
        .reset_index()
    )

In [90]:
cols = list(X_cat.columns)

# For each scorer, record every feature's position (0 = best) in that scorer's
# best-first table.
rankings = {}
for name in names:
    # `ranked` (not `df`) so that no generic DataFrame name is bound to a temporary table.
    ranked = function_results[name].sort_values(by=['score'], ascending=False).reset_index(drop=True)
    # One pass builds feature -> position; setdefault keeps the first occurrence.
    rank_of = {}
    for position, feature in enumerate(ranked['feature']):
        rank_of.setdefault(feature, position)
    rankings[name] = [rank_of[col] for col in cols]

rankings = pd.DataFrame(rankings)
rankings['feature'] = cols
# Mean rank across all scorers listed in `names` (lower is better).
rankings['average'] = rankings[names].mean(axis=1)
rankings.sort_values(by=['average'])

Unnamed: 0,chi2,mi,forest,feature,average
4,0,2,2,lga,1.333333
18,10,0,1,quantity_group,3.666667
17,11,1,0,quantity,4.0
22,3,4,9,waterpoint_type,5.333333
10,1,5,12,extraction_type_class,6.0
2,7,8,4,region_code,6.333333
1,6,9,5,region,6.666667
8,4,3,14,extraction_type,7.0
9,2,6,13,extraction_type_group,7.0
13,8,10,8,payment,8.666667


# Feature ranking on a subsample: 1,000 'functional' wells plus all 'functional needs repair' wells

In [91]:
# Re-read the raw CSVs so that X, y and testing start again from the untransformed data.
X = pd.read_csv("tanzanian_water_wells/X_train.csv")
y = pd.read_csv("tanzanian_water_wells/y_train.csv").loc[:, 'status_group']
testing = pd.read_csv("tanzanian_water_wells/X_test.csv")

In [92]:
# Remove the same set of columns from the training and the test features.
unused_columns = ['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by']
X = X.drop(columns=unused_columns)
testing = testing.drop(columns=unused_columns)

In [93]:
# Build the subsample in a single step: 1,000 randomly drawn 'functional' rows plus every
# 'functional needs repair' row.
labelled = pd.concat([X, y], axis=1)

# random_state pins the draw so the subsample (and every ranking derived from it) is
# reproducible across runs.
f_sample = labelled[labelled.status_group == 'functional'].sample(n=1000, random_state=42)
fnr_sample = labelled[labelled.status_group == 'functional needs repair']

df = pd.concat([f_sample, fnr_sample])

# From here on X and y refer to the subsample rather than the full training set; reload
# the raw data before re-running this cell.
X = df.drop(['status_group'], axis=1)
y = df['status_group']

# Preprocessing the subsample: fill missing values, encode categorical features, and scale numeric features

In [99]:
# Eliminating null values from X_train
# Write the filled column back explicitly: fillna(inplace=True) on an attribute-accessed
# Series is a chained in-place update, which is deprecated and does not modify X when
# pandas copy-on-write is active.
X['scheme_management'] = X['scheme_management'].fillna("None")
X['permit'] = X['permit'].fillna('Unknown')
X['public_meeting'] = X['public_meeting'].fillna('Unknown')

# permit becomes 'Yes'/'No'/'Unknown'; public_meeting keeps its values and is only
# stringified with the rest of the categorical columns below.
# NOTE: values missing from the dict map to NaN, so this cell must not be re-run on an X
# whose permit column has already been relabelled.
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})

# Explicit dtypes drive the numeric/categorical split; region_code and district_code are
# cast to str so they are handled as categorical.
X = X.astype({
    'gps_height': 'float64',
    'population': 'float64',
    'construction_year': 'int64',
    'region_code': 'str',
    'district_code': 'str',
})

# Non-float64/int64 columns form the categorical block (as strings); the rest is numeric.
X_cat = X.select_dtypes(exclude=['float64', 'int64']).astype('str')
X_numeric = X.select_dtypes(['float64', 'int64'])

# Ordinal-encode categoricals; index and column names are carried over.
oe = OrdinalEncoder()
X_cat = pd.DataFrame(oe.fit_transform(X_cat), index=X_cat.index, columns=X_cat.columns)

# Rescale numerics with MinMaxScaler; index and column names are carried over.
mms = MinMaxScaler()
X_numeric = pd.DataFrame(mms.fit_transform(X_numeric), columns=X_numeric.columns, index=X_numeric.index)

# Integer-encode the target; labels absent from y are simply unused keys.
y = y.map({'functional': 0, 'functional needs repair': 1, 'non functional': 2})

In [100]:
names = ['chi2', 'mi', 'forest']

# Three feature scorers fitted on the encoded categorical features of the current X/y.
# The mutual-information scorer is wrapped so that random_state is fixed; it is otherwise
# non-deterministic.
functions = [SelectKBest(score_func=chi2, k='all').fit(X_cat, y),
             SelectKBest(score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
                         k='all').fit(X_cat, y),
             RandomForestClassifier(random_state=42, n_jobs=6, class_weight='balanced').fit(X_cat, y)]

# Per-scorer table (index / feature / score), best score first. The 'index' column left by
# reset_index() is the feature's original column position.
function_results = {}
for name, function in zip(names, functions):
    # Pick the score attribute the estimator actually has, rather than catching every
    # exception and retrying: SelectKBest -> scores_, forest -> feature_importances_.
    if hasattr(function, 'scores_'):
        score = function.scores_
    else:
        score = function.feature_importances_
    function_results[name] = (
        pd.DataFrame({'feature': function.feature_names_in_, 'score': score})
        .sort_values(by=['score'], ascending=False)
        .reset_index()
    )

In [101]:
cols = list(X_cat.columns)

# Position of every feature (0 = best) in each scorer's best-first table.
rankings = {}
for name in names:
    # Use a dedicated name: binding this temporary table to `df` would overwrite any
    # DataFrame already stored under that name.
    ranked = function_results[name].sort_values(by=['score'], ascending=False).reset_index(drop=True)
    # Single pass to build feature -> position; first occurrence wins.
    rank_of = {}
    for position, feature in enumerate(ranked['feature']):
        rank_of.setdefault(feature, position)
    rankings[name] = [rank_of[col] for col in cols]

rankings = pd.DataFrame(rankings)
rankings['feature'] = cols
# Average rank over every scorer in `names` (lower is better).
rankings['average'] = rankings[names].mean(axis=1)
rankings.sort_values(by=['average'])

Unnamed: 0,chi2,mi,forest,feature,average
4,0,0,0,lga,0.0
3,1,5,1,district_code,2.333333
2,5,2,2,region_code,3.0
0,10,3,4,basin,5.666667
14,7,11,5,payment_type,7.666667
9,3,7,13,extraction_type_group,7.666667
8,9,6,10,extraction_type,8.333333
6,8,10,7,scheme_management,8.333333
1,23,1,3,region,9.0
13,6,16,6,payment,9.333333


In [102]:
# Display the random-forest importance table, best score first. Its 'index' column is the
# row label each feature had before sorting (left behind by reset_index()).
function_results['forest']

Unnamed: 0,index,feature,score
0,4,lga,0.105027
1,3,district_code,0.089271
2,2,region_code,0.073106
3,1,region,0.072547
4,0,basin,0.063151
5,14,payment_type,0.055648
6,13,payment,0.053919
7,6,scheme_management,0.049522
8,22,waterpoint_type,0.045702
9,11,management,0.036792
