In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC,LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

import pickle
import pandas as pd

# Notebook display settings: show at most 35 rows and never truncate columns.
pd.set_option('display.max_rows', 35)
pd.set_option('display.max_columns', None)

In [None]:
def get_pickles():
    """Load the pickled basin train/test split and the full basin dataset.

    Six pickle files are read from ``../../data/processed/pickles/`` (resolved
    against the current working directory): ``basin_x_train.p``,
    ``basin_x_test.p``, ``basin_y_train.p``, ``basin_y_test.p``,
    ``basin_X.p`` and ``basin_y.p``.

    Returns:
        tuple: ``((x_train, x_test, y_train, y_test), (X, y))`` holding the
        unpickled objects in that order.

    Raises:
        OSError: if one of the files cannot be opened.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by this project.
    pickle_dir = '../../data/processed/pickles'

    def _load(stem):
        # The context manager closes the handle even if unpickling fails;
        # the previous bare open() calls left every handle to the GC.
        with open(f'{pickle_dir}/basin_{stem}.p', 'rb') as handle:
            return pickle.load(handle)

    x_train, x_test, y_train, y_test, X, y = (
        _load(stem) for stem in ('x_train', 'x_test', 'y_train', 'y_test', 'X', 'y')
    )

    return (x_train, x_test, y_train, y_test), (X, y)

# Load the pooled train/test split together with the full dataset.
(x_train, x_test, y_train, y_test), (X, y) = get_pickles()

unique_basin = X.basin.unique()
print(f'Original X: {X.shape}\tOriginal y: {y.shape}')
print(f'Train X: {x_train.shape}\tTrain y: {y_train.shape}')
print(f'Test X: {x_test.shape}\tTest y: {y_test.shape}')

# Pull the `target` column out of each label frame as a flat 1-D array.
y_train = y_train.target.values.ravel()
y_test = y_test.target.values.ravel()
y = y.target.values.ravel()

# Standardise the continuous features. The scaler is fitted on the training
# rows only and then applied unchanged to the test rows, so both splits share
# one scale (previously x_test was left unscaled).
numeric_cols = ['amount_tsh', 'gps_height', 'population', 'time_passed']
standard = StandardScaler()
x_train[numeric_cols] = standard.fit_transform(x_train[numeric_cols])
x_test[numeric_cols] = standard.transform(x_test[numeric_cols])

In [None]:
# Attach each label vector to its feature frame as a `target` column.
for frame, labels in ((X, y), (x_test, y_test), (x_train, y_train)):
    frame['target'] = labels

In [None]:
import os

# Tune one random forest per basin with a randomised hyper-parameter search.
# Every fitted search object is pickled on its own, and all of them (together
# with the scaled splits they were fitted/evaluated on) are pickled in one dict.
# NOTE(review): no random_state is set on the forest, the CV splitter or the
# search, so results differ between runs.

# Folder holding one sub-directory of pickled splits per basin.
basin_root = '../../data/processed/pickles/basins/'
# Continuous features that are standardised before fitting.
scale_cols = ['amount_tsh', 'gps_height', 'population', 'time_passed', 'longitude', 'latitude']

# Random-forest search space; identical for every basin, so built once.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [100, 250, 300, 350, 450],
    'min_samples_split': [5, 8, 10, 15],
    'min_samples_leaf': [3, 5, 10, 15],
    'max_features': ['sqrt', 'log2'],
}

scores = []
basins = []
basin_model_dict = {}
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# processing (and later plotting) order reproducible.
for basin in sorted(os.listdir(basin_root)):
    # Skip hidden entries such as .DS_Store and anything that is not a folder.
    if basin.startswith('.') or not os.path.isdir(os.path.join(basin_root, basin)):
        continue
    basins.append(basin)

    # Load the four pickled pieces of this basin's split, closing each file.
    split = {}
    for part in ('x_train', 'x_test', 'y_train', 'y_test'):
        with open(f'{basin_root}{basin}/{part}.p', 'rb') as handle:
            split[part] = pickle.load(handle)

    # The basin column is constant within a basin folder's data, so drop it.
    x_train = split['x_train'].drop('basin', axis=1)
    x_test = split['x_test'].drop('basin', axis=1)
    y_train = split['y_train'].values.ravel()
    y_test = split['y_test'].values.ravel()

    # Fit the scaler on the training rows only, then apply the SAME statistics
    # to the test rows. Calling fit_transform on x_test (as before) scaled the
    # test set with its own mean/std, i.e. differently from the training data.
    standard = StandardScaler()
    x_train[scale_cols] = standard.fit_transform(x_train[scale_cols])
    x_test[scale_cols] = standard.transform(x_test[scale_cols])

    rf = RandomForestClassifier()
    cv = RepeatedStratifiedKFold(n_splits=7, n_repeats=5)
    gs = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, cv=cv,
                            n_jobs=13, verbose=2, n_iter=75)
    gs.fit(x_train, y_train)

    with open(f'../../models/Basins/{basin}_GridSearch.p', 'wb') as handle:
        pickle.dump(gs, handle)

    basin_dict = dict(model=gs, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)
    basin_model_dict[basin] = basin_dict

with open('../../models/Basins/TunedModels.p', 'wb') as handle:
    pickle.dump(basin_model_dict, handle)
    

In [None]:
# Report, for every basin, the best estimator found by its search.
for basin_key, basin_entry in basin_model_dict.items():
    print(basin_key)
    print(basin_entry['model'].best_estimator_)

In [None]:
# Full basin name -> short code.
orig_dict = {
    'Lake Nyasa': 'nya',
    'Lake Victoria': 'vic',
    'Pangani': 'pang',
    'Ruvuma / Southern Coast': 'ruv',
    'Internal': 'int',
    'Lake Tanganyika': 'tang',
    'Wami / Ruvu': 'wami',
    'Rufiji': 'ruf',
    'Lake Rukwa': 'rukwa',
}

# Reverse lookup: short code -> full basin name.
new_dict = dict(zip(orig_dict.values(), orig_dict.keys()))

In [None]:
# Draw one row-normalised confusion matrix per basin on a fixed 3x3 grid and
# keep the raw (count) confusion matrices in cm_dict, keyed by basin.
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(12, 7))
cm_dict = {}
for idx, (i, basin_dict) in enumerate(basin_model_dict.items()):
    model = basin_dict['model'].best_estimator_
    x_test, y_test = basin_dict['x_test'], basin_dict['y_test']

    # divmod(idx, 3) -> (row, column) of the panel on the grid.
    panel = ax[divmod(idx, 3)]
    plot_confusion_matrix(model, x_test, y_test,
                          normalize='true', display_labels=['F', 'R'], ax=panel)
    panel.set_title(f'{new_dict[i]}')

    cm_dict[i] = confusion_matrix(y_test, model.predict(x_test))

plt.tight_layout()
plt.savefig('figures/BASINS_confusion_matrix.png')

In [None]:
# Persist the per-basin confusion matrices. The context manager flushes and
# closes the file deterministically instead of leaving the handle to the GC.
with open('../../models/Basins/BasinCMDict.p', 'wb') as handle:
    pickle.dump(cm_dict, handle)


In [None]:
# Re-evaluate each basin's best estimator with repeated stratified 10-fold CV
# on the basin's full data (stored train and test rows stacked together).
# final_results maps basin key -> array of per-fold accuracy scores.
with open('../../models/Basins/TunedModels.p', 'rb') as handle:
    basin_model_dict = pickle.load(handle)

pbar = tqdm(basin_model_dict.keys())
final_results = {}
for bas in pbar:
    pbar.set_description(f'KFold: {bas}')
    entry = basin_model_dict[bas]
    model = entry['model'].best_estimator_
    x_train = entry['x_train']
    x_test = entry['x_test']
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # row-wise stacking and works on older pandas as well.
    X = pd.concat([x_train, x_test], ignore_index=True)

    y_train = pd.DataFrame(entry['y_train'], columns=['target'])
    y_test = pd.DataFrame(entry['y_test'], columns=['target'])
    y = pd.concat([y_train, y_test], ignore_index=True)

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10)
    # error_score='raise': fail loudly on a broken fit instead of recording NaN.
    scores = cross_val_score(model, X, y.target.values, scoring='accuracy', cv=cv,
                             n_jobs=12, error_score='raise')
    final_results[bas] = scores
    


In [None]:
# Show the per-basin accuracy scores gathered by the repeated k-fold cell.
final_results

In [None]:
# Box plot of the cross-validated accuracy distribution, one box per entry of
# final_results, labelled with the full name looked up in new_dict.
names, results = [], []
for code, fold_scores in final_results.items():
    names.append(new_dict[code])
    results.append(fold_scores)

box_fig, box_ax = plt.subplots(figsize=(15, 5))
box_ax.boxplot(results, labels=names)
box_ax.set_title('Accuracy By Region')
box_ax.set_xlabel('Region')
box_ax.set_ylabel('Accuracy')
plt.xticks(rotation=45)
box_fig.tight_layout()
box_fig.savefig('figures/ByRegionKFold.png')
plt.show()

In [None]:
import os
from mlxtend.classifier import StackingCVClassifier

# Tune one stacked classifier per basin (decision tree + random forest as base
# learners, logistic regression as meta-classifier) with a randomised search
# over all three models' hyper-parameters.
# NOTE(review): each search is pickled to '<basin>_GridSearch.p', the same path
# the random-forest search cell writes to, so running this cell replaces those
# files. Confirm that overwriting is intended.

# Folder holding one sub-directory of pickled splits per basin.
basin_root = '../../data/processed/pickles/basins/'
# Continuous features that are standardised before fitting.
scale_cols = ['amount_tsh', 'gps_height', 'population', 'time_passed', 'longitude', 'latitude']

# Part of the search space that is valid for every meta-classifier setup.
# NOTE(review): in the scikit-learn versions that accept it, 'auto' is an alias
# of 'sqrt' for classifiers, so 'sqrt' is effectively sampled twice. Confirm
# that weighting is intended.
shared_params = {
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
    'decisiontreeclassifier__splitter': ['best', 'random'],
    'decisiontreeclassifier__max_depth': [None, 125, 500, 1000],
    'decisiontreeclassifier__min_samples_split': [8, 10, 15],
    'decisiontreeclassifier__min_samples_leaf': [3, 5, 10],
    'decisiontreeclassifier__max_features': ['auto', 'sqrt', 'log2'],
    'decisiontreeclassifier__max_leaf_nodes': [None, 25, 50],

    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__max_depth': [None, 50, 100, 150, 200],
    'randomforestclassifier__min_samples_split': [8, 10, 15],
    'randomforestclassifier__min_samples_leaf': [3, 5, 10],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2'],
    'randomforestclassifier__max_leaf_nodes': [None, 25, 50],
    'randomforestclassifier__bootstrap': [False, True],

    'meta_classifier__C': [.1, .5, 1, 1.25, 1.5, 2],
    'meta_classifier__max_iter': [100, 1000, 2000],
}

# LogisticRegression rejects some solver/penalty/multi_class combinations
# (e.g. 'l1' with lbfgs/newton-cg/sag, 'multinomial' with liblinear). Sampling
# them independently made a large share of the few search iterations fail, so
# the meta-classifier options are grouped into compatible sets. The values
# covered are the same as before, minus the invalid pairings.
meta_groups = [
    {'meta_classifier__solver': ['saga'],
     'meta_classifier__penalty': ['l1', 'l2'],
     'meta_classifier__multi_class': ['ovr', 'multinomial']},
    {'meta_classifier__solver': ['liblinear'],
     'meta_classifier__penalty': ['l1', 'l2'],
     'meta_classifier__multi_class': ['ovr']},
    {'meta_classifier__solver': ['sag', 'lbfgs', 'newton-cg'],
     'meta_classifier__penalty': ['l2'],
     'meta_classifier__multi_class': ['ovr', 'multinomial']},
]
# A list of dicts: the search first picks one dict, then samples inside it.
params = [{**shared_params, **group} for group in meta_groups]

scores = []
basins = []
basin_model_dict = {}
# sorted(): os.listdir returns entries in arbitrary order.
for basin in sorted(os.listdir(basin_root)):
    # Skip hidden entries such as .DS_Store and anything that is not a folder.
    if basin.startswith('.') or not os.path.isdir(os.path.join(basin_root, basin)):
        continue
    basins.append(basin)

    # Load the four pickled pieces of this basin's split, closing each file.
    split = {}
    for part in ('x_train', 'x_test', 'y_train', 'y_test'):
        with open(f'{basin_root}{basin}/{part}.p', 'rb') as handle:
            split[part] = pickle.load(handle)

    x_train = split['x_train'].drop('basin', axis=1)
    x_test = split['x_test'].drop('basin', axis=1)
    y_train = split['y_train'].values.ravel()
    y_test = split['y_test'].values.ravel()

    # Fit the scaler on the training rows only and re-use its statistics for
    # the test rows; fit_transform on x_test (as before) scaled the test set
    # with its own mean/std.
    standard = StandardScaler()
    x_train[scale_cols] = standard.fit_transform(x_train[scale_cols])
    x_test[scale_cols] = standard.transform(x_test[scale_cols])

    classifiers = [DecisionTreeClassifier(), RandomForestClassifier()]
    sclf = StackingCVClassifier(classifiers=classifiers, meta_classifier=LogisticRegression(),
                                random_state=10)

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
    gs = RandomizedSearchCV(estimator=sclf, param_distributions=params, cv=cv,
                            n_jobs=13, verbose=1, n_iter=13)
    gs.fit(x_train, y_train)

    with open(f'../../models/Basins/{basin}_GridSearch.p', 'wb') as handle:
        pickle.dump(gs, handle)

    basin_dict = dict(model=gs, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)
    basin_model_dict[basin] = basin_dict
    

    