In [1]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC,LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd

# Notebook display limits: show at most 35 rows, and never truncate columns.
pd.set_option('display.max_rows', 35)
pd.set_option('display.max_columns', None)

In [2]:
def get_pickles(pickle_dir = '../../data/processed/pickles'):
    """Load the pickled region datasets from ``pickle_dir``.

    Six files are read, named ``region_<name>.p`` for ``<name>`` in
    ``x_train``, ``x_test``, ``y_train``, ``y_test``, ``X`` and ``y``.

    Parameters
    ----------
    pickle_dir : str, optional
        Directory holding the pickle files. Defaults to the project's
        processed-data folder, so ``get_pickles()`` behaves as before.

    Returns
    -------
    tuple
        ``((x_train, x_test, y_train, y_test), (X, y))``.

    Raises
    ------
    FileNotFoundError
        If any of the six files is missing from ``pickle_dir``.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only point this at files
    produced by this project.
    """
    def _load(name):
        # The context manager closes the handle even if unpickling fails.
        with open(f'{pickle_dir}/region_{name}.p', 'rb') as handle:
            return pickle.load(handle)

    x_train = _load('x_train')
    x_test = _load('x_test')
    y_train = _load('y_train')
    y_test = _load('y_test')
    X = _load('X')
    y = _load('y')

    return (x_train, x_test, y_train, y_test), (X,y)

# Pull in the pre-split train/test sets together with the full feature/label frames.
(x_train, x_test, y_train, y_test), (X, y) = get_pickles()

# Distinct region values present in the full feature frame.
unique_regions = X['region'].unique()

# Report shapes now, while the label objects are still two-dimensional.
print(f'Original X: {X.shape}\tOriginal y: {y.shape}')
print(f'Train X: {x_train.shape}\tTrain y: {y_train.shape}')
print(f'Test X: {x_test.shape}\tTest y: {y_test.shape}')

# Replace each label frame with a flat array of its `target` column.
# NOTE: this rebinding makes the cell non-idempotent; re-run it from the load step.
y_train = y_train['target'].values.ravel()
y_test = y_test['target'].values.ravel()
y = y['target'].values.ravel()

# Standardise the listed columns of the training frame in place.
# Only x_train is touched here; x_test and X keep their original scale.
# NOTE(review): if x_test is later scored with a model fit on x_train, it
# presumably needs `standard.transform` on the same columns first - confirm.
scaled_cols = ['amount_tsh', 'gps_height', 'population', 'time_passed']
standard = StandardScaler()
x_train[scaled_cols] = standard.fit_transform(x_train[scaled_cols])

Original X: (57247, 54)	Original y: (57247, 1)
Train X: (51522, 53)	Train y: (51522, 1)
Test X: (5725, 53)	Test y: (5725, 1)


In [3]:
# Attach the labels to the full feature frame so rows can be filtered by region
# together with their label. `y` is already a flat array here (it was raveled
# in the previous cell), so the values are assigned by position, not by index.
# From this point on `target` is a column of X and must be excluded from any
# feature matrix built from it.
X['target'] = y

### Baseline Testing for Individual Models and the Stacked Ensemble

In [None]:

# Candidate estimators, left at their constructor defaults except for
# LinearSVC's raised iteration cap.
models = {'log': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'dt': DecisionTreeClassifier(), 'Gaussian': GaussianNB(), 'LDA': LinearDiscriminantAnalysis(),
          'LinearSVC': LinearSVC(max_iter = 1250), 'SDGSVC': SGDClassifier(),
          'rf': RandomForestClassifier(),
        }

# Stacked ensemble: every base model above feeds a logistic-regression
# meta-learner. Built before 'stacked' is added so it does not contain itself.
stack_m = list(models.items())
stack_model = StackingClassifier(estimators = stack_m, final_estimator = LogisticRegression(), cv = 5)
models['stacked'] = stack_model

# An integer random_state makes every split() call produce the same folds, so a
# single splitter can be shared by all regions and models.
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 5, random_state = 10)

unique_reg = list(enumerate(unique_regions))
pbar = tqdm(unique_reg)
# Maps region -> {'results': [score array per model], 'model_names': [model key per model]}.
final_results = {}
for idx, reg in pbar:
    # Slice the region once; the rows do not depend on the model being evaluated.
    region_rows = X[X.region == reg]
    y_region = region_rows.target.values
    # Drop the label as well as the region id. `target` was added to X in an
    # earlier cell, so leaving it in would hand every model the answer
    # (target leakage) and inflate the accuracies.
    x_region = region_rows.drop(columns = ['region', 'target'])

    # Cross-validate each base model and the stacked ensemble on this region.
    results = []
    model_names = []
    for idx2, (model, m) in enumerate(models.items()):
        pbar.set_description(f'({idx}/{len(unique_regions)}){reg}: Evaluating {model.upper()} {idx2}/{len(models)}')
        # error_score='raise' surfaces a failing fit instead of silently recording NaN.
        scores = cross_val_score(m, x_region, y_region, scoring = 'accuracy', cv = cv, n_jobs = 12,
                                 error_score = 'raise')
        results.append(scores)
        model_names.append(model)
    final_results[reg] = {'results': results, 'model_names': model_names}

    

(5/21)Pwani: Evaluating STACKED 8/9:  24%|██████████████████████████▉                                                                                      | 5/21 [02:01<06:39, 24.95s/it]

In [None]:
# Persist the per-region cross-validation scores. The context manager closes
# the file so the pickle is fully flushed to disk even if dumping fails.
with open('../../models/RegionVanillaResults.p', 'wb') as results_file:
    pickle.dump(final_results, results_file)
final_results