In [3]:
import numpy as np
import pandas as pd
import os
import pickle

from datetime import timedelta, datetime
from random import randrange
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from time import time

In [5]:
# Project root: every relative path used by the following cells is resolved
# against this directory. It can be overridden with the PREDICTIVE_QC_ROOT
# environment variable; when the variable is unset the original workstation
# path is used, so existing behaviour is unchanged.
PROJECT_ROOT = os.environ.get(
    'PREDICTIVE_QC_ROOT',
    r'C:/Users/mmandziej001/Desktop/FCU/SCRIPTS/predictive_qc_lion_king')
os.chdir(PROJECT_ROOT)

In [6]:
# load normalized train and test data
# Forward slashes are used (as for the Boruta workbook path) so the relative
# paths resolve on every platform, not only on Windows.
train_data = pd.read_csv('model_training/data/PC/dataset_dummy_grouped_time_train_cat8.csv')
test_data = pd.read_csv('model_training/data/PC/dataset_dummy_grouped_time_test_cat8.csv')

# For each split print: number of rows, sum of the 'Label' column (the number
# of positives if labels are 0/1 -- TODO confirm) and that sum as a share of
# the rows, rounded to three decimals.
for split in (train_data, test_data):
    n_rows, label_sum = len(split), split['Label'].sum()
    print(n_rows, label_sum, round(label_sum / n_rows, 3))

1452 44 0.03
363 14 0.039


In [8]:
# Read the Boruta variable-importance workbook and order its rows by
# 'normHits', then 'meanImp', both ascending.
boruta_importance = pd.read_excel(
    'model_training/data/ai_assistant_dumps/PC/8_FATCA_CRS/Boruta_variable_importance_pc.xlsx'
).sort_values(by=['normHits', 'meanImp'], ascending=True)

# Column 'V7' holds the names later used as feature columns; keep the ones
# whose 'decision' is 'Confirmed'.
is_confirmed = boruta_importance['decision'] == 'Confirmed'
boruta_features = list(boruta_importance.loc[is_confirmed, 'V7'])

# Every name from 'V7', confirmed or not, in the same sorted order.
boruta_pred_power = list(boruta_importance['V7'])
print(boruta_features)

['TLAssignedName_Armannsson__G___Gabriela_', 'ScreenedParties', 'COFATCA/CRS_Major_last_10_checklists', 'PCFATCA/CRS_Major_last_5_checklists', 'CODocumentation_standard_Minor_last_5_checklists', 'COClient_Outreach_Major_last_5_checklists', 'PCDocumentation_standard_Major_last_10_checklists', 'DRClient_Outreach_Major_last_5_checklists', 'PCDocumentation_standard_Minor_last_10_checklists', 'PCDocumentation_standard_Major_last_5_checklists', 'PCDocumentation_standard_Minor_last_5_checklists', 'TeamExperience', 'PCFATCA/CRS_Critical_last_5_checklists', 'ProjectExperience', 'PCFATCA/CRS_Critical_last_10_checklists', 'DRClient_Outreach_Major_last_10_checklists', 'CRS_FALSE', 'CRS_TRUE', 'FATCA_FALSE', 'FATCA_TRUE']


In [9]:
# Detach the identifier, label and datestamp columns from both frames.
# `pop` removes the column from the frame in place and returns it, so this
# cell can run only once per load of the data (a second run raises KeyError).
train_id, test_id = train_data.pop('Unique'), test_data.pop('Unique')
train_labels, test_labels = train_data.pop('Label'), test_data.pop('Label')
train_datestamp, test_datestamp = train_data.pop('datestamp'), test_data.pop('datestamp')

In [10]:
### LOGISTIC REGRESSION - BEST MODEL SEARCH
# Restrict both design matrices to the Boruta-confirmed columns.
X_train, X_test = train_data[boruta_features], test_data[boruta_features]

print("Computing regularization path ...")
# Candidate values for the `C` argument of LogisticRegression:
# 1.0, 1.5, 2.0, ..., 20.0 (39 values).
C = [(step + 1) / 2 for step in range(1, 40)]
# Filled by the search loop with one dict per retained model.
results = []

Computing regularization path ...


In [11]:
# Grid search over penalty type, class weights and C. Every fitted model is
# scored by ROC AUC on both splits; only models whose train and test AUC
# differ by less than MAX_AUC_GAP are recorded.
# NOTE(review): the test split is used to filter candidates here, so the test
# AUC of whichever model is finally chosen is an optimistic estimate.
MAX_AUC_GAP = 0.015

# Column layout of the results table. Note the historical key names: 'model'
# stores the penalty type and 'penalty' stores the value of C.
RESULT_COLUMNS = ['model', 'penalty', 'auc_train', 'auc_test', 'overfit',
                  'pos_weight', 'neg_weight', 'coefs']

# Start from an empty list so that re-running this cell (e.g. after an
# interrupted run) does not append duplicates of earlier candidates.
results = []

for pen in ['l1', 'l2']:
    for pos_weight, neg_weight in zip([0.9, 0.91, 0.92, 0.93], [0.1, 0.09, 0.08, 0.07]):
        for c in C:
            # random_state fixes the data shuffling done by the liblinear
            # solver, making repeated runs return the same coefficients.
            clf = linear_model.LogisticRegression(
                penalty=pen, solver='liblinear', tol=1e-6, max_iter=int(1e6),
                C=c, intercept_scaling=10000., random_state=0,
                class_weight={0: neg_weight, 1: pos_weight})

            clf.fit(X_train, train_labels)
            train_prob = clf.predict_proba(X_train)[:, 1]
            test_prob = clf.predict_proba(X_test)[:, 1]
            auc_train = roc_auc_score(train_labels, train_prob)
            auc_test = roc_auc_score(test_labels, test_prob)
            print('Penalty:', pen, 'C:', c, 'auc_train', auc_train, 'auc_test', auc_test)
            if abs(auc_train - auc_test) < MAX_AUC_GAP:
                results.append({'model': pen,
                                'penalty': c,
                                'auc_train': auc_train,
                                'auc_test': auc_test,
                                'overfit': round(auc_train - auc_test, 3),
                                'pos_weight': pos_weight,
                                'neg_weight': neg_weight,
                                'coefs': clf.coef_})

# Passing the column names explicitly keeps the frame well-formed (and
# sortable) even when no candidate passed the gap filter.
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
results_df = results_df.sort_values(by=['auc_train'], ascending=False)

Penalty: l1 C: 1.0 auc_train 0.8979129003099174 auc_test 0.8956201391731478
Penalty: l1 C: 1.5 auc_train 0.8988329674586776 auc_test 0.8933688088415883
Penalty: l1 C: 2.0 auc_train 0.8995351239669421 auc_test 0.8915268112975849


KeyboardInterrupt: 

In [None]:
# Print (index, auc_train, auc_test) for every recorded candidate whose train
# AND test AUC both exceed the threshold.
# NOTE(review): idx is the position in the `results` list, which is the row
# *label* of `results_df` (sorted by auc_train), not its row position.
AUC_THRESHOLD = 0.765
for idx, candidate in enumerate(results):
    if candidate['auc_train'] > AUC_THRESHOLD and candidate['auc_test'] > AUC_THRESHOLD:
        print(idx, candidate['auc_train'], candidate['auc_test'])

In [None]:
### run best model
# Positional row of the sorted `results_df` whose coefficients decide which
# features are kept for the final model.
# NOTE(review): the hyper-parameters passed to LogisticRegression below are
# hard-coded rather than read from this row -- confirm that they describe the
# same candidate.
BEST_MODEL_ROW = 28
best_row_coefs = results_df.iloc[BEST_MODEL_ROW]['coefs'][0]

# Keep only the features to which the selected candidate gave a non-zero
# weight (feature name -> coefficient).
coefs = {feature: coef
         for feature, coef in zip(boruta_features, best_row_coefs)
         if coef != 0}
# Plain list of column names; used as an explicit column indexer.
selected_features = list(coefs)

# random_state fixes the data shuffling done by the liblinear solver so the
# fitted model is reproducible.
best_model = linear_model.LogisticRegression(penalty='l1', solver='liblinear',
                                      C=10, tol=1e-6, max_iter=int(1e6),
                                      intercept_scaling=10000.,
                                      random_state=0,
                                      class_weight={0: 0.08,
                                                    1: 0.92})

best_model.fit(X_train[selected_features], train_labels)

In [None]:
def _gain_table(ids, datestamps, predictions_df, labels):
    """Build the cumulative-gain table for one data split.

    Parameters
    ----------
    ids, datestamps, labels : pandas.Series
        Per-row identifier, datestamp and label columns; `labels` must be
        named 'Label'. They are aligned with `predictions_df` on the index.
    predictions_df : pandas.DataFrame
        Single-column frame (column label 0) holding the predicted scores.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by descending 'Score' with a fresh 0..n-1 index and the
        extra columns 'Sum' (running total of 'Label'), 'X_coordinates'
        (row rank divided by the number of rows) and 'y_coordinates'
        ('Sum' divided by its maximum; NaN if that maximum is 0).
    """
    table = pd.concat([ids, datestamps, predictions_df, labels], axis=1)
    table = table.rename(columns={0: "Score"})
    table = table.sort_values(by='Score', axis=0, ascending=False)

    # Running total of labels in score order, computed before the index reset.
    table['Sum'] = table['Label'].cumsum()
    table = table.reset_index(drop=True)

    n_rows = table.shape[0]
    table['X_coordinates'] = np.arange(1, n_rows + 1) / n_rows
    table['y_coordinates'] = table['Sum'] / table['Sum'].max()
    return table


# Score both splits with the final model, using only the features it was
# fitted on (the keys of `coefs`).
model_features = list(coefs)
train_prob = best_model.predict_proba(X_train[model_features])[:, 1]
test_prob = best_model.predict_proba(X_test[model_features])[:, 1]

train_predictions_df = pd.DataFrame(train_prob)
test_predictions_df = pd.DataFrame(test_prob)

# test results
test_results = _gain_table(test_id, test_datestamp, test_predictions_df, test_labels)

# train results
train_results = _gain_table(train_id, train_datestamp, train_predictions_df, train_labels)

auc_test = roc_auc_score(test_labels, test_prob)
auc_train = roc_auc_score(train_labels, train_prob)
print(auc_train, auc_test)

In [None]:
# Persist the search results, the scored train/test tables and the fitted
# model. Forward slashes keep the relative paths valid on every platform.
results_df.to_excel('model_training/results/PC/lr/penalized_LR_results.xlsx')
test_results.to_csv('model_training/results/PC/lr/lr_test_results.csv')
train_results.to_csv('model_training/results/PC/lr/lr_train_results.csv')

# The context manager closes the file handle even if pickling raises.
with open('model_training/results/PC/penalized_LR_results.sav', 'wb') as model_file:
    pickle.dump(best_model, model_file)