In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, classification_report, f1_score, fbeta_score, make_scorer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

from xgboost import XGBClassifier


pd.set_option('display.float_format', lambda x: '%.2f' % x)
RSEED = 42

In [2]:
df = pd.read_csv('data/cleaned/train.csv')
df_test = pd.read_csv('data/cleaned/test.csv')
df.head()


Unnamed: 0,disrict,client_id,client_catg,region,target,most_frequ_reading_rem,mean_counter_coeff,mean_consommation_per_month,mean_months,elec_max,gaz_max
0,63,train_Client_33962,11,101,0.0,6.0,1.0,113.27,4.0,1,1
1,62,train_Client_32174,11,301,0.0,6.0,1.0,8.11,3.4,1,1
2,69,train_Client_18868,11,107,0.0,6.0,1.0,70.77,4.0,1,1
3,62,train_Client_39728,11,310,0.0,6.0,1.0,144.65,4.11,1,0
4,60,train_Client_34246,11,101,0.0,6.0,1.0,120.93,3.96,1,1


## Split Target, Drop ID

In [3]:
y_train = df['target']
X_train = df.drop(['target', 'client_id'], axis=1)

In [4]:
# Helper: Check specific columns:
X_train.iloc[:,5].sort_values(ascending=False).nunique()

105312

In [5]:
# Helper (compare X_train and X_test below):
X_train.head(2)

Unnamed: 0,disrict,client_catg,region,most_frequ_reading_rem,mean_counter_coeff,mean_consommation_per_month,mean_months,elec_max,gaz_max
0,63,11,101,6.0,1.0,113.27,4.0,1,1
1,62,11,301,6.0,1.0,8.11,3.4,1,1


In [6]:
#Split target from test data
y_test = df_test['target']
X_test = df_test.drop(['target'], axis=1)

In [7]:
# Helper (compare X_train and X_test):
X_test.head(2)

Unnamed: 0,disrict,client_catg,region,reading_remarque,counter_coefficient,months_number,gaz,consommation_per_month,elec
0,69,11,104,8.0,1.0,4.0,0,52.0,1
1,62,11,301,6.0,1.0,4.0,0,117.75,1


## Scaling the data

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- consommation_per_month
- counter_coefficient
- elec
- gaz
- months_number
- ...
Feature names seen at fit time, yet now missing:
- elec_max
- gaz_max
- mean_consommation_per_month
- mean_counter_coeff
- mean_months
- ...


## Random Oversampler

In [None]:
# handling the imbalanced
ros = RandomOverSampler(random_state=RSEED)
X_train, y_train = ros.fit_resample(X_train, y_train)

sns.countplot(x=y_train)

## Baseline model 1: Decision Tree
<span style="color:red">X_train and X_test need to have the same columns!</span>

In [None]:
#baseline model 1: Decision tree
baseline_tree = DecisionTreeClassifier(random_state=RSEED, max_depth=3)
baseline_tree.fit(X_train, y_train)

In [None]:
print(f'Decision tree has {baseline_tree.tree_.node_count} nodes with maximum depth {baseline_tree.tree_.max_depth}.')
print(f'On average there are ca. {X_train.shape[0]/baseline_tree.tree_.node_count:.1f} data points in each leaf.')

In [None]:
fig = plt.figure(figsize=(25,10))
dectree_plot = plot_tree(baseline_tree, filled=True)

In [43]:
# Make probability predictions for X_train
train_probs1 = baseline_tree.predict_proba(X_train)[:, 1]
train_predictions1 = baseline_tree.predict(X_train)

In [None]:
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs1)}')

In [None]:
print(confusion_matrix(y_train, train_predictions1))
print(classification_report(y_train, train_predictions1))

In [None]:
# Make probability predictions test data
test_probs1 = baseline_tree.predict_proba(X_test)[:, 1]
test_predictions1 = baseline_tree.predict(X_test)

In [None]:
print(f'Test ROC AUC Score: {roc_auc_score(y_test, test_probs1)}')

In [None]:
print(confusion_matrix(y_test, test_predictions1))
print(classification_report(y_test, test_predictions1))

# Baseline model 2: Logistic Regression

In [49]:
# baseline 2: logistic Regression

baseline_log_reg = LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=1)
baseline_log_reg.fit(X_train, y_train)

# Make probability predictions for X_train
train_probs2 = baseline_log_reg.predict_proba(X_train)[:, 1]
train_predictions2 = baseline_log_reg.predict(X_train)

In [None]:
# Results X_train prediction:
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs2)}')
print(confusion_matrix(y_train, train_predictions2))
print(classification_report(y_train, train_predictions2))

In [None]:
# Make probability predictions for X_test
test_probs2 = baseline_log_reg.predict_proba(X_test)[:, 1]
test_predictions2 = baseline_log_reg.predict(X_test)

# Results X_test prediction:
print(f'Test ROC AUC Score: {roc_auc_score(y_test, test_probs2)}')
print(confusion_matrix(y_test, test_predictions2))
print(classification_report(y_test, test_predictions2))

### Model 1: Random Forest

In [None]:
# Create the model with 1000 trees
model1 = RandomForestClassifier(n_estimators=1000, 
                               random_state=RSEED, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)

# Fit on training data
model1.fit(X_train, y_train)

In [None]:
# Make predictions for X_train
train_probs_1 = model1.predict_proba(X_train)[:, 1]
train_predictions_1 = model1.predict(X_train)

In [None]:
# Results X_train prediction:
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs_1)}')
print(confusion_matrix(y_train, train_predictions_1))
print(classification_report(y_train, train_predictions_1))

In [None]:
# Make predictions for X_test
test_probs_1 = model1.predict_proba(X_test)[:, 1]
test_predictions_1 = model1.predict(X_test)

In [None]:
# Results X_test prediction:
print(f'Test ROC AUC Score: {roc_auc_score(y_test, test_probs_1)}')
print(confusion_matrix(y_test, test_predictions_1))
print(classification_report(y_test, test_predictions_1))

### Model 2: Stacking
<span style="color:red">Integrate GridSearchCV or RandomizedGridSearchCV!</span>

In [None]:
# Stack the models Decision Tree, KNN and Random Forest
estimators = [
    ('dt', DecisionTreeClassifier(random_state = RSEED)),
    ('knn', KNeighborsClassifier()),
    ('rf', RandomForestClassifier(random_state = RSEED))
]

# Fit model to training data
model2 = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression())
model2.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Make predictions for X_train
train_probs_2 = model2.predict_proba(X_train)[:, 1]
train_predictions_2 = model2.predict(X_train)

In [None]:
# Results X_train prediction:
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs_2)}')
print(confusion_matrix(y_train, train_predictions_2))
print(classification_report(y_train, train_predictions_2))

In [None]:
# Make predictions for X_test
test_probs_2 = model1.predict_proba(X_test)[:, 1]
test_predictions_2 = model1.predict(X_test)

In [None]:
# Results X_test prediction:
print(f'Test ROC AUC Score: {roc_auc_score(y_test, test_probs_2)}')
print(confusion_matrix(y_test, test_predictions_2))
print(classification_report(y_test, test_predictions_2))

### Model 3: Boosting (XGBoost)

# Fit model to training data
model3 = XGBClassifier(random_state=RSEED,
                    n_jobs=-1,
                    n_estimators=1000,
                    learning_rate=0.3,
                    subsample=0.5,
                    )
                    
model3.fit(X_train, y_train)

## Evaluate the models

In [None]:
#NOCH ANZUPASSEN!


def evaluate_model(predictions, probs, train_predictions, train_probs):
    """Compare machine learning model to baseline performance.
    Computes statistics and shows ROC curve."""
    
   baseline_tree = {}
    
   baseline_tree['recall'] = recall_score(y_test, [1 for _ in range(len(y_test))])
   baseline_tree['precision'] = precision_score(y_test, [1 for _ in range(len(y_test))])
   baseline_tree['roc'] = roc_auc_score(y_test, y_probs)

   baseline_log_reg = {}
    
   baseline_log_reg['recall'] = recall_score(y_test, [1 for _ in range(len(y_test))])
   baseline_log_reg['precision'] = precision_score(y_test, [1 for _ in range(len(y_test))])
   baseline_log_reg['roc'] = roc_auc_score(y_test, y_probs)
    
    results = {}
    
    results['recall'] = recall_score(y_test, predictions)
    results['precision'] = precision_score(y_test, predictions)
    results['roc'] = roc_auc_score(y_test, probs)
    
    train_results = {}
    train_results['recall'] = recall_score(train_labels, train_predictions)
    train_results['precision'] = precision_score(train_labels, train_predictions)
    train_results['roc'] = roc_auc_score(train_labels, train_probs)
    
    for metric in ['recall', 'precision', 'roc']:
        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}')
    
    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
    model_fpr, model_tpr, _ = roc_curve(y_test, probs)

    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    
    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend();
    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate'); plt.title('ROC Curves');