In [1]:
# Importing libraries
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from model_selection import ModelSelection
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Model selection
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix
)

In [2]:
# Read the preprocessed training table, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/processed/train.csv")
df.head(n=5)

Unnamed: 0,college,income,overage,leftover,house,handset_price,over_15mins_calls_per_month,average_call_duration,leave,reported_satisfaction_avg,...,reported_usage_level_avg,reported_usage_level_high,reported_usage_level_little,reported_usage_level_very_high,reported_usage_level_very_little,considering_change_of_plan_actively_looking_into_it,considering_change_of_plan_considering,considering_change_of_plan_never_thought,considering_change_of_plan_no,considering_change_of_plan_perhaps
0,1,98418,65,0,226957,394,5,11,0,0,...,0,0,0,1,0,0,1,0,0,0
1,1,135755,0,0,377016,535,29,14,0,0,...,1,0,0,0,0,0,1,0,0,0
2,0,149358,212,63,634847,558,20,1,1,0,...,0,0,1,0,0,0,0,1,0,0
3,1,35289,207,22,873341,202,22,6,0,1,...,0,1,0,0,0,0,1,0,0,0
4,1,83384,36,0,178986,275,0,12,0,0,...,0,0,0,1,0,0,0,0,1,0


In [3]:
# Columns that are run through the StandardScaler further down.
cols = [
    "income",
    "overage",
    "leftover",
    "house",
    "handset_price",
    "over_15mins_calls_per_month",
    "average_call_duration",
]

In [4]:
# Separate the target from the predictors.
# "Unnamed: 0" appears in the loaded frame's header and looks like a row index that
# was written out with the CSV rather than a real predictor, so it is dropped as
# well (errors="ignore" keeps this cell working if that column is absent).
X = df.drop(columns=["leave", "Unnamed: 0"], errors="ignore")
y = df["leave"]

# Split BEFORE scaling so that the held-out rows play no part in fitting the scaler.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Work on explicit copies so the assignments below never write into `X` / `df`;
# `df` itself is left unscaled, which also makes this cell safe to re-run.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn mean/variance from the training rows only, then apply that same transform
# to the test rows. Fitting on the full frame would leak test-set statistics into
# the training features.
scaler = StandardScaler()
X_train[cols] = scaler.fit_transform(X_train[cols])
X_test[cols] = scaler.transform(X_test[cols])

In [6]:
# Candidate classifiers passed to ModelSelection, keyed by a short name.
# Every estimator here that accepts a seed is given random_state=42 so repeated
# runs are comparable (GaussianNB and KNeighborsClassifier take no seed).
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42),
    "NaiveBayes": GaussianNB(),
    "KNearestNeighbor": KNeighborsClassifier(),
    # warm_start stays at its default (False). With warm_start=True a later fit()
    # on the same instance keeps the already-built trees instead of refitting from
    # scratch, which would carry state over between fits on different data.
    "GradientBoost": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(
        random_state=42,
        # Depth-2 decision tree used as the boosted base estimator.
        estimator=DecisionTreeClassifier(
            random_state=42,
            max_depth=2,
            min_samples_leaf=3,
            min_samples_split=3,
        ),
    ),
}

In [7]:
# Wrap the candidate dict in the project's ModelSelection helper and run its search
# on the training split.
# NOTE(review): fit_search lives in the local model_selection module, which is not
# shown here — presumably a per-model hyperparameter search; confirm its return type.
# The saved output of this cell is a KeyboardInterrupt, so best_models was not
# assigned in that recorded run.
model_selector = ModelSelection(models)
best_models = model_selector.fit_search(X_train, y_train)

Output()

KeyboardInterrupt: 

In [None]:
# NOTE(review): best_search_ is defined on the local ModelSelection class (not shown);
# presumably it reports the outcome of the search held in best_models — confirm.
model_selector.best_search_(best_models)

In [None]:
# Pass both the training and the held-out split, plus the searched models, to
# ModelSelection.search_metrics.
# NOTE(review): the method is defined in the local model_selection module (not shown);
# presumably it computes train/test evaluation metrics per model — confirm.
model_selector.search_metrics(X_train, y_train, X_test, y_test, best_models)