In [1]:
# Notebook front-end setting: autosave this notebook every 1 second.
%autosave 1

Autosaving every 1 seconds


In [2]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, chi2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, recall_score, f1_score, accuracy_score, precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import os
import pandas as pd
pd.options.display.max_columns = None
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from IPython.display import display

In [3]:
## Data loading and data splitting functions

# read the csv file and return a dataframe
def data_read_df(path=None):
    """
    Load the customer dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or os.PathLike, optional
        Location of the CSV file. When omitted, 'Telcom.csv' inside the
        'dataset/' directory (relative to the working directory) is read,
        which is what this function always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    if path is None:
        path = os.path.join('dataset/', 'Telcom.csv')
    return pd.read_csv(path)

def data_split_train_and_test(df, test_size):
    """
    Split a labelled frame into train/test parts, stratified on the 'target' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'target' column.
    test_size : float or int
        Forwarded unchanged to StratifiedShuffleSplit(test_size=...).

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames (without 'target') and the matching 'target' Series.
    """
    splitter = StratifiedShuffleSplit(n_splits=2, test_size=test_size, random_state=0)

    # Two splits are generated and only the last one is kept.
    *_, (train_index, test_index) = splitter.split(df, df['target'])

    train_part = df.iloc[train_index]
    test_part = df.iloc[test_index]

    return (
        train_part.drop(columns=['target']),
        test_part.drop(columns=['target']),
        train_part['target'],
        test_part['target'],
    )

## Visualization functions


def visual_generate_bar_char_plot(df, feature_name):
    """
    Draw a count plot of one feature, bars ordered by descending frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature.
    feature_name : str
        Column whose values are counted.
    """
    # Features with more than 20 distinct values get a wider figure.
    distinct_values = len(df[feature_name].unique())
    aspect_ratio = 4 if distinct_values > 20 else 2

    grid = sns.FacetGrid(df, aspect=aspect_ratio)
    bar_order = list(df[feature_name].value_counts().index)
    grid.map(sns.countplot, feature_name, order=bar_order)
    grid.set_xticklabels(rotation=80)
    grid.set_ylabels("count")


def visual_generate_category_target_prob_plot(df, feature_name):
    """
    Bar plot of the 'target' column aggregated per value of a categorical feature.

    Bars are ordered by how often each category occurs (most frequent first).
    Bar heights come from seaborn's default bar aggregation of 'target'
    (the mean, per the seaborn docs); the y axis is labelled accordingly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding `feature_name` and a 'target' column.
    feature_name : str
        Categorical column to group by.
    """
    # Only the aspect ratio depends on cardinality: wide for > 20 distinct values.
    aspect = 4 if len(df[feature_name].unique()) > 20 else 1
    category_order = list(df[feature_name].value_counts().index)

    g = sns.catplot(x=feature_name, y="target", data=df,
                    kind="bar", palette="muted", order=category_order, aspect=aspect)
    g.set_xticklabels(rotation=80)
    g.set_ylabels("Prob. for target 1")
    # Set the limit on this grid's axes instead of relying on pyplot's "current axes".
    g.set(ylim=(0, 1))


def visual_generate_dis_plot(df, feature_name):
    """
    Plot the distribution of one feature in side-by-side panels, one per 'target' value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding `feature_name` and a 'target' column.
    feature_name : str
        Column whose distribution is drawn.
    """
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept as-is
    # because switching to histplot/displot would change the rendered figure.
    target_grid = sns.FacetGrid(df, col='target', aspect=1)
    target_grid.map(sns.distplot, feature_name)
    
## Training Functions

def train_model(ml_algo, df_train, y_train, params=None):
    '''
    Build the requested scikit-learn classifier and fit it on the training data.

    Parameters
    ----------
    ml_algo : str
        One of: 'LogisticRegression' / 'DecisionTree' / 'KNN' / 'RandomForest' /
        'LDA' / 'MultinomialNB' / 'GaussianNB'.
    df_train : array-like
        Training features, passed straight to classifier.fit.
    y_train : array-like
        Training labels, passed straight to classifier.fit.
    params : dict, optional
        Keyword arguments for the classifier constructor. The dict is never
        modified; defaults are filled into a private copy.

    Returns
    -------
    The fitted classifier.

    Raises
    ------
    ValueError
        If `ml_algo` is not one of the supported names.
    '''
    # Copy so the defaults injected below do not leak into the caller's dict
    # (which could otherwise be reused for a different algorithm and break it).
    params = dict(params) if params else {}

    if ml_algo == 'LogisticRegression':
        # NOTE(review): newer scikit-learn releases deprecate `multi_class`;
        # confirm the installed version before removing this default.
        params.setdefault('multi_class', 'auto')
        params.setdefault('solver', 'liblinear')
        classifier = LogisticRegression(**params)
    elif ml_algo == 'DecisionTree':
        classifier = DecisionTreeClassifier(**params)
    elif ml_algo == 'KNN':
        classifier = KNeighborsClassifier(**params)
    elif ml_algo == 'RandomForest':
        params.setdefault('n_estimators', 100)
        classifier = RandomForestClassifier(**params)
    elif ml_algo == 'LDA':
        classifier = LinearDiscriminantAnalysis(**params)
    elif ml_algo == 'MultinomialNB':
        # Honour user-supplied hyper-parameters like every other branch does.
        classifier = MultinomialNB(**params)
    elif ml_algo == 'GaussianNB':
        classifier = GaussianNB(**params)
    else:
        raise ValueError("""The value of the ml-algo parameter should be one of the following:
                            LogisticRegression / DecisionTree / KNN / RandomForest / LDA / MultinomialNB / GaussianNB""")

    classifier.fit(df_train, y_train)

    return classifier

## Evaluation functions

def eval_get_cm(classifier, X, y):
    """
    Return the confusion matrix of `classifier` on (X, y) as a labelled 2x2 DataFrame.

    Rows are labelled 'pred:0' / 'pred:1' and columns 'true:0' / 'true:1'; the
    hard-coded labels mean the result must be 2x2.
    """
    predicted = classifier.predict(X)
    # Predictions are deliberately passed first so that they index the rows,
    # matching the 'pred:*' row labels below.
    counts = confusion_matrix(predicted, y)
    row_labels = ['pred:0', 'pred:1']
    column_labels = ['true:0', 'true:1']
    return pd.DataFrame(counts, index=row_labels, columns=column_labels)

def eval_get_score(classifier, X, y, metric):
    '''
    Score a fitted classifier on (X, y) with a single metric.

    Parameters
    ----------
    classifier : fitted estimator
        Must provide predict(); 'auc' additionally needs predict_proba().
    X : array-like
        Features to predict on.
    y : array-like
        True labels for X.
    metric : str
        metric options:
            'precision' / 'recall' / 'accuracy' / 'auc' / 'f1'

    Returns
    -------
    float
        The requested score.

    Raises
    ------
    ValueError
        If `metric` is unknown, or if 'f1' / 'auc' is requested while `y`
        holds more than two distinct labels.
    '''
    if metric == 'f1' or metric == 'auc':
        # Inspect the labels that were passed in, not a notebook-level `y_test`.
        if len(pd.Series(y).unique()) > 2:
            raise ValueError('You cannot use the metric \'{}\' for multiclass classification tasks'.format(metric))

    if metric == 'accuracy':
        return accuracy_score(y, classifier.predict(X), normalize=True)
    if metric == 'f1':
        return f1_score(y, classifier.predict(X))
    if metric == 'precision':
        return precision_score(y, classifier.predict(X))
    if metric == 'recall':
        return recall_score(y, classifier.predict(X))
    if metric == 'auc':
        # Column 1 of predict_proba is used as the score for the ROC curve.
        predictions_proba = classifier.predict_proba(X)[:, 1]
        fpr, tpr, _ = roc_curve(y, predictions_proba)
        return auc(fpr, tpr)

    raise ValueError("""The value of the metric parameter sholud be one of the following:
                            precision / recall / accuracy / auc / f1""")

def eval_plot_roc_curve(classifier, X, y):
    """
    Plot the ROC curve of a binary classifier on (X, y), with the AUC in the legend.

    Parameters
    ----------
    classifier : fitted estimator
        Must provide predict_proba().
    X : array-like
        Features to predict on.
    y : array-like
        True labels for X; at most two distinct values.

    Raises
    ------
    ValueError
        If `y` holds more than two distinct labels.
    """
    # Inspect the labels that were passed in, not a notebook-level `y_test`.
    if len(pd.Series(y).unique()) > 2:
        raise ValueError('You cannot use this function for multiclass classification tasks')

    # predict probabilities; column 1 is used as the score for the ROC curve
    y_pred_prob = classifier.predict_proba(X)

    fpr, tpr, _ = roc_curve(y, y_pred_prob[:, 1])
    roc_auc = auc(fpr, tpr)

    fig = plt.figure(figsize=[12, 12])
    ax = fig.add_subplot(111, aspect='equal')

    ax.plot(fpr, tpr, lw=2, alpha=0.3,
            label='AUC = ' + str(round(roc_auc, 2)))
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC')
    ax.legend(loc="lower right")

    plt.show()
    
def eval_get_predictions(classifier, X, y):
    """
    Return predicted and true labels side by side, indexed like X.

    The result has columns 'Pred' and 'True_value' and an index named
    'example_index' built from X.index.
    """
    predicted_labels = classifier.predict(X)
    table = pd.DataFrame({
        'example_index': list(X.index),
        'Pred': list(predicted_labels),
        'True_value': list(y),
    })
    return table.set_index('example_index')

def get_predictions_proba(classifier, X, y):
    """
    Return column 1 of predict_proba (rounded to 3 decimals) next to the true labels.

    Parameters
    ----------
    classifier : fitted estimator
        Must provide predict_proba().
    X : pandas.DataFrame
        Features to predict on; its index becomes the result's index.
    y : array-like
        True labels for X.

    Returns
    -------
    pandas.DataFrame
        Columns 'pred' and 'True_value', indexed by 'example_index'.
    """
    # The original referenced a misspelled name here and failed with NameError.
    positive_proba = classifier.predict_proba(X)[:, 1]
    predictions = [round(pred, 3) for pred in list(positive_proba)]
    predictions_df = pd.DataFrame(
        {'example_index': list(X.index), 'pred': predictions, 'True_value': list(y)}).set_index('example_index')
    return predictions_df

## Feature engineering (FE) functions

def FE_encode_values_of_categorical_features(df, columns_to_encode):
    """
    Return a copy of `df` in which each listed column is replaced by LabelEncoder codes.

    The encoder is refitted on every column independently and is not returned,
    so the fitted mapping is not available to the caller afterwards.
    """
    encoded = df.copy()
    encoder = LabelEncoder()
    for column in columns_to_encode:
        encoded[column] = encoder.fit_transform(encoded[column])
    return encoded

def FE_create_one_hot_encodeing(df, columns_to_encode):
    """
    Replace each listed column with one-hot (dummy) columns named '<column>_<value>'.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns_to_encode : list
        Columns to expand.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns in their original order, followed by
        the dummy columns in the order of `columns_to_encode`.
    """
    # Build all dummy blocks first and concatenate once, instead of copying the
    # growing frame on every loop iteration.
    dummy_blocks = [pd.get_dummies(df[column], prefix=column) for column in columns_to_encode]
    widened = pd.concat([df, *dummy_blocks], axis=1)
    return widened.drop(columns=columns_to_encode)


def FE_divide_numeric_feature_to_ranges(df, column_to_divide_to_ranges, number_of_ranges):
    """
    Return a copy of `df` with one numeric column replaced by its pd.cut bins.

    `number_of_ranges` is forwarded to pd.cut as the `bins` argument; the
    input frame is left untouched.
    """
    binned = df.copy()
    original_values = binned[column_to_divide_to_ranges]
    binned[column_to_divide_to_ranges] = pd.cut(original_values, number_of_ranges)
    return binned

## Feature Scaling Functions

def feature_scaling(X_train, X_test, columns_to_scale, scaler_method):
    """
    Scale the selected columns of the train and test frames with one scaler.

    The scaler is fitted on the training columns only and then applied to the
    test columns. Both inputs are left untouched; scaled copies are returned.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that both contain `columns_to_scale`.
    columns_to_scale : list
        Columns to cast to float and scale.
    scaler_method : str
        One of 'Normalizer' / 'StandardScaler' / 'MinMaxScaler'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scaled copies of X_train and X_test.

    Raises
    ------
    ValueError
        If `scaler_method` is not one of the supported names.
    """
    # Validate the method before doing any copying work.
    if scaler_method == 'Normalizer':
        # NOTE(review): scikit-learn's Normalizer rescales each row (sample), not
        # each column — confirm that is what callers expect here.
        scaler = Normalizer()
    elif scaler_method == 'StandardScaler':
        scaler = StandardScaler()
    elif scaler_method == 'MinMaxScaler':
        scaler = MinMaxScaler()
    else:
        # The message names the real parameter and the real option spellings.
        raise ValueError(
            "The value of the scaler_method parameter should be one of the following: "
            "Normalizer / StandardScaler / MinMaxScaler")

    X_train_to_return = X_train.copy()
    X_test_to_return = X_test.copy()

    # Cast to float first so the scaled (float) values can be written back.
    for column in columns_to_scale:
        X_train_to_return[column] = X_train_to_return[column].astype(float)
        X_test_to_return[column] = X_test_to_return[column].astype(float)

    X_train_to_return[columns_to_scale] = scaler.fit_transform(X_train_to_return[columns_to_scale])
    X_test_to_return[columns_to_scale] = scaler.transform(X_test_to_return[columns_to_scale])

    return X_train_to_return, X_test_to_return

**Task instructions**

* This experiment will require you to develop a predictive model.

* We will provide a structured dataset (CSV file) which includes information about a telecom company’s customers. The company provides telecom and media services (e.g. phone / Internet access and streaming of TV and movies). The target column (what should be predicted) is whether a customer left in the past month.

* You will be asked to try and develop the best possible model for this scenario.

* Your model’s performance will be evaluated on a separate set of test examples.

<a href="https://docs.google.com/document/d/12cTcNpGmuUReuIrGNvo9C1pPjXtSUWEs0G8TWT-9zGk/edit#heading=h.x54udswtnv63">Link to API</a>

**Dataset Description:**<br/>

**customerID:** The customer's ID<br/>
**gender:** Gender of the customer<br/>
**SeniorCitizen:** Whether or not this customer is a senior citizen (0, 1)<br/>
**Partner:** Whether the customer has a partner or not (Yes, No)<br/>
**Dependents:** Whether the customer has dependents or not (Yes, No) <br/>
**tenure:** Number of months the customer has stayed with the company <br/>
**PhoneService:** Whether the customer has a phone service or not (Yes, No)<br/>
**MultipleLines:** Whether the customer has multiple lines or not (Yes, No, No phone service) <br/>
**InternetService:** Customer’s internet service provider (DSL, Fiber optic, No) <br/>
**OnlineSecurity:** Whether the customer has online security or not (Yes, No, No internet service)<br/>
**OnlineBackup:** Whether the customer has online backup or not (Yes, No, No internet service) <br/>
**DeviceProtection:** Whether the customer has device protection or not (Yes, No, No internet service) <br/>
**TechSupport:** Whether the customer has tech support or not (Yes, No, No internet service)<br/>
**StreamingTV:** Whether the customer has streaming TV or not (Yes, No, No internet service)<br/>
**StreamingMovies:** Whether the customer has streaming movies or not (Yes, No, No internet service)<br/>
**Contract:** The contract term of the customer (Month-to-month, One year, Two year)<br/>
**PaperlessBilling:** Whether the customer has paperless billing or not (Yes, No)<br/>
**PaymentMethod:** The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))<br/>
**MonthlyCharges:** The dollar amount charged to the customer monthly <br/>
**TotalCharges:** The total dollar amount charged to the customer <br/>
**target:** target variable. Did the customer leave in the past month? 

In [4]:
# Load the dataset via the helper defined above (reads dataset/Telcom.csv).
df = data_read_df()

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,target
0,4710-NKCAW,1,0,1,1,5,1,Yes,DSL,Yes,Yes,No,Yes,No,No,Month-to-month,0,Credit card (automatic),64.4,316.9,0
1,1264-FUHCX,0,0,1,0,49,0,No phone service,DSL,No,Yes,No,Yes,Yes,Yes,One year,1,Credit card (automatic),56.3,2780.6,0
2,2379-ENZGV,1,0,0,0,6,1,No,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,1,Electronic check,84.85,523.5,1
3,9831-BPFRI,0,0,1,1,39,1,Yes,Fiber optic,No,Yes,Yes,Yes,No,No,One year,1,Electronic check,89.55,3474.45,1
4,1100-DDVRV,1,0,1,0,17,0,No phone service,DSL,No,Yes,No,No,Yes,Yes,Month-to-month,1,Mailed check,49.8,836.35,0
