In [None]:
import pandas as pd
import numpy as np
from matplotlib.pyplot import show
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline 
from sklearn import preprocessing

def missing_values_table(df):
    """Summarise the missing values found in each column of ``df``.

    Prints a two-line summary (how many columns ``df`` has and how many of
    them contain missing values) and returns a table limited to the columns
    that have at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df``, with the columns
        ``'Missing Values'`` (null count) and ``'% of Total Values'``
        (null count as a percentage of ``len(df)``), sorted by percentage in
        descending order and rounded to one decimal.
    """
    # Count the nulls once; the percentage is derived from the same counts.
    mis_val = df.isnull().sum()

    n_rows = len(df)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        # A frame with zero rows has nothing missing; avoid the 0/0 -> NaN
        # result that a plain division would produce.
        mis_val_percent = mis_val.astype(float)

    # Assemble the two series side by side, naming the columns directly.
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'])

    # Keep only columns that really contain nulls. Filtering on the count
    # (instead of "percentage != 0") keeps NaN percentages from slipping
    # through, because NaN != 0 evaluates to True.
    mis_val_table_ren_columns = (
        mis_val_table[mis_val_table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns
    


def plot_single_var(df, var):
    """Show a count plot of ``df[var]`` with a percentage label on every bar.

    Bars follow the order of ``df[var].value_counts()``. The label on a bar
    is the bar height divided by ``len(df)``, formatted as a whole-number
    percentage and placed 3 data units above the top of the bar.

    Calls ``sns.set(style="darkgrid")`` before plotting and displays the
    figure with ``show()``; nothing is returned.
    """
    sns.set(style="darkgrid")

    n_rows = float(len(df))  # denominator for the percentage labels
    bar_order = df[var].value_counts().index
    axes = sns.countplot(x=var, data=df, order=bar_order)

    for bar in axes.patches:
        bar_top = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2.
        label_text = '{0:.0%}'.format(bar_top / n_rows)
        axes.text(label_x, bar_top + 3, label_text, ha="center")

    show()
    
def plot_category_compare(var,group,title,df):
    """Plot, per level of ``var``, the share of rows in each level of ``group``.

    Builds a cross-tabulation of ``df[var]`` against ``df[group]`` normalised
    over each row (``normalize='index'``), draws it as a stacked horizontal
    bar chart and writes the share of every segment, as a whole-number
    percentage, at the centre of that segment.

    Parameters
    ----------
    var : label
        Column of ``df`` whose levels become the bars.
    group : label
        Column of ``df`` whose levels become the stacked segments.
    title : str
        Title of the chart.
    df : pandas.DataFrame
        Source data.

    The figure is displayed with ``show()``; nothing is returned.
    """
    flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e","#708090", "#FFC0CB","#C71585",
          "#7B68EE",'#4169E1','#6495ED',]
    tb = pd.crosstab(index=df[var],  columns=[ df[group]], normalize='index')
    # The crosstab index already supplies the category axis. Passing it as
    # ``x`` is not a column label or position, which DataFrame.plot expects.
    ax = tb.plot(kind='barh', stacked=True, mark_right=True,
                 color=flatui[:tb.shape[1]], title=title)
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    for p in ax.patches:
        # For a horizontal bar the patch width is the plotted share; centre
        # the label on the segment using the patch's own geometry.
        ax.text(p.get_x() + p.get_width() / 2.,
                p.get_y() + p.get_height() / 2.,
                '{0:.0%}'.format(p.get_width()),
                ha="center")
    show()
    
    


In [None]:
    
from sklearn import linear_model
from sklearn import metrics
from sklearn import ensemble
from sklearn.preprocessing import scale, StandardScaler, Imputer, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix
from sklearn.model_selection import KFold # import KFold
def cross_val(cols, model, X=None, y=None):
    """Estimate ROC AUC and PR AUC of ``model`` with shuffled 5-fold CV.

    Parameters
    ----------
    cols : list of column labels
        Feature columns selected from the training frame for every fold.
    model : estimator
        Object exposing ``fit`` and ``predict_proba``; it is refit on every
        fold, so it is left fitted on the last training fold.
    X : pandas.DataFrame, optional
        Training features. Defaults to the notebook-level ``X_train``.
    y : pandas.Series or single-column pandas.DataFrame, optional
        Training labels. Defaults to the notebook-level ``y_train``.

    Returns
    -------
    list
        ``[mean ROC AUC, mean precision-recall AUC]`` over the folds.
    """
    # Fall back to the notebook globals so existing two-argument calls
    # keep working.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # A fresh 0..n-1 index lets the positional fold indices be used with
    # .loc. drop=True also copes with a named index, where the old
    # "reset then drop('index')" approach failed.
    features = X.reset_index(drop=True)
    target = np.asarray(y).ravel()

    kf = KFold(n_splits=5, random_state=1234, shuffle=True)  # 5 shuffled folds

    auc = []
    pr_auc = []
    for train_index, test_index in kf.split(features):
        X_tr, X_t = features.loc[train_index, cols], features.loc[test_index, cols]
        y_tr, y_t = target[train_index], target[test_index]
        model.fit(X_tr, y_tr)
        # Column 1 of predict_proba is used as the score for both curves.
        y_pred = model.predict_proba(X_t)[:, 1]
        fpr, tpr, _ = metrics.roc_curve(y_t, y_pred)
        precision, recall, _ = metrics.precision_recall_curve(y_t, y_pred)
        auc.append(metrics.auc(fpr, tpr))
        # metrics.auc accepts a monotonically decreasing x (recall); the
        # former ``reorder`` keyword is no longer part of its signature.
        pr_auc.append(metrics.auc(recall, precision))
    return [np.mean(auc), np.mean(pr_auc)]

def pred_model_res(X_train, y_train,X_test, y_test, model):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for scoring.
    model : estimator
        Object exposing ``fit`` and ``predict_proba``.

    Returns
    -------
    list
        ``[ROC AUC, precision-recall AUC]`` computed from column 1 of
        ``predict_proba`` on ``X_test``.
    """
    model = model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    precision, recall, _ = metrics.precision_recall_curve(y_test, y_pred)
    # metrics.auc accepts a monotonically decreasing x (recall); the former
    # ``reorder`` keyword is no longer part of its signature.
    return [metrics.auc(fpr, tpr), metrics.auc(recall, precision)]

def search_model(x_train, y_train, est, param_grid, n_jobs = -1, cv = 5, refit=False):
    """Run a ROC-AUC grid search over ``param_grid`` and report the winner.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    est : estimator
        Base estimator to tune.
    param_grid : dict or list of dict
        Grid handed to ``GridSearchCV``.
    n_jobs : int, default -1
        Forwarded to ``GridSearchCV``.
    cv : int or CV splitter, default 5
        Forwarded to ``GridSearchCV``.
    refit : bool, default False
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object. Its best score and best parameter set are
        also printed.
    """
    # ``iid`` is no longer accepted by GridSearchCV, so it is not passed.
    model = GridSearchCV(estimator = est,
                         param_grid = param_grid,
                         scoring = 'roc_auc',
                         verbose = 50,
                         n_jobs = n_jobs,
                         refit = refit,
                         cv = cv)
    # Fit Grid Search Model
    model.fit(x_train, y_train)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)
    return model
    
    
def cross_val_select(X_train_trans, y_train_trans, model):
    """Estimate ROC AUC and PR AUC of ``model`` with shuffled 10-fold CV.

    Parameters
    ----------
    X_train_trans
        Training features, indexed by row with integer arrays
        (``X_train_trans[train_index]``).
        NOTE(review): presumably a NumPy array or sparse matrix produced by
        a transformer -- confirm against the callers.
    y_train_trans
        Training labels aligned row-for-row with ``X_train_trans``.
    model : estimator
        Object exposing ``fit`` and ``predict_proba``; it is refit on every
        fold, so it is left fitted on the last training fold.

    Returns
    -------
    list
        ``[mean ROC AUC, mean precision-recall AUC]`` over the folds.
    """
    # Use the labels that were passed in; reading the notebook-level
    # ``y_train`` here would silently ignore the ``y_train_trans`` argument.
    target = np.asarray(y_train_trans).ravel()

    kf = KFold(n_splits=10, random_state=1234, shuffle=True)  # 10 shuffled folds
    auc = []
    pr_auc = []
    for train_index, test_index in kf.split(X_train_trans):
        X_tr, X_t = X_train_trans[train_index], X_train_trans[test_index]
        y_tr, y_t = target[train_index], target[test_index]
        model.fit(X_tr, y_tr)
        # Column 1 of predict_proba is used as the score for both curves.
        y_pred = model.predict_proba(X_t)[:, 1]
        fpr, tpr, _ = metrics.roc_curve(y_t, y_pred)
        precision, recall, _ = metrics.precision_recall_curve(y_t, y_pred)
        auc.append(metrics.auc(fpr, tpr))
        # metrics.auc accepts a monotonically decreasing x (recall); the
        # former ``reorder`` keyword is no longer part of its signature.
        pr_auc.append(metrics.auc(recall, precision))
    return [np.mean(auc), np.mean(pr_auc)]