# In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split



# Load the credit-card default spreadsheet.
# NOTE(review): the first data row is dropped because it presumably holds a
# second header line (human-readable column names), and 'Unnamed: 0' looks
# like a row-id column -- confirm both against the .xls file.
df = pd.read_excel('default of credit card clients.xls')
df = df.drop(index=df.index[0]).drop(columns='Unnamed: 0')

models = [LogisticRegression(max_iter=10000), RandomForestClassifier(), KNeighborsClassifier()]

# Columns X1 and X5..X23 are standardised. Each name appears exactly once:
# listing a column twice makes the ColumnTransformer emit it twice, which
# double-weights it in distance-based and regularised models.
numeric_features = ['X1'] + [f'X{i}' for i in range(5, 24)]
# X3 and X4 are one-hot encoded.
# NOTE(review): X2 is in neither list, so with ColumnTransformer's default
# remainder='drop' it is excluded from the models -- confirm this is intended.
categorical_features = ['X3', 'X4']

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# NOTE(review): the same `preprocessor` object is passed to all three
# pipelines. Every pipeline is fitted on the same X_train below, so sharing
# it should be harmless here; use separate instances if that ever changes.
logistic = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(max_iter=10000))])
rf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', RandomForestClassifier())])

kn = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', KNeighborsClassifier())])

X = df.drop('Y', axis=1)
y = df['Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Cast the targets to int so the classifiers see integer class labels.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

rf.fit(X_train, y_train)
kn.fit(X_train, y_train)
logistic.fit(X_train, y_train)

# Predicted probability of the positive class (second column of predict_proba).
prob = logistic.predict_proba(X_test)[:,1]

def lift_chart(y_true, p_model):
    """Compute cumulative-gain curves and the normalised lift area.

    Samples are ranked by descending score. For every prefix size
    ``i = 1 .. n-1`` (``n = len(y_true)``) three values are produced:

    * model: number of positives among the ``i`` highest-scored samples,
    * naive: ``round(positive_rate * i)``, the count expected from random
      ordering,
    * ideal: ``min(i, total_positives)``, the count from a perfect ranking.

    The final point ``i = n`` is omitted; all three curves coincide there,
    so it contributes nothing to either area.

    Args:
        y_true: Binary labels (1 = positive). Any 1-D array-like; a pandas
            Series is read positionally, regardless of its index.
        p_model: Scores or probabilities for the positive class, aligned
            with ``y_true`` by position.

    Returns:
        Tuple ``(model_values, naive_model_values, ideal_model_values,
        lift_ratio)``. The first three are arrays of length ``n - 1``.
        ``lift_ratio`` is the area between the model and naive curves
        divided by the area between the ideal and naive curves; it is
        ``nan`` when that ideal area is zero (e.g. a single class or fewer
        than two samples).

    Raises:
        ValueError: If ``y_true`` and ``p_model`` differ in length.
    """
    # Convert to plain arrays so the fancy indexing below is positional.
    # Indexing a pandas Series with integer positions would be label-based
    # and fail (or pick wrong rows) on a shuffled index.
    labels = np.asarray(y_true)
    scores = np.asarray(p_model)
    if len(labels) != len(scores):
        raise ValueError(
            "y_true and p_model must have the same length "
            f"(got {len(labels)} and {len(scores)})"
        )

    order = np.argsort(scores)[::-1]
    y_sorted = labels[order]

    positive_instances = np.sum(labels)
    total_instances = len(labels)
    positive_rate = positive_instances / total_instances if total_instances else 0.0

    # Prefix sizes 1 .. n-1; a cumulative sum replaces re-summing each prefix.
    counts = np.arange(1, total_instances)
    model_values = np.cumsum(y_sorted)[:-1]
    naive_model_values = np.round(positive_rate * counts)
    ideal_model_values = np.minimum(counts, positive_instances)

    lift_area = np.sum(model_values - naive_model_values)
    ideal_area = np.sum(ideal_model_values - naive_model_values)
    # Avoid a 0/0 division when the ideal curve never departs from the naive one.
    lift_ratio = lift_area / ideal_area if ideal_area != 0 else float('nan')
    return model_values, naive_model_values, ideal_model_values, lift_ratio

def plot_lift(y_true, p_model, filename='lift_chart.pdf', label="logistic regression"):
    """Plot the model, naive and ideal cumulative-gain curves and save them.

    The curves come from ``lift_chart(y_true, p_model)``. The figure is
    written to ``filename`` in PDF format and then shown.

    Args:
        y_true: Binary labels, passed through to ``lift_chart``.
        p_model: Scores for the positive class, passed through to
            ``lift_chart``.
        filename: Output path for the saved figure. The file is always
            written as PDF, whatever the extension.
        label: Legend entry for the model curve.
    """
    # Imported here so the function works even when the surrounding module
    # has not bound `plt` at top level.
    import matplotlib.pyplot as plt

    real, naive, ideal, _ = lift_chart(y_true, p_model)
    # One x position per returned curve point (prefix sizes starting at 1).
    x = np.arange(1, len(real) + 1)
    plt.plot(x, real, label=label)
    plt.plot(x, naive, label="naive")
    plt.plot(x, ideal, label="ideal")
    plt.legend()
    plt.savefig(filename, format='pdf')
    plt.show()
