In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import sklearn.metrics as metrics
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import KernelPCA

In [2]:
def plot_roc_curve(y_true, y_pred):
    fpr, tpr, threshold = metrics.roc_curve(y_true, y_pred)
    roc_auc = metrics.auc(fpr, tpr)
    plt.figure(figsize=(10, 10))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [1, 0], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [3]:
def auc(y_true, y_pred):
    fpr, tpr, threshold = metrics.roc_curve(y_true, y_pred)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [4]:
def lr_na(df):
    X_train = df.copy()
    X_train.dropna(inplace = True)
    df_find = df.loc[:, list(set(df.columns))]
    targets = []
    for i in df_find.columns:
        if len(df_find[df_find[i].isna()]) != 0:
            targets.append(i)            
    for i in targets:
        X_train_target = X_train.loc[:, list(set(X_train.columns) - set(targets))]   
        Y_train_target = X_train[i]
        lr = LinearRegression()
        lr.fit(X_train_target, Y_train_target)
        df_test = df_find[df_find[i].isna()]
        X_test = df_test.loc[:, list(set(df_test.columns) - set(targets))]
        X_test = X_test[X_train_target.columns]
        Y_test = lr.predict(X_test)
        df_find.loc[df_find[i].isna(), i] = Y_test.ravel()
#       targets.remove(i)     
    df = df_find.copy()
    return df

In [5]:
def PLS_na(df):
    X_train = df.copy()
    X_train.dropna(inplace = True)
    df_find = df.loc[:, list(set(df.columns))]
    targets = []
    for i in df_find.columns:
        if len(df_find[df_find[i].isna()]) != 0:
            targets.append(i)  
    current_targets = targets.copy()
    
    for i in targets:
        X_train_target = X_train.loc[:, list(set(X_train.columns) - set(current_targets))]   
        Y_train_target = X_train[i]   
        pls = PLSRegression(n_components = 5)
        pls.fit(X_train_target, Y_train_target)
        df_test = df_find[df_find[i].isna()]
        X_test = df_test.loc[:, list(set(df_test.columns) - set(current_targets))]
        X_test = X_test[X_train_target.columns]
        Y_test = pls.predict(X_test)
        df_find.loc[df_find[i].isna(), i] = Y_test.ravel()
        current_targets.remove(i)     
    df = df_find.copy()
    return df

In [6]:
file_path = 'C:/Users/Frederik/Desktop/TrainSample.csv'
df = pd.read_csv(file_path)
y_train = df["INS"].to_frame()

df1 = df
cat_vars=['BRANCH','RES'] 
for var in cat_vars: 
    cat_list='var'+'_'+var 
    cat_list = pd.get_dummies(df1[var], prefix=var) 
    hr1=df1.join(cat_list) 
    df1=hr1
df = df1.loc[:, list(set(df1.columns) - {'BRANCH','RES', "INS", "id"})].copy()

# df = lr_na(df)
# df = PLS_na(df)

for i in df.columns[df.isnull().any(axis=0)]:
    df[i].fillna(df[i].mean(),inplace=True)

In [7]:
file_path_test = 'C:/Users/Frederik/Desktop/TestSample.csv'

df_test = pd.read_csv(file_path_test)

y_test = df_test["INS"].to_frame()

df1 = df_test
cat_vars=['BRANCH','RES'] 
for var in cat_vars: 
    cat_list='var'+'_'+var 
    cat_list = pd.get_dummies(df1[var], prefix=var) 
    hr1=df1.join(cat_list) 
    df1=hr1
df_test = df1.copy()
df_test = df_test.loc[:, list(set(df_test.columns) - {"BRANCH", "RES", "id", "INS"})]

# df_test = PLS_na(df_test)
# df_test = lr_na(df_test)

for i in df_test.columns[df_test.isnull().any(axis=0)]:
    df_test[i].fillna(df_test[i].mean(),inplace=True)

df_test = df_test[df.columns]

In [8]:
df_bin = pd.DataFrame()
df_bin_test = pd.DataFrame()

df_not_bin = pd.DataFrame()
df_not_bin_test = pd.DataFrame()
for i in df.columns:
    if len(df.loc[df[i] == 0, i]) + len(df.loc[df[i] == 1, i]) == len(df[i]):
        df_bin[i] = df[i]
        df_bin_test[i] = df_test[i]
    else:
        df_not_bin[i] = df[i]
        df_not_bin_test[i] = df_test[i]

In [9]:
# scaler = StandardScaler()  
scaler = MinMaxScaler()
scaler.fit(df_not_bin)

X_train_not_bin = scaler.transform(df_not_bin)
X_test_not_bin = scaler.transform(df_not_bin_test)

poly_reg = PolynomialFeatures(degree = 1)
X_train = poly_reg.fit_transform(X_train_not_bin) 
X_test = poly_reg.transform(X_test_not_bin)

X_train_bin = df_bin.values
X_test_bin = df_bin_test.values

X_train_fin = np.concatenate((X_train, X_train_bin), axis=1)
X_test_fin = np.concatenate((X_test, X_test_bin), axis=1)

In [10]:
pca_model = PCA()
pca_model.fit(X_train_fin)
X_train = pca_model.transform(X_train_fin)
X_test = pca_model.transform(X_test_fin)

best_feats = None
features = list(range(X_train.shape[1]))
Scores = []

while len(features) > 0:
    model = sm.Logit(y_train.values.ravel(), X_train[:, features]).fit()
    L_score = auc(y_train, model.predict(X_train[:, features]))
    Scores.append(L_score)
    if max(Scores) == L_score:
        best_feats = features.copy()
    model = sm.Logit(y_train.values.ravel(), X_train[:, features])
    res = model.fit()
    worse_feature = features[np.argmax(res.pvalues)]
    features.remove(worse_feature)

         Current function value: 0.577826
         Iterations: 35
         Current function value: 0.577826
         Iterations: 35




         Current function value: 0.577826
         Iterations: 35
         Current function value: 0.577826
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.577826
         Iterations 7




Optimization terminated successfully.
         Current function value: 0.577826
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.577826
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.577826
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.577829
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.577829
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.577833
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.577833
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.577837
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.577837
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.577846
  

In [11]:
clf = sm.Logit(y_train.values.ravel(), X_train[:, best_feats]).fit(method = "newton")
L_score_train = auc(y_train, clf.predict(X_train[:, best_feats]))
L_score_test = auc(y_test, clf.predict(X_test[:, best_feats]))

print(L_score_train)
print(L_score_test)

Optimization terminated successfully.
         Current function value: 0.577833
         Iterations 7
0.790915359447866
0.7700227686703096


In [21]:
clf.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,5309.0
Model:,Logit,Df Residuals:,5246.0
Method:,MLE,Df Model:,62.0
Date:,"Sun, 17 Apr 2022",Pseudo R-squ.:,0.1029
Time:,21:45:16,Log-Likelihood:,-3067.7
converged:,True,LL-Null:,-3419.5
Covariance Type:,nonrobust,LLR p-value:,1.6210000000000002e-109

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.2716,0.051,5.351,0.000,0.172,0.371
x2,0.6776,0.055,12.363,0.000,0.570,0.785
x3,0.2580,0.058,4.477,0.000,0.145,0.371
x4,1.1800,0.066,17.972,0.000,1.051,1.309
x5,-0.1928,0.065,-2.968,0.003,-0.320,-0.065
x6,0.0835,0.067,1.243,0.214,-0.048,0.215
x7,0.2585,0.074,3.483,0.000,0.113,0.404
x8,-0.1522,0.077,-1.972,0.049,-0.303,-0.001
x9,1.0263,0.091,11.291,0.000,0.848,1.204
