In [1]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import missingno as mi
from IPython.core.interactiveshell import InteractiveShell

# Make IPython display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Persistently widen pandas' display limits for the whole notebook.
# `pd.option_context(...)` only takes effect inside a `with` block; called on
# its own it changes nothing, so `pd.set_option` is used instead.
# Columns are unlimited so wide frames show every column. Rows are capped
# rather than unlimited because later cells display Series with several
# hundred thousand rows, which would flood the notebook if fully rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

<pandas._config.config.option_context at 0x18c74623e50>

In [2]:
# Folder that holds the project's data files.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing this line.
path = r'C:\Users\nwenz\Desktop\P7_scoring/'

# Read the processed table into memory.
processed_csv = path + 'df_processed.csv'
df = pd.read_csv(processed_csv)

In [3]:
# Rows that carry a TARGET value form the training set; rows whose TARGET is
# missing form the test set.
has_target = df['TARGET'].notna()
train_df = df.loc[has_target]
test_df = df.loc[~has_target]

In [4]:
# Sanity check: (rows, columns) of the training part, the test part and the
# full table, in that order.
for frame in (train_df, test_df, df):
    print(frame.shape)

(307507, 799)
(48744, 799)
(356251, 799)


In [5]:
# Boolean mask of missing TARGET values on the training rows.
train_df['TARGET'].isna()

0         False
1         False
2         False
3         False
4         False
          ...  
307502    False
307503    False
307504    False
307505    False
307506    False
Name: TARGET, Length: 307507, dtype: bool

In [6]:
# Total number of non-missing TARGET values: value_counts() skips NaN by
# default, so summing the per-class counts gives the labelled row count.
train_df['TARGET'].value_counts().sum()

307507

In [7]:
# (rows, columns) of the training frame.
train_df.shape

(307507, 799)

In [8]:
# Number of missing values in each column of the training frame.
train_df.isnull().sum()

Unnamed: 0                               0
index                                    0
SK_ID_CURR                               0
TARGET                                   0
CODE_GENDER                              0
                                     ...  
CC_NAME_CONTRACT_STATUS_nan_MAX     220602
CC_NAME_CONTRACT_STATUS_nan_MEAN    220602
CC_NAME_CONTRACT_STATUS_nan_SUM     220602
CC_NAME_CONTRACT_STATUS_nan_VAR     221234
CC_COUNT                            220602
Length: 799, dtype: int64

In [9]:
# Keep only the columns that have no missing value on the training rows, and
# remember their names so the test frame can be aligned to them.
train_df = train_df.dropna(axis='columns').copy()
train_columns = train_df.columns.tolist()

In [10]:
# Display the names of the columns kept in the training frame.
train_columns

['Unnamed: 0',
 'index',
 'SK_ID_CURR',
 'TARGET',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20'

In [11]:
# Restrict the test frame to the columns that survived in the training frame.
# NOTE(review): completeness was only checked on the training rows, so the
# kept columns may still contain missing values in the test rows; verify.
shared_columns = test_df.columns.intersection(train_columns)
test_df = test_df[shared_columns]

# Both frames should now have the same number of columns.
for frame in (train_df, test_df):
    print(np.shape(frame))

(307507, 183)
(48744, 183)


In [12]:
# Independent copy of the label column; the length is printed before and after
# removing missing values to confirm that nothing gets dropped.
target = train_df['TARGET'].copy()
print(len(target))
target = target.dropna()
print(len(target))

307507
307507


In [13]:
# (rows, columns) of the training frame after the column filtering.
train_df.shape

(307507, 183)

In [14]:
# Column names of the training frame as a plain Python list.
train_df.columns.tolist()

['Unnamed: 0',
 'index',
 'SK_ID_CURR',
 'TARGET',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20'

In [15]:
# Remove the label column from the test frame; it only holds missing values
# there, because the test rows were selected as the rows with a null TARGET.
# The result is rebound instead of dropped in place: `test_df` was produced by
# slicing another frame, and in-place mutation of such a slice can raise
# SettingWithCopyWarning. `errors='ignore'` keeps the cell re-runnable once the
# column is already gone.
test_df = test_df.drop(columns='TARGET', errors='ignore')

In [16]:
# Display the column index of the test frame after TARGET has been removed.
test_df.columns

Index(['Unnamed: 0', 'index', 'SK_ID_CURR', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'REGION_POPULATION_RELATIVE',
       ...
       'WALLSMATERIAL_MODE_Block', 'WALLSMATERIAL_MODE_Mixed',
       'WALLSMATERIAL_MODE_Monolithic', 'WALLSMATERIAL_MODE_Others',
       'WALLSMATERIAL_MODE_Panel', 'WALLSMATERIAL_MODE_Stone, brick',
       'WALLSMATERIAL_MODE_Wooden', 'EMERGENCYSTATE_MODE_No',
       'EMERGENCYSTATE_MODE_Yes', 'INCOME_CREDIT_PERC'],
      dtype='object', length=182)

In [17]:
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

# Identifier-like columns that must not be used as model features.
# NOTE(review): 'Unnamed: 0' and 'index' look like row counters written out
# with the CSV and 'SK_ID_CURR' like a record ID; confirm against the
# preprocessing step. SMOTE and ENN are nearest-neighbour based, so such
# large-valued, meaningless columns would distort the distances they rely on.
ID_COLUMNS = ['Unnamed: 0', 'index', 'SK_ID_CURR']

# Combined over-sampling (SMOTE) and cleaning (edited nearest neighbours),
# seeded for reproducibility.
smote_enn = SMOTEENN(random_state=0)

# Feature matrix (label and identifier columns removed) and label vector.
X = train_df.drop(columns=['TARGET', *ID_COLUMNS])
y = train_df.TARGET.copy()

# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt;
# resampling the full training frame appears to be very slow. Consider trying
# it on a sample first.
X_resampled, y_resampled = smote_enn.fit_resample(X, y)


KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

# Baseline classifiers, each created with its default hyper-parameters:
# logistic regression, Gaussian naive Bayes and a linear SVM.
lr, gb, svc = LogisticRegression(), GaussianNB(), LinearSVC()
#lr.fit(X_resampled,y_resampled)

In [None]:
# Five-fold cross-validated score of the Gaussian naive Bayes model on the
# resampled data, using the estimator's default scoring.
# NOTE(review): the data was resampled before the folds are cut, so resampled
# points can end up on both sides of a split and the scores are probably
# optimistic; consider resampling inside each fold instead.
cross_val_score(estimator=gb, X=X_resampled, y=y_resampled, cv=5)

In [None]:
# Summary statistics for the columns of the training frame.
train_df.describe()

In [None]:
# Five-fold cross-validated score of the Gaussian naive Bayes model on the
# original (not resampled) data, using the estimator's default scoring.
# NOTE(review): for a classifier the default score is accuracy, which can look
# good on an imbalanced target even for a trivial model; confirm the metric.
cross_val_score(estimator=gb, X=X, y=y, cv=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for evaluation.
# `random_state` pins the shuffle so the split, and every metric computed from
# it, is the same on each run. `stratify=y` keeps the class proportions of the
# label identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0, stratify=y
)

In [None]:
# Fit the Gaussian naive Bayes model on the training part, then predict class
# labels for the held-out part. `fit` returns the fitted estimator itself.
fitted_nb = gb.fit(X_train, y_train)
y_pred = fitted_nb.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the held-out predictions: rows are the true classes,
# columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)