In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor
from autoviz.AutoViz_Class import AutoViz_Class
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

In [None]:
# Load the two raw tables from CSV files located in the working directory.
# `appli` holds the current applications, `prev` the previous applications.
appli = pd.read_csv('application_data.csv')
prev = pd.read_csv("previous_application.csv")

In [None]:
# Work on the first 50,000 rows of each table only.
appli = appli.head(50000)
prev = prev.head(50000)
prev.shape, appli.shape

In [None]:
# Percentage of missing values per column of `prev`, as a two-column frame
# (column name in "index", percentage in column 0).
# NOTE(review): the variable is named after application_data but is computed
# from `prev` - confirm which table was intended.
missing_pct = prev.isna().sum() * 100 / len(prev)
miss_application_data = missing_pct.to_frame().reset_index()
miss_application_data.head()

In [None]:
# Keep only the columns of `prev` whose share of missing values is below 30%.
prev_keep_mask = prev.isna().mean() < 0.30
prev = prev.loc[:, prev_keep_mask]
prev.shape

In [None]:
# Keep only the columns of `appli` whose share of missing values is below 30%.
appli_keep_mask = appli.isna().mean() < 0.30
appli = appli.loc[:, appli_keep_mask]
appli.shape

### Réencodage selon le type, possibilité de générer des graphiques

In [None]:
# Names of the categorical (object) columns of `prev`, excluding "type".
cate = [col for col in prev.select_dtypes(include=object).columns if col != "type"]

# Names of the numeric columns of `prev`, excluding the identifier and the target.
excluded_numeric = ['SK_ID_CURR', 'TARGET']
num = [col for col in prev.select_dtypes(include=np.number).columns
       if col not in excluded_numeric]

In [None]:
# Copy each table and tag its rows with the table they come from.
application_data_x = appli.assign(type="application_data")
previous_application_x = prev.assign(type="previous_application")

# Stack both tables row-wise, then keep only the rows that have a TARGET value.
# NOTE(review): if previous_application has no TARGET column, every one of its
# rows is removed by this filter - confirm this is intended.
data = pd.concat([application_data_x, previous_application_x]).dropna(subset=['TARGET'])

## À partir d'ici, plus de valeurs NaN (pour les performances du modèle)

In [None]:
# Separate the target from the predictors (the identifier is not a feature).
y = data['TARGET']
X = data.drop(columns=['SK_ID_CURR', 'TARGET'])

# 80/20 train/validation split with a fixed seed.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Remove every column that has at least one missing value in the training split;
# the same columns are removed from the validation split.
cols_with_missing = X_train_full.columns[X_train_full.isnull().any()].tolist()
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# Categorical columns with fewer than 10 distinct values (cheap to one-hot encode).
low_cardinality_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() < 10
]

# Columns stored as int64 or float64.
numerical_cols = [
    cname for cname, dtype in X_train_full.dtypes.items()
    if dtype in ['int64', 'float64']
]

# Restrict both splits to the selected columns.
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [None]:
# Boolean mask over the columns of X_train: True where the dtype is object.
s = X_train.dtypes.eq('object')
# Names of the object-typed columns, i.e. those that get one-hot encoded.
object_cols = s.index[s].tolist()

### Encoding

In [None]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training data and score it on the validation data.

    A RandomForestRegressor (100 trees, random_state=0) is fitted on
    (X_train, y_train) and used to predict X_valid.

    Returns:
        The mean absolute error between y_valid and the predictions.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    predictions = forest.fit(X_train, y_train).predict(X_valid)
    return mean_absolute_error(y_valid, predictions)

In [None]:
# One-hot encode the object columns. Categories seen only in the validation
# split are encoded as all-zero rows (handle_unknown='ignore').
# The encoder keeps its default sparse output and the result is densified with
# .toarray(): the `sparse=` keyword was renamed `sparse_output=` in newer
# scikit-learn releases, so passing either name ties the notebook to one
# version range.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_train_dense = OH_encoder.fit_transform(X_train[object_cols]).toarray()
OH_valid_dense = OH_encoder.transform(X_valid[object_cols]).toarray()

# Give each dummy column a string name "<source column>_<category>".
# Integer labels (0, 1, 2, ...) mixed with the string names of the numeric
# columns are rejected by recent scikit-learn estimators at fit time, and they
# make the later feature-importance output unreadable.
OH_names = [
    f"{col}_{cat}"
    for col, cats in zip(object_cols, OH_encoder.categories_)
    for cat in cats
]

# Re-attach the original row index, which a bare array would otherwise lose.
OH_cols_train = pd.DataFrame(OH_train_dense, index=X_train.index, columns=OH_names)
OH_cols_valid = pd.DataFrame(OH_valid_dense, index=X_valid.index, columns=OH_names)

# Numeric part of each split (the object columns are replaced by their dummies).
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Final design matrices: numeric columns followed by the dummy columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print("MAE from One-Hot Encoding:")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

In [None]:
# Stack the encoded train and validation parts back into one feature table (ff)
# and one target series (fff), in the same row order.
ff = pd.concat([OH_X_train, OH_X_valid])
fff = pd.concat([y_train, y_valid])
ff.shape, fff.shape

## Sélection des features importantes + prédiction

In [None]:
# Predictors in absolute value and the TARGET series.
X = ff.abs()
y = fff

# Score every feature against the target with the chi-squared statistic
# (k=10 is the number of features the selector would keep).
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# One row per feature: its name ("Specs") and its chi-squared score ("Score").
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(10, 'Score'))  # the 10 highest-scoring features

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Predictors in absolute value and the TARGET series.
X = abs(ff)
y = fff

# Rank the features with an extra-trees classifier.
# random_state is fixed so the ranking is identical on every run: later cells
# keep the 7 top features and map the widget inputs onto them by position, so
# a ranking that changes between runs would silently feed values into the
# wrong columns.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, y)

# Importance of each feature, indexed by feature name.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)

# Horizontal bar chart of the 7 most important features.
ax = feat_importances.nlargest(7).plot(kind='barh')
ax.set(title='Top 7 feature importances (ExtraTreesClassifier)', xlabel='Importance')
plt.show()

In [None]:
# Names of the 7 features with the highest importance, most important first.
Importantfeatures = feat_importances.nlargest(7).index.tolist()
Importantfeatures

In [None]:
# Restrict the predictors to the selected important features.
datafinal = X.loc[:, Importantfeatures]
datafinal

In [None]:
# Selected features and the target side by side, aligned on the row index.
creel = pd.concat([datafinal, y], axis="columns")
creel.head()

In [None]:
# Instantiate the AutoViz helper used for the automatic plots in the next cell.
AV = AutoViz_Class()

In [None]:
# Run AutoViz on the in-memory frame `creel` (empty filename, data passed via
# `dfte`) with TARGET declared as the dependent variable.
res = AV.AutoViz(filename = "",dfte = creel, depVar="TARGET")

In [None]:
# 80/20 train/validation split of the selected features.
# TARGET is removed from the feature matrix: `creel` contains the target
# column, so passing it unchanged would let the model read the answer directly
# (target leakage) and make the accuracy figures meaningless.
X_train, X_valid, y_train, y_valid = train_test_split(
    creel.drop(columns=['TARGET']), creel['TARGET'],
    train_size=0.8, test_size=0.2, random_state=0)


In [None]:
# Gradient-boosted tree classifier with the library's default hyper-parameters.
# This rebinds `model`, which previously held the extra-trees classifier.
model = XGBClassifier()

In [None]:
# Fit the classifier on the training split.
model.fit(X_train,y_train)

In [None]:
# Predicted class for every row of the training split.
predict_train = model.predict(X_train)

# Same predictions as a one-column frame (column 0), then only the rows
# predicted as class 1.
pred = pd.DataFrame(predict_train)
rslt_df = pred.loc[pred[0] == 1]
rslt_df.head()

In [None]:
# Share of training rows whose predicted class equals the true class.
trainaccuracy = accuracy_score(y_train,predict_train)
print('accuracy_score on train dataset over 1 : ', trainaccuracy)

In [None]:
# Predicted class for every row of the validation split.
predict_test = model.predict(X_valid)

In [None]:
# Share of validation rows whose predicted class equals the true class.
testaccuracy = accuracy_score(y_valid,predict_test)
testaccuracy

Simulation pour un individu aléatoire

In [None]:
import random

# Draw a random row position inside the validation split.
# randrange excludes the upper bound; random.randint(0, len(X_valid)) could
# return len(X_valid), which is one past the last valid position.
r = random.randrange(len(X_valid))
x = pd.DataFrame(X_valid)

# True label of the drawn individual, and the number of positive (TARGET == 1)
# rows in the full target series. The label is read from y_valid because `r`
# is a position in the validation split, not in the full series `y`.
y_valid.iloc[r], y[y == 1].sum()

# À améliorer : possibilité de saisie des informations par le créancier

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
def _labelled_input(label, input_cls, **input_kwargs):
    """Build a Box holding a text label followed by an enabled input widget.

    Args:
        label: Text shown in front of the input.
        input_cls: ipywidgets input class to instantiate
            (e.g. widgets.BoundedIntText).
        **input_kwargs: Keyword arguments passed to ``input_cls``
            (value, min, max, step).

    Returns:
        A widgets.Box whose ``children[0]`` is the Label and whose
        ``children[1]`` is the input widget; callers read the entered value
        from ``children[1].value``.
    """
    return widgets.Box([
        widgets.Label(value=label),
        input_cls(disabled=False, **input_kwargs),
    ])


AmtCreditWidg = _labelled_input(
    'Credit amount of the loan :',
    widgets.BoundedFloatText, value=100, min=0, max=1000000000.0, step=0.1)

IncomeWidg = _labelled_input(
    'Income of the client :',
    widgets.BoundedFloatText, value=100000, min=0, max=1000000000.0, step=0.1)

# The value is entered in days (default 8000) and is passed to the model
# without conversion, so the label asks for days rather than "how old".
DayBirthWidg = _labelled_input(
    'Age of the client in days :',
    widgets.BoundedIntText, value=8000, min=0, max=1000000000, step=1)

DayPublishWidg = _labelled_input(
    'How many days before the application did client change the identity document with which you applied for the loan :',
    widgets.BoundedIntText, value=2000, min=0, max=100000000, step=1)

DayRegistrationWidg = _labelled_input(
    'How many days before the application did client change his registration :',
    widgets.BoundedIntText, value=1500, min=0, max=100000000, step=1)

RelativPopWidg = _labelled_input(
    'Normalized population of region where you live :',
    widgets.BoundedFloatText, value=0.5, min=0, max=1, step=0.000001)

# An hour of the day lies in 0..23; the previous upper bound of 230000
# accepted meaningless values.
HourWidg = _labelled_input(
    'Approximately at what hour did you apply for the loan (h) :',
    widgets.BoundedIntText, value=12, min=0, max=23, step=1)


In [None]:
# Show the seven input widgets, one after the other.
for input_box in (
    DayBirthWidg,
    DayPublishWidg,
    DayRegistrationWidg,
    AmtCreditWidg,
    IncomeWidg,
    RelativPopWidg,
    HourWidg,
):
    display(input_box)

In [None]:
# Collect the values typed into the widgets (children[1] is the input field of
# each Box), in the order in which the widgets are displayed.
# The registration value is used as entered: its widget already asks for a
# number of days, so the former "* 365" inflated it by a factor of 365.
result = [
    DayBirthWidg.children[1].value,
    DayPublishWidg.children[1].value,
    DayRegistrationWidg.children[1].value,
    AmtCreditWidg.children[1].value,
    IncomeWidg.children[1].value,
    RelativPopWidg.children[1].value,
    HourWidg.children[1].value,
]
# Turn the list into a single-row frame with one column per input.
result = pd.DataFrame(result).transpose()
result

In [None]:
# Target series and feature table taken from `creel` (all rows).
y = creel['TARGET']
x = creel.drop(columns=['TARGET'])
x

In [None]:
# Label the single-row input frame with the feature names of `x`.
# The assignment is positional: the i-th widget value receives the i-th column
# name of `x`.
# NOTE(review): this assumes the widget order matches the column order of `x`
# - confirm against the selected features.
result.columns=x.columns
result

In [None]:
# Refit the classifier on every row of `creel` (features `x`, target `y`)
# before predicting for the values entered in the widgets.
model.fit(x,y)

In [None]:
# Predicted class for the single individual described by the widget inputs.
pred = model.predict(result)
pred