# User's Creditworthiness Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.compose import make_column_transformer
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sn
from keras.models import Sequential
from keras.layers import Dense

### Loading Data

In [2]:
# OPENING RAW DATAFRAME
df = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v4_/raw_data.csv')

### Preparing Data

In [3]:
# CLEANING AGE AND EXP VALUES
df['age'] = np.where(((df.age < 18)|(df.age>65)),np.nan,df.age)
df['exp'] = np.where(((df.exp < 0)|(df.exp>47)),np.nan,df.exp)
df = df.dropna(axis = 'rows', subset = ['age','exp'])

# CALCULATING AGE AT WHICH DRIVER'S LICENSE STARTED
df['driving_start_age'] = df.age - df.exp

# EXTRACTING REGION CODE FROM PASSPORT DEPARTMENT CODE
df['pdc'] = '<'+df.PassportDepartmentCode.str[:2]+'>'

# REMOVING NOT APPLICABLE DATA
df = df[df.invoice_status != 'not applicable']

# CODING TARGET VALUES WITH 0 OR 1
df['target'] = np.where(df.invoice_status == 'not_paid', 1, 0)

### Preparing and Organizing Categorical Dataframes

In [4]:
# PREPARING DEVICES DATAFRAME
devices_lib = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/devices_lib.csv')

In [5]:
# PREPARING KBM DATAFRAME
df['kbm_grouped'] = np.where(df['kbm']<0.7,'0.5+',\
                              np.where(df['kbm']<0.8,'0.7+',\
                                       np.where(df['kbm']<0.9,'0.8+',\
                                                np.where(df['kbm']<1,'0.9+',\
                                                         np.where(df['kbm'] == 1, '1',\
                                                                  np.where(df['kbm']<2.3,'1.4+',\
                                                                           np.where(df['kbm']>=2.3,'2.3+','?')))))))
df = df.replace('?', np.NaN)

In [6]:
# PREPARING BIRTH PLACE REGION DATAFRAME
bp = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/birthplaces_lib.csv')
bp = bp[['PassportBirthPlace','country','region']]
bp = bp.replace('None', np.nan)
bp = bp.dropna(axis='rows')
bp = bp.drop_duplicates(subset = ['PassportBirthPlace'])

# BIRTH PLACES LIB
bp_clsfied = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/birthplaces_classified_lib.csv')

# COUNTRIES
countries = bp_clsfied.drop_duplicates(subset = 'bp_country')
countries = pd.concat([countries.iloc[1:2],countries.iloc[3:]], axis='rows')
countries = countries[['bp_country', 'bp_region_group_detailed']]

# MERGING
bp = pd.merge(bp, bp_clsfied, left_on = 'region', right_on = 'bp_region_group_detailed', how = 'left')
bp = pd.merge(bp, countries, left_on = 'country', right_on = 'bp_country', how = 'left')

bp['bp_region_group_detailed'] = np.where(pd.isnull(bp.bp_region_group_detailed_x) == True,bp.bp_region_group_detailed_y,\
                                            bp.bp_region_group_detailed_x)
bp = bp[['PassportBirthPlace', 'bp_region_group_detailed']]

In [7]:
# PREPARING PASSPORT DEPARTMENT CODES
pdc = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/psp_regions_codes_lib.csv')

In [8]:
# PREPARING MOBILE OPERATORS DATAFRAME
mob = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/mobile_codes_lib.csv')

In [9]:
# PREPARING LICENSE CATEGORY DATAFRAME
lcns = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/license_cat_lib.csv')

In [10]:
# PREPARING DELIMOBIL REGION DATAFRAME
deli_regions = pd.DataFrame({'deli_region_en':['St. Petersburg', 'Moscow', 'Krasnodar', 'Yekaterinburg', 'Tula', 'Novosibirsk',\
                                     'Samara', 'Nizhny Novgorod'],\
                             'deli_region_ru':['Санкт-Петербург','Москва','Краснодарский край', 'Свердловская область',\
                                            'Тульская область', 'Новосибирская область', 'Самарская область',\
                                            'Нижегородская область']})

In [11]:
# LOADING CITIZENSHIP DATAFRAME
cz = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/user_citizenship.csv')

### Enriching Original Dataframe with Categorical Data

In [12]:
# DATA ENRICHMENT
df = pd.merge(df, devices_lib, left_on = 'device_type', right_on = 'device', how = 'left')
df = pd.merge(df, bp, left_on = 'birth_place', right_on = 'PassportBirthPlace', how = 'left')
df = pd.merge(df, mob, on = 'mobile_code', how = 'left')
df = pd.merge(df, lcns, on = 'license_category', how = 'left')
df = pd.merge(df, pdc, left_on = 'pdc', right_on = 'psp_dep_region_code', how = 'left')
df = pd.merge(df, deli_regions, left_on = 'region_name_en', right_on = 'deli_region_en', how = 'left')
df = pd.merge(df, cz, on = 'user_id')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16554 entries, 0 to 16553
Data columns (total 49 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   user_id                        16554 non-null  int64  
 1   login                          16554 non-null  int64  
 2   age                            16554 non-null  float64
 3   exp                            16554 non-null  float64
 4   birth_place                    12469 non-null  object 
 5   kbm                            13075 non-null  float64
 6   sex                            16554 non-null  object 
 7   device_type                    14254 non-null  object 
 8   region_name_en                 16554 non-null  object 
 9   mobile_code                    16554 non-null  int64  
 10  license_category               12381 non-null  object 
 11  PassportDepartmentCode         5635 non-null   object 
 12  PassportRegistration           5830 non-null  

### Calculating Features

In [13]:
# CALCULATING AGE OF DEVICE MODEL ON THE USER'S ACTIVATION DATE
df['thld_year'] = pd.DatetimeIndex(df.threshold_timestamp).year
df['device_age_at_thld_date']  = df.thld_year-df.device_release_year
df.device_age_at_thld_date = df.device_age_at_thld_date.astype('str')

# DOES USER MOVE AFTER BIRTH OR NOT
df['moved_after_birth'] = np.where(((df.bp_region_group_detailed.isnull())|(df.psp_region.isnull())),np.nan,\
                                   np.where(df.bp_region_group_detailed != df.psp_region, 'moved', 'stayed'))

# DOES USER USE DELI OUT OF A REGION WHERE HE GOT PASSPORT
df['deli_out_of_psp_region'] = np.where(((df.deli_region_ru.isnull())|(df.psp_region.isnull())),np.nan,\
                                   np.where(df.deli_region_ru != df.psp_region, 'moved', 'stayed'))

# KBM CORRESPONDING TO MIN VEHICLE OWNING PERIOD
legend = pd.DataFrame({'kbm':[2.45, 2.3, 1.55, 1.4, 1.0, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5],\
                       'min_vhcl_owning_period':[1, 1, 1, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
df = pd.merge(df, legend, on = 'kbm', how = 'left')
df['min_vhcl_owning_period'] = df['min_vhcl_owning_period'].fillna(0)

# MOBILE OS
df['mobile_os'] = np.where(df.brand == 'APPLE', 'iOS',\
                          np.where(df.brand.isnull(), np.nan, 'Android'))

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16554 entries, 0 to 16553
Data columns (total 55 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   user_id                        16554 non-null  int64  
 1   login                          16554 non-null  int64  
 2   age                            16554 non-null  float64
 3   exp                            16554 non-null  float64
 4   birth_place                    12469 non-null  object 
 5   kbm                            13075 non-null  float64
 6   sex                            16554 non-null  object 
 7   device_type                    14254 non-null  object 
 8   region_name_en                 16554 non-null  object 
 9   mobile_code                    16554 non-null  int64  
 10  license_category               12381 non-null  object 
 11  PassportDepartmentCode         5635 non-null   object 
 12  PassportRegistration           5830 non-null  

### Testing Logistic Regression on Reg Data

In [14]:
# LEAVING ONLY COLUMNS THAT'LL BE USED FOR THE MODEL
features = ['mobile_operator', 'sex', 'age', 'exp', 'bp_region_group_detailed', 'kbm_grouped', 'brand',\
            'device_age_at_thld_date', 'device_feature', 'region_name_en', 'license_category_grouped', 'driving_start_age',\
            'moved_after_birth', 'deli_out_of_psp_region', 'mobile_os', 'passport_citizenship', 'min_vhcl_owning_period']
target = ['target']
df = df[features+target]

# REPLACING NA VALUES WITH 'NaN'
df = df.replace('nan', np.nan)
for feature in ['mobile_operator', 'sex', 'bp_region_group_detailed', 'kbm_grouped', 'brand','device_age_at_thld_date',\
                'device_feature', 'region_name_en', 'license_category_grouped',\
                'moved_after_birth', 'deli_out_of_psp_region', 'mobile_os', 'passport_citizenship']:
    df[feature] = df[feature].fillna('NaN')
    
# SPLITTING DATASET INTO X AND y
df = df.reset_index(drop=True)
X = df.iloc[:,:-1]
y = df.iloc[:,-1:]

# ONE-HOT ENCODING
enc = preprocessing.OneHotEncoder()
X_obj = X.loc[:, X.dtypes == object]
X_flt = X.loc[:, X.dtypes == float]
enc.fit(X_obj)
X = pd.DataFrame(enc.transform(X_obj).toarray())
X = X.join(X_flt)

# SPLITTING X AND y TO TRAIN AND TEST SAMPLES
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# CONVERTING X AND y DATAFRAMES TO ARRAYS
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

#LogisticRegression
logreg = LogisticRegression(solver='liblinear',class_weight='balanced',random_state=0,C=0.01).fit(X_train, y_train.ravel())
y_pred = logreg.predict(X_test)

y_pred_proba = logreg.predict_proba(X_test)
y_pred_proba1 = []
for i in y_pred_proba:
    y_pred_proba1.append(i[1])
    
print(classification_report(y_test,y_pred))
print(roc_auc_score(y_test, y_pred_proba1))

              precision    recall  f1-score   support

           0       0.75      0.59      0.66      2179
           1       0.44      0.62      0.52      1132

    accuracy                           0.60      3311
   macro avg       0.60      0.61      0.59      3311
weighted avg       0.65      0.60      0.61      3311

0.6444992921510662


In [19]:
y['1']  = 'count'
y.groupby('target').count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,1
target,Unnamed: 1_level_1
0,10948
1,5606


In [26]:
coefs = pd.DataFrame(logreg.coef_)
coefs.columns = list(enc.get_feature_names())+['age','exp', 'driving_start_age', 'min_vhcl_owning_period']

# BRINGING AGE AND EXP COEFS TO COMPARABLE FORMAT
# coefs.age_std = np.std(X.age)
# coefs.exp_std = np.std(X.exp)
# coefs.age = coefs.age*coefs.age_std
# coefs.exp = coefs.exp*coefs.exp_std

coefs = coefs.T
print(coefs.to_string())
coefs.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v4_/feature_importance.csv', encoding = 'utf-8-sig')

                                             0
x0_NaN                               -0.036834
x0_Билайн                            -0.005246
x0_Др.                               -0.097892
x0_МТС                               -0.129354
x0_Мегафон                           -0.028924
x0_Теле2                              0.163183
x1_Ж                                 -0.282492
x1_М                                  0.147425
x2_NaN                                0.213149
x2_Алтайский край                    -0.001382
x2_Амурская область                   0.008825
x2_Армения                            0.125899
x2_Архангельская область             -0.032978
x2_Астраханская область               0.004876
x2_Беларусь                          -0.025343
x2_Белгородская область               0.026440
x2_Брянская область                   0.012653
x2_Владимирская область              -0.038126
x2_Волгоградская область              0.049852
x2_Вологодская область                0.012055
x2_Воронежска

In [20]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,162,163,164,165,166,167,age,exp,driving_start_age,min_vhcl_owning_period
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,25.930185,1.059548,24.870637,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,35.953457,12.035592,23.917864,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,38.105407,18.617385,19.488022,10.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,34.729637,14.951403,19.778234,2.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,27.846680,9.448323,18.398357,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16549,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,23.039014,2.737851,20.301164,0.0
16550,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,24.566735,2.595483,21.971253,2.0
16551,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,34.565366,6.042437,28.522930,0.0
16552,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,41.598905,13.409993,28.188912,7.0


### Decision Tree

In [15]:
tree = DecisionTreeClassifier(criterion='entropy', max_features = 'auto', class_weight='balanced',\
                              random_state=0).fit(X_train, y_train)
y_pred = tree.predict(X_test)
y_pred_proba = tree.predict_proba(X_test)

y_pred_proba1 = []
for i in y_pred_proba:
    y_pred_proba1.append(i[1])
    
print(roc_auc_score(y_test, y_pred_proba1))

0.5286483725015608


### Decision Tree with Bagging

In [16]:
tree_bagged = BaggingClassifier(base_estimator = tree, random_state=0).fit(X_train, y_train.ravel())
y_pred = tree_bagged.predict(X_test)
y_pred_proba = tree_bagged.predict_proba(X_test)

y_pred_proba1 = []
for i in y_pred_proba:
    y_pred_proba1.append(i[1])
    
print(roc_auc_score(y_test, y_pred_proba1))

0.5845706380042968


### Random Forest

In [17]:
rfc = RandomForestClassifier(n_estimators = 500, max_depth = 10, class_weight = 'balanced',\
                             random_state=0).fit(X_train, y_train.ravel())
y_pred = rfc.predict(X_test)
y_pred_proba = rfc.predict_proba(X_test)

y_pred_proba1 = []
for i in y_pred_proba:
    y_pred_proba1.append(i[1])
    
print(roc_auc_score(y_test, y_pred_proba1))

0.6527798489424497


### Random Forest with Bagging

In [18]:
rfc_bagged = BaggingClassifier(base_estimator = rfc, random_state=0).fit(X_train, y_train.ravel())
y_pred = rfc_bagged.predict(X_test)
y_pred_proba = rfc_bagged.predict_proba(X_test)

y_pred_proba1 = []
for i in y_pred_proba:
    y_pred_proba1.append(i[1])
    
print(roc_auc_score(y_test, y_pred_proba1))

0.6502443337031386


### Gradient Boosting

In [429]:
gbc = GradientBoostingClassifier(random_state=0).fit(X_train, y_train.ravel())
y_pred = gbc.predict(X_test)
y_pred_proba = gbc.predict_proba(X_test)

y_pred_proba1 = []
for i in y_pred_proba:
    y_pred_proba1.append(i[1])
    
print(roc_auc_score(y_test, y_pred_proba1))

0.6577599341057141


### Grid Search

In [395]:
parameters = {'learning_rate':[0.1, 0.2], 'n_estimators':[100, 200], 'max_depth':[1, 2] }
grid = GridSearchCV(gbc, parameters).fit(X_train, y_train.ravel())

y_pred = grid.predict(X_test)
y_pred_proba = grid.predict_proba(X_test)

y_pred_proba1 = []
for i in y_pred_proba:
    y_pred_proba1.append(i[1])
    
print(roc_auc_score(y_test, y_pred_proba1))

0.6467889481430417


### Voting Classifier

In [399]:
vc = VotingClassifier(estimators = [('lr', logreg),('rfc', rfc),('gbc', gbc)], voting = 'soft').fit(X_train, y_train.ravel())

y_pred = vc.predict(X_test)
y_pred_proba = vc.predict_proba(X_test)

y_pred_proba1 = []
for i in y_pred_proba:
    y_pred_proba1.append(i[1])
    
print(roc_auc_score(y_test, y_pred_proba1))

0.6464926099525939


### Observing Feature Importance

## TESTING COMBINED MODEL (OUR AND QIWI) PERFORMANCE

## HYPER PARAMETERS TUNING

In [100]:
clsfr = GridSearchCV(LogisticRegression(solver = 'liblinear', class_weight = 'balanced'),{
    'C':[0.005, 0.01,],
#     'max_iter':[50,100,200],
#     'tol':[0.0001,0.00001,0.001],
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}, scoring = 'roc_auc')
clsfr.fit(X_train, y_train.ravel())
clsfr.cv_results_

{'mean_fit_time': array([0.84308958, 0.94939451]),
 'std_fit_time': array([0.02270155, 0.03068124]),
 'mean_score_time': array([0.01751966, 0.01804528]),
 'std_score_time': array([0.00077105, 0.00148475]),
 'param_C': masked_array(data=[0.005, 0.01],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.005}, {'C': 0.01}],
 'split0_test_score': array([0.69264778, 0.6934123 ]),
 'split1_test_score': array([0.73201843, 0.73244397]),
 'split2_test_score': array([0.70175886, 0.70297515]),
 'split3_test_score': array([0.70532725, 0.70757677]),
 'split4_test_score': array([0.68102601, 0.68025865]),
 'mean_test_score': array([0.70255567, 0.70333337]),
 'std_test_score': array([0.01695915, 0.0172999 ]),
 'rank_test_score': array([2, 1])}