# User's Creditworthiness Model

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.compose import make_column_transformer
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sn
from keras.models import Sequential
from keras.layers import Dense

### Preparing Data

In [3]:
# OPENING RAW DATAFRAME
df = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v4_/raw_data.csv')

# CLEANING AGE AND EXP VALUES
# Out-of-range values (age outside 18-65, exp outside 0-47) are set to NaN so
# the dropna below removes them together with rows that were already missing.
# np.where with a NaN branch always yields float columns; the one-hot encoding
# cell later selects numeric features by float dtype, so keep this form.
age_out_of_range = (df['age'] < 18) | (df['age'] > 65)
exp_out_of_range = (df['exp'] < 0) | (df['exp'] > 47)
df['age'] = np.where(age_out_of_range, np.nan, df['age'])
df['exp'] = np.where(exp_out_of_range, np.nan, df['exp'])
df = df.dropna(axis = 'rows', subset = ['age','exp'])

# REMOVING NOT APPLICABLE DATA
# .copy() detaches the filtered rows from the parent frame, so adding the
# `target` column below does not raise pandas' SettingWithCopyWarning.
# NOTE(review): this literal uses a space ('not applicable') while the target
# literal uses an underscore ('not_paid') - confirm both match the raw data.
df = df[df['invoice_status'] != 'not applicable'].copy()
# CODING TARGET VALUES WITH 0 OR 1
# 1 = invoice_status equals 'not_paid'; 0 = every other remaining status.
df['target'] = np.where(df['invoice_status'] == 'not_paid', 1, 0)

### Preparing and Organizing Categorical Dataframes

In [4]:
# DEVICES LOOKUP TABLE (read as-is from the data library)
devices_lib = pd.read_csv(
    filepath_or_buffer = 'C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/devices_lib.csv'
)

In [5]:
# PREPARING KBM DATAFRAME
# Bucket the numeric `kbm` column into labelled bands. np.select evaluates the
# conditions in order and the first match wins, so each "< upper bound" test
# only catches values not already claimed by a lower band. Rows that match no
# condition (kbm is missing, so every comparison is False) get the '?' sentinel.
kbm_values = df['kbm']
kbm_bands = [
    (kbm_values < 0.7,  '0.5+'),
    (kbm_values < 0.8,  '0.7+'),
    (kbm_values < 0.9,  '0.8+'),
    (kbm_values < 1,    '0.9+'),
    (kbm_values == 1,   '1'),
    (kbm_values < 2.3,  '1.4+'),
    (kbm_values >= 2.3, '2.3+'),
]
df['kbm_grouped'] = np.select(
    [condition for condition, _ in kbm_bands],
    [label for _, label in kbm_bands],
    default = '?',
)
# Turn the sentinel into a real missing value. np.nan is used instead of the
# np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): as in the original, this replaces '?' in EVERY column of df,
# not only in kbm_grouped - confirm no other column uses '?' as a real value.
df = df.replace('?', np.nan)

In [6]:
# PREPARING BIRTH PLACE REGION DATAFRAME
# Lookup from the raw passport birth-place string to a country / region pair.
# The literal string 'None' is treated as missing, incomplete rows are
# dropped, and each birth-place string is kept once.
bp = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/birthplaces_lib.csv')
bp = bp[['PassportBirthPlace','country','region']]
bp = bp.replace('None', np.nan)
bp = bp.dropna(axis='rows')
bp = bp.drop_duplicates(subset = ['PassportBirthPlace'])

# BIRTH PLACES LIB
bp_clsfied = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/birthplaces_classified_lib.csv')

# COUNTRIES
# One row per country, used as a fallback region group when the region itself
# is not found in the classified library.
countries = bp_clsfied.drop_duplicates(subset = 'bp_country')
# NOTE(review): positional selection drops the 1st and 3rd rows of the
# de-duplicated table; which countries those are depends on the CSV row order.
countries = pd.concat([countries.iloc[1:2],countries.iloc[3:]], axis='rows')
countries = countries[['bp_country', 'bp_region_group_detailed']]

# MERGING
# First match on region, then on country. Both right-hand tables carry
# bp_region_group_detailed, so after the second merge the region-level value
# has suffix _x and the country-level fallback has suffix _y.
bp = pd.merge(bp, bp_clsfied, left_on = 'region', right_on = 'bp_region_group_detailed', how = 'left')
bp = pd.merge(bp, countries, left_on = 'country', right_on = 'bp_country', how = 'left')

# Prefer the region-level group; fall back to the country-level one.
bp['bp_region_group_detailed'] = bp['bp_region_group_detailed_x'].fillna(bp['bp_region_group_detailed_y'])
bp = bp[['PassportBirthPlace', 'bp_region_group_detailed']]
# bp_clsfied may hold several rows per bp_region_group_detailed value, in which
# case the first merge repeats birth places. Restore one row per birth place so
# a later left join on PassportBirthPlace cannot multiply rows.
bp = bp.drop_duplicates(subset = ['PassportBirthPlace'])

In [7]:
# MOBILE OPERATOR CODES LOOKUP TABLE (read as-is from the data library)
mob = pd.read_csv(
    filepath_or_buffer = 'C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/mobile_codes_lib.csv'
)

In [8]:
# LICENSE CATEGORY LOOKUP TABLE (read as-is from the data library)
lcns = pd.read_csv(
    filepath_or_buffer = 'C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/license_cat_lib.csv'
)

### Enriching the Original Dataframe with Classified Categorical Data

In [9]:
# DATA ENRICHMENT
# Attach the four lookup tables with left joins, so rows of df without a match
# are kept and simply get NaN in the joined columns.
df = (
    df
    .merge(devices_lib, how = 'left', left_on = 'device_type', right_on = 'device')
    .merge(bp, how = 'left', left_on = 'birth_place', right_on = 'PassportBirthPlace')
    .merge(mob, how = 'left', on = 'mobile_code')
    .merge(lcns, how = 'left', on = 'license_category')
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16554 entries, 0 to 16553
Data columns (total 41 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   user_id                        16554 non-null  int64  
 1   login                          16554 non-null  int64  
 2   age                            16554 non-null  float64
 3   exp                            16554 non-null  float64
 4   birth_place                    12469 non-null  object 
 5   kbm                            13075 non-null  float64
 6   sex                            16554 non-null  object 
 7   device_type                    14254 non-null  object 
 8   region_name_en                 16554 non-null  object 
 9   mobile_code                    16554 non-null  int64  
 10  license_category               12381 non-null  object 
 11  PassportDepartmentCode         5635 non-null   object 
 12  PassportRegistration           5830 non-null  

In [10]:
# CALCULATING AGE OF DEVICE MODEL ON THE USER'S ACTIVATION DATE
# Year of the threshold timestamp minus the device's release year.
df['thld_year'] = pd.DatetimeIndex(df['threshold_timestamp']).year
device_age_years = df['thld_year'] - df['device_release_year']
# Stored as text, so the column has object dtype rather than a numeric one;
# missing differences become the string 'nan'.
df['device_age_at_thld_date'] = device_age_years.astype('str')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16554 entries, 0 to 16553
Data columns (total 43 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   user_id                        16554 non-null  int64  
 1   login                          16554 non-null  int64  
 2   age                            16554 non-null  float64
 3   exp                            16554 non-null  float64
 4   birth_place                    12469 non-null  object 
 5   kbm                            13075 non-null  float64
 6   sex                            16554 non-null  object 
 7   device_type                    14254 non-null  object 
 8   region_name_en                 16554 non-null  object 
 9   mobile_code                    16554 non-null  int64  
 10  license_category               12381 non-null  object 
 11  PassportDepartmentCode         5635 non-null   object 
 12  PassportRegistration           5830 non-null  

### TESTING LOGISTIC REGRESSION ON REG DATA

### Testing Neural Network Performance

In [17]:
# LEAVING ONLY COLUMNS THAT'LL BE USED FOR THE MODEL
features = [
    'mobile_operator', 'sex', 'age', 'exp', 'bp_region_group_detailed', 'kbm_grouped', 'brand',
    'device_age_at_thld_date', 'device_feature', 'region_name_en', 'license_category_grouped',
]
target = ['target']
df = df[features + target]

# REPLACING NA VALUES WITH 'NaN'
# The text 'nan' is first turned back into a real missing value, then every
# categorical column gets the explicit placeholder category 'NaN'.
# The numeric columns 'age' and 'exp' are deliberately left out.
df = df.replace('nan', np.nan)
categorical_features = [
    'mobile_operator', 'sex', 'bp_region_group_detailed', 'kbm_grouped', 'brand',
    'device_age_at_thld_date', 'device_feature', 'region_name_en', 'license_category_grouped',
]
df[categorical_features] = df[categorical_features].fillna('NaN')

# SPLITTING DATASET INTO X AND y
# The index is reset to 0..n-1 first; X and y are both kept as DataFrames.
df = df.reset_index(drop=True)
X = df[features]
y = df[target]

# ONE-HOT ENCODING
# Object-dtype columns are one-hot encoded; float columns are passed through
# unchanged and appended after the dummy columns.
enc = preprocessing.OneHotEncoder()
X_obj = X.loc[:, X.dtypes == object]
X_flt = X.loc[:, X.dtypes == float]
enc.fit(X_obj)
# Name each dummy column "<source column>_<category>" instead of leaving the
# default integer labels 0..n-1. Integer labels mixed with the string labels
# of the float columns are unreadable and rejected by recent scikit-learn
# estimators, which require all feature names to be strings.
onehot_columns = [
    f'{column}_{category}'
    for column, categories in zip(X_obj.columns, enc.categories_)
    for category in categories
]
# Reuse X_obj's index so the join below aligns row-for-row with X_flt even if
# the index is not a clean 0..n-1 range.
X = pd.DataFrame(enc.transform(X_obj).toarray(), columns = onehot_columns, index = X_obj.index)
X = X.join(X_flt)

# SPLITTING X AND y TO TRAIN AND TEST SAMPLES
# 33% of the rows are held out for testing. random_state pins the shuffle so
# the same rows land in train/test on every run and scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# CONVERTING X AND y DATAFRAMES TO ARRAYS
# X_train = X_train.values
# X_test = X_test.values
# y_train = y_train.values
# y_test = y_test.values

# NN
# Fully connected binary classifier: 64 -> 32 -> 1 units, with a sigmoid
# output so predictions are scores in [0, 1].
# The input width is taken from the training matrix instead of a hard-coded
# number, so the network still builds when the set of one-hot columns changes.
n_input_features = X_train.shape[1]

classifier = Sequential()
classifier.add(Dense(units = 64, activation = 'relu', input_dim = n_input_features))
classifier.add(Dense(units = 32, activation = 'relu'))
classifier.add(Dense(units = 1, activation = 'sigmoid'))

classifier.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy')
# NOTE(review): batch_size = 1 performs one weight update per training row,
# which is slow on a dataset of this size; kept unchanged from the original.
classifier.fit(X_train, y_train, batch_size = 1, epochs = 5)

# Predicted scores for the held-out rows, evaluated with ROC AUC.
y_pred = classifier.predict(X_test)

print(roc_auc_score(y_test, y_pred))

#LogisticRegression
# logreg = LogisticRegression(solver='liblinear',class_weight='balanced',random_state=0,C=0.01).fit(X_train, y_train.ravel())
# y_pred = logreg.predict(X_test)

# y_pred_proba = logreg.predict_proba(X_test)
# y_pred_proba1 = []
# for i in y_pred_proba:
#     y_pred_proba1.append(i[1])
    
# print(classification_report(y_test,y_pred))
# print(roc_auc_score(y_test, y_pred_proba1))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.6042265802124374
