## Модель по платежеспособности 11/23/2020
1. Модель на основе регистрационных данных
    * Низкое качество предсказания (варьируется в зависимости от типа штрафа)
    * Варианты улучшения качества предсказания:
        * Генерация новых фичей
        * Различные статистические техники (метод K-folds итд)
        * Добавить разбивку штрафов по договору на более мелкие категории
        * Исправить ошибки на которые ругается Python
        * Использовать другие типы зависимостей (не только линейные) для параметров, например логистическую для возраста/стажа
2. Модель на основе данных об использовании сервиса
    * Качество предсказание неплохое (так же варьируется в зависимости от типа штрафа)
    * Дальнейшие действия:
        * То же самое что и для модели на основе регистрационных данных + добавление параметра % оплаты и разбивка датасета
          на временные интервалы для тестирования качества модели

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.compose import make_column_transformer
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sn

## PREPARING DATA

In [20]:
# OPENING RAW DATAFRAME
df_train = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/raw_data_1_2019_2_2020.csv')
df_test = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/raw_data_3_2020_8_2020.csv')

# REMOVING DUPLICATES
df_train = df_train.drop_duplicates('user_id')
df_test = df_test.drop_duplicates('user_id')

# OPENING CONTRACT PENALTIES BY TYPE AND MERGING IT WITH THE MAIN DATAFRAME
ct_train = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/raw_data_contract_penalties_1_2019_2_2020.csv')
ct_test = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/raw_data_contract_penalties_3_2020_8_2020.csv')
df_train = pd.merge(df_train,ct_train, on = 'user_id', how = 'left')
df_test = pd.merge(df_test,ct_test, on = 'user_id', how = 'left')

# CLEANING AGE AND EXP VALUES
df_train['age'] = np.where(((df_train.age < 18)|(df_train.age>65)),np.nan,df_train.age)
df_test['age'] = np.where(((df_test.age < 18)|(df_test.age>65)),np.nan,df_test.age)
df_train['exp'] = np.where(((df_train.exp < 0)|(df_train.exp>47)),np.nan,df_train.exp)
df_test['exp'] = np.where(((df_test.exp < 0)|(df_test.exp>47)),np.nan,df_test.exp)
df_train = df_train.dropna(axis = 'rows', subset = ['age','exp'])
df_test = df_test.dropna(axis = 'rows', subset = ['age','exp'])
df_train = df_train.fillna('NaN')
df_test = df_test.fillna('NaN')

# CONVERTING OBJECT TYPES TO FLOAT
for i in ['years_since_last_ride','rents_count', 'bill_total','last_month_ride', 'avg_week_rents']:
    df_train[i] = df_train[i].replace('NaN',np.nan)
for i in ['years_since_last_ride','rents_count', 'bill_total','last_month_ride', 'avg_week_rents']:
    df_test[i] = df_test[i].replace('NaN',np.nan)
df_train = df_train.dropna(axis = 'rows', subset = ['years_since_activation','years_since_last_ride','rents_count',\
                                                'bill_total','last_month_ride', 'avg_week_rents'])
df_test = df_test.dropna(axis = 'rows', subset = ['years_since_activation','years_since_last_ride','rents_count',\
                                                'bill_total','last_month_ride', 'avg_week_rents'])
df_train.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

# CALCULATING % OF PAID AGREEMENT INVOICES
def target_calc(df, paid_invoices_sum, invoices_sum, non_binary_column_name, binary_column_name):
    df[non_binary_column_name] = df[paid_invoices_sum]/df[invoices_sum]
    df[non_binary_column_name] = df[[non_binary_column_name,'1']].min(axis=1)
    df[binary_column_name] = np.where(df[non_binary_column_name] == 0.0,1,0)

df_train['1'] = 1 
df_test['1'] = 1
for i,e in [['total_invoices_sum', 'total_paid_invoices_sum'],['rent_invoices_sum', 'paid_rent_invoices_sum'],\
            ['camera_invoices_sum', 'paid_camera_invoices_sum'],['agreement_invoices_sum', 'paid_agreement_invoices_sum'],\
            ['accident_invoices_sum', 'paid_accident_invoices_sum'],['other_invoices_sum', 'paid_other_invoices_sum'],\
            ['agreement_accident_invoices_sum', 'paid_agreement_accident_invoices_sum'],\
            ['agreement_STD_invoices_sum','paid_agreement_STD_invoices_sum'],\
            ['agreement_evacuation_invoices_sum', 'paid_agreement_evacuation_invoices_sum'],\
            ['agreement_new_injuries_invoices_sum', 'paid_agreement_new_injuries_invoices_sum'],\
            ['agreement_other_invoices_sum', 'paid_agreement_other_invoices_sum']]:
    target_calc(df_train, e, i, '%_'+e, i[:-4]+'_no_payment')
    target_calc(df_test, e, i, '%_'+e, i[:-4]+'_no_payment')
df_train = df_train.drop('1',1)
df_test = df_test.drop('1',1)

  interactivity=interactivity, compiler=compiler, result=result)


### PREPARING AND ORGANIZING CATEGORICAL DATAFRAMES

In [21]:
# PREPARING DEVICES DATAFRAME
devices_lib = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/devices_lib.csv')
user_devices = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/user_devices.csv')
user_devices = pd.merge(user_devices, devices_lib, left_on = 'device_type', right_on = 'device', how = 'left')
user_devices = user_devices[['user_id', 'device_type', 'brand', 'device_gen', 'branded_gen_grouped', 'gen_grouped']]

In [22]:
# PREPARING KBM DATAFRAME
kbm = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/user_kbm.csv')
kbm = kbm[['user_id', 'kbm']]
kbm['kbm_grouped'] = np.where(kbm['kbm']<0.7,'0.5+',\
                              np.where(kbm['kbm']<0.8,'0.7+',\
                                       np.where(kbm['kbm']<0.9,'0.8+',\
                                                np.where(kbm['kbm']<1,'0.9+',\
                                                         np.where(kbm['kbm'] == 1, '1',\
                                                                  np.where(kbm['kbm']<2.3,'1.4+',\
                                                                           np.where(kbm['kbm']>=2.3,'2.3+',np.nan)))))))

In [23]:
# PREPARING BIRTH PLACE REGION DATAFRAME
bp = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/birthplaces_lib.csv')
bp = bp[['PassportBirthPlace','country','region']]
bp = bp.replace('None', np.nan)
bp = bp.dropna(axis='rows')
bp = bp.drop_duplicates(subset = ['PassportBirthPlace'])

# BIRTH PLACES LIB
bp_clsfied = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/birthplaces_classified_lib.csv')

# COUNTRIES
countries = bp_clsfied.drop_duplicates(subset = 'bp_country')
countries = pd.concat([countries.iloc[1:2],countries.iloc[3:]], axis='rows')
countries = countries[['bp_country', 'bp_region_group_detailed']]

# MERGING
bp = pd.merge(bp, bp_clsfied, left_on = 'region', right_on = 'bp_region_group_detailed', how = 'left')
bp = pd.merge(bp, countries, left_on = 'country', right_on = 'bp_country', how = 'left')

bp['bp_region_group_detailed'] = np.where(pd.isnull(bp.bp_region_group_detailed_x) == True,bp.bp_region_group_detailed_y,\
                                            bp.bp_region_group_detailed_x)
bp = bp[['PassportBirthPlace', 'bp_region_group_detailed']]

In [24]:
# PREPARING MOBILE OPERATORS DATAFRAME
mob = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/mobile_codes_lib.csv')

In [25]:
# PREPARING LICENSE CATEGORY DATAFRAME
lcns = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/license_cat_lib.csv')

In [26]:
# PREPARING PASSPORT REGION DATAFRAME
psp_region = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/psp_regions_codes_lib.csv')
psp_region['psp_region_code'] = psp_region['psp_dep_region_code'].str.replace('<','')
psp_region['psp_region_code'] = psp_region['psp_region_code'].str.replace('>','')
psp_region = psp_region[['psp_region_code', 'psp_region', 'psp_fed_district']]

In [27]:
# PREPARING SCORING DATAFRAME
scr = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/users_deli_scores_lib.csv')

### ENRICHING ORIGINAL DATAFRAME WITH CLASSIFIED CATEGORICAL DATA

In [28]:
# DATA ENRICHMENT
df_train = pd.merge(df_train, user_devices, on = 'user_id', how = 'left')
df_train = pd.merge(df_train, kbm, on = 'user_id', how = 'left')
df_train = pd.merge(df_train, bp, left_on = 'birth_place', right_on = 'PassportBirthPlace', how = 'left')
df_train = pd.merge(df_train, mob, on = 'mobile_code', how = 'left')
df_train = pd.merge(df_train, lcns, on = 'license_category', how = 'left')

df_test = pd.merge(df_test, user_devices, on = 'user_id', how = 'left')
df_test = pd.merge(df_test, kbm, on = 'user_id', how = 'left')
df_test = pd.merge(df_test, bp, left_on = 'birth_place', right_on = 'PassportBirthPlace', how = 'left')
df_test = pd.merge(df_test, mob, on = 'mobile_code', how = 'left')
df_test = pd.merge(df_test, lcns, on = 'license_category', how = 'left')

# # SIMPLE MODIFICATION OF SOME VARIABLES
df_train['PassportDepartmentRegionCode'] = df_train['PassportDepartmentCode'].str[:2]
df_test['PassportDepartmentRegionCode'] = df_test['PassportDepartmentCode'].str[:2]

# # DATA ENRICHMENT CONTINUED
df_train = pd.merge(df_train, psp_region, left_on = 'PassportDepartmentRegionCode', right_on = 'psp_region_code', how = 'left')
df_test = pd.merge(df_test, psp_region, left_on = 'PassportDepartmentRegionCode', right_on = 'psp_region_code', how = 'left')
df_train = pd.merge(df_train, scr, on = 'user_id', how = 'left')
df_test = pd.merge(df_test, scr, on = 'user_id', how = 'left')

# REPLACE ALL NAN VALUES WITH STRING TYPE NAN FOR REGRESSION
df_train = df_train.fillna('NaN')
df_test = df_test.fillna('NaN')

### SPLITTING USER'S DATA INTO TWO CATEGORIES: REGISTRATIONAL AND SERVICE USAGE

In [34]:
# SPLITTING USER'S DATA INTO TWO CATEGORIES: REGISTRATIONAL AND SERVICE USAGE
reg_cols = ['user_id', 'mobile_operator', 'brand', 'branded_gen_grouped', 'gen_grouped','age',\
            'exp', 'bp_region_group_detailed', 'country', 'sex', 'region_name_en', 'license_category_grouped',\
            'kbm_grouped', 'PassportRegistration','psp_region', 'psp_fed_district']
service_usage_cols = ['user_id','years_since_registration', 'years_since_activation', 'years_since_last_ride', 'rents_count',\
                      'bill_total', 'bonus_total', 'last_month_ride', 'avg_week_rents', 'tariff','DrivingStyle_delimobilScore']
target_cols = list(df_train.columns[81:102])

df_usg_train = df_train[reg_cols+service_usage_cols+target_cols]
df_usg_test = df_test[reg_cols+service_usage_cols+target_cols]

# SAVING PROCESSED DATASETS
df_usg_train.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_usg_data_train.csv',index=False,\
                    encoding='utf-8-sig')
df_usg_test.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_usg_data_test.csv',index=False,\
                    encoding='utf-8-sig')

### EXPLANATORY ANALYSIS OF THE REGISTRATIONAL DATA

In [159]:
# EXPLANATORY ANALYSIS OF THE REGISTRATIONAL DATA
def groupby_algo(df,column,column_to_calc):
    grpd = df.groupby(column).sum()
    grpd[column_to_calc] = grpd['paid_accident_invoices_sum']/grpd['accident_invoices_sum']
    grpd = grpd[[column_to_calc]]
    grpd = grpd.sort_values(by = column_to_calc, ascending = False, na_position = 'first')
#     grpd = grpd.sort_values(by = column, ascending = True, na_position = 'first') # NUMBER-SPECIFIC LINE
    if len(grpd.index) > 10:
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(grpd)
            plt.plot(grpd)
    else:
        return grpd
        plt.plot(grpd)

groupby_algo(df_reg, 'gen_grouped', '%_paid_accident_invoices')

Unnamed: 0_level_0,%_paid_accident_invoices
gen_grouped,Unnamed: 1_level_1
last,0.767647
previous,0.513073
before previous,0.471736
old,0.319628
other,0.290546


In [None]:
# SAVING DATASETS WITH NO NAN VALUES
df_reg_notna = df_reg[df_reg['bp_region_group_detailed'].notna()]
df_reg_notna.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_reg_notna_data.csv',\
                    index=False, encoding='utf-8-sig')

# REMOVING NAN DATA FROM THE DATASET
df_reg_notna_agrmnt = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_reg_notna_agrmnt_data.csv')
df_reg_notna_agrmnt = df_reg_notna_agrmnt[df_reg_notna_agrmnt['agreement_invoices_sum'].notna()]
df_reg_notna_agrmnt.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_reg_notna_agrmnt_data.csv',\
                           index=False, encoding='utf-8-sig')

In [None]:
# LIMITING % OF PAID INVOICES WITH 100%
agrmnt['1'] = 1
agrmnt['%_paid_agreement_invoices'] = agrmnt[['1','%_paid_agreement_invoices']].min(axis=1)
agrmnt = agrmnt.drop('1',1)
agrmnt.info(verbose = True, null_counts = True)
agrmnt.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_reg_notna_agrmnt_data.csv',\
              index=False, encoding='utf-8-sig')

## TESTING LINEAR REGRESSION ON THE INVOICE LEVEL

In [118]:
# UPLOADING AGREEMENT INVOICES DATA
agrmnt = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_reg_notna_agrmnt_data.csv')

# CHECKING FOR MULTICOLLINEARITY
corrMatrix_df = agrmnt[list(agrmnt.columns[9:16])+list(agrmnt.columns[17:18])+list(agrmnt.columns[25:119])]
corrMatrix = corrMatrix_df.corr()
corrMatrix.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/corrMatrix.csv', index=False, encoding='utf-8-sig')

In [238]:
# OLS LINEAR REGRESSION
X = agrmnt[list(agrmnt.columns[9:16])+list(agrmnt.columns[25:119])+list(agrmnt.columns[121:122])]
y = agrmnt['%_paid_agreement_invoices']
model = sm.OLS(y, X).fit()
print(model.summary())

# WRITING DOWN PREDICTION VALUES
prediction = model.predict(X)
agrmnt['lin_prediction'] = prediction
agrmnt.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/vis_analysis.csv', index=False, encoding='utf-8-sig')

# CALCULATING RMSE
print(mean_absolute_error(y, prediction))

                                    OLS Regression Results                                    
Dep. Variable:     %_paid_agreement_invoices   R-squared (uncentered):                   0.792
Model:                                   OLS   Adj. R-squared (uncentered):              0.791
Method:                        Least Squares   F-statistic:                              1268.
Date:                       Tue, 17 Nov 2020   Prob (F-statistic):                        0.00
Time:                               18:26:12   Log-Likelihood:                         -16116.
No. Observations:                      34141   AIC:                                  3.244e+04
Df Residuals:                          34039   BIC:                                  3.330e+04
Df Model:                                102                                                  
Covariance Type:                   nonrobust                                                  
                                        coef    st

0.3213657618802598


In [156]:
# TESTING PREDICTION ON RANDOM VALUES
X_pred = X.iloc[0:0]
d1 = dict(zip(list(X.columns),[1,1,0,0,0,1,18]+[0]*5+[1]+[0]*88+[500]))
X_pred.loc[0] = [1,0,0,0,0,0,18]+[0]*70+[1]+[0]*23+[500] #TELE2;FEMALE;OTHER PHONE MODEL;18;CHECHEN;500
X_pred.loc[1] = [1,0,0,0,0,0,18]+[0]*70+[1]+[0]*23+[100000] #TELE2;FEMALE;OTHER PHONE MODEL;18;CHECHEN;100000
X_pred.loc[2] = [0,1,1,0,0,0,45]+[0]*5+[1]+[0]*88+[500] #NOT TELE2;MALE;LAST PHONE MODEL;45;MOSCOW;500
X_pred.loc[3] = [0,1,1,0,0,0,45]+[0]*5+[1]+[0]*88+[100000] #NOT TELE2;MALE;LAST PHONE MODEL;45;MOSCOW;100000
prediction = model.predict(X_pred)
prediction

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


0    0.614887
1    0.391227
2    0.982487
3    0.758827
dtype: float64

## TESTING LOGISTIC REGRESSION ON THE USER LEVEL

In [166]:
# CONVERTING % VALUES INTO BOOLEAN
df_reg['default_user'] = np.where(df_reg['%_paid_agreement_invoices_sum'] == 0.0,1,0)

# DEFINING X AND Y
X = df_reg[['mobile_operator','sex', 'gen_grouped', 'age', 'exp', 'bp_region_group_detailed', 'kbm_grouped',\
            'license_category_grouped']]
X = X.fillna('NaN')
y = df_reg['default_user']

# ONE-HOT ENCODING VARIABLES
column_trans = make_column_transformer(
    (OneHotEncoder(), ['mobile_operator', 'sex', 'gen_grouped', 'bp_region_group_detailed','kbm_grouped',\
                       'license_category_grouped']), remainder='passthrough')

# PIPELINE (BY DATA SCHOOL)
logreg = LogisticRegression(solver = 'lbfgs', class_weight = 'balanced', max_iter = 100, verbose = True)
pipe = make_pipeline(column_trans, logreg)
cross_val_score(pipe, X, y, cv = 5, scoring = 'accuracy').mean()

# PIPELINE (BY SCIKIT)
model = make_pipeline(column_trans, logreg)
_ = model.fit(X,y)
y_pred = model.predict(X)
mae = mean_absolute_error(y,y_pred)
roc_auc = roc_auc_score(y,y_pred)
print(mae, roc_auc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/module

0.4027680277974213 0.6540949245271162


## TESTING LOGISTIC REGRESSION ON REG DATA WITH QIWI SCORING

In [39]:
# LOADING df_reg_qiwi DATAFRAME
df_reg = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_reg_data.csv', low_memory = False)

# UPLOADING QIWI RESULTS
qiwi = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/QIWI_test/deli_results_23112020.csv')

# ADDING GROUP NAME TO THE QIWI DATASET
group_name = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/QIWI_test/qiwi_exp_full.csv')
group_name['ID'] = group_name['id']
group_name = group_name[['ID', 'group_name']]

# MERGING GROUP NAME WITH THE QIWI DATASET
qiwi = pd.merge(qiwi, group_name, on = 'ID', how = 'left')

# MERGING QIWI DATASET WITH REGISTRATIONAL DATA
df_reg_qiwi = pd.merge(df_reg, qiwi, left_on = 'user_id', right_on = 'ID', how = 'left')
df_reg_qiwi= df_reg_qiwi.loc[pd.isnull(df_reg_qiwi['ID']) == False]

# DEFINING X AND Y
X = df_reg_qiwi[['mobile_operator','sex', 'gen_grouped', 'age', 'exp', 'bp_region_group_detailed', 'kbm_grouped',\
                 'license_category_grouped']]
X = X.fillna('NaN')

# Xq = df_reg_qiwi[['mobile_operator','sex', 'gen_grouped', 'age', 'exp', 'bp_region_group_detailed', 'kbm_grouped',\
#             'license_category_grouped', 'pd_basis_60_v3_4']]
Xq = df_reg_qiwi[['pd_basis_60_v3_4']]
Xq = Xq.fillna('NaN')

for target in ['%_total_paid_invoices_sum','%_paid_rent_invoices_sum','%_paid_camera_invoices_sum',\
               '%_paid_agreement_invoices_sum','%_paid_accident_invoices_sum','%_paid_other_invoices_sum',\
               '%_paid_agreement_accident_invoices_sum','%_paid_agreement_STD_invoices_sum',\
               '%_paid_agreement_evacuation_invoices_sum','%_paid_agreement_new_injuries_invoices_sum',\
               '%_paid_agreement_other_invoices_sum']:
    
    # CONVERTING % VALUES INTO BOOLEAN
    df_reg_qiwi['default_user'] = np.where(df_reg_qiwi[target] == 0.0,1,0)                  
    y = df_reg_qiwi['default_user']
    
    # RUNNING LOOP
    for x, name in [(X,'w/o Qiwi'),(Xq, 'with Qiwi')]:
        
        # SPLITTING DATASET INTO TRAIN AND TEST SAMPLES
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
        
        # ONE-HOT ENCODING VARIABLES
#         column_trans = make_column_transformer((StandardScaler(), list(x_train.loc[:, x_train.dtypes == float].columns)),
#                                                (OneHotEncoder(), ['mobile_operator', 'sex', 'gen_grouped', \
#                                                                   'bp_region_group_detailed','kbm_grouped',\
#                                                                   'license_category_grouped']),remainder = 'passthrough')
        column_trans = make_column_transformer((StandardScaler(), list(x_train.loc[:, x_train.dtypes == float].columns)),
                                               (OneHotEncoder(), list(x_train.loc[:, x_train.dtypes == object])),\
                                                remainder = 'passthrough')
        # PIPELINE SET UP
        logreg = LogisticRegression(solver = 'lbfgs', class_weight = 'balanced', max_iter = 1000)
        model = make_pipeline(column_trans, logreg)
        _ = model.fit(x_train,y_train)
        y_pred_train = model.predict(x_train)
        y_pred_test = model.predict(x_test)
        roc_auc_train = roc_auc_score(y_train,y_pred_train)
        roc_auc_test = roc_auc_score(y_test,y_pred_test)
        print(target,';',name,';','train:',roc_auc_train,';','test:',roc_auc_test)

%_total_paid_invoices_sum ; w/o Qiwi ; train: 0.6116973436112371 ; test: 0.5777334667820261
%_total_paid_invoices_sum ; with Qiwi ; train: 0.5784054007165509 ; test: 0.6088874786329364
%_paid_rent_invoices_sum ; w/o Qiwi ; train: 0.7065351353472977 ; test: 0.6972932223048834
%_paid_rent_invoices_sum ; with Qiwi ; train: 0.6703830101577148 ; test: 0.7372163768931715
%_paid_camera_invoices_sum ; w/o Qiwi ; train: 0.7282131023408951 ; test: 0.6659142346634901
%_paid_camera_invoices_sum ; with Qiwi ; train: 0.6778521949639347 ; test: 0.75248470409876
%_paid_agreement_invoices_sum ; w/o Qiwi ; train: 0.6580193362683782 ; test: 0.6356161786185117
%_paid_agreement_invoices_sum ; with Qiwi ; train: 0.640896640584613 ; test: 0.6449192619731787
%_paid_accident_invoices_sum ; w/o Qiwi ; train: 0.7506210276690591 ; test: 0.6448898294535044
%_paid_accident_invoices_sum ; with Qiwi ; train: 0.6184935969194113 ; test: 0.6588765093987302
%_paid_other_invoices_sum ; w/o Qiwi ; train: 0.5487402002800773

## TESTING LOGISTIC REGRESSION ON USAGE DATA WITH QIWI SCORING

In [40]:
# LOADING df_reg_qiwi DATAFRAME
df_usg_train = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_usg_data_train.csv', low_memory = False)
df_usg_test = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/processed_usg_data_test.csv', low_memory = False)

# UPLOADING QIWI RESULTS
qiwi = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/QIWI_test/deli_results_23112020.csv')

# ADDING GROUP NAME TO THE QIWI DATASET
group_name = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/QIWI_test/qiwi_exp_full.csv')
group_name['ID'] = group_name['id']
group_name = group_name[['ID', 'group_name']]

# MERGING GROUP NAME WITH THE QIWI DATASET
qiwi = pd.merge(qiwi, group_name, on = 'ID', how = 'left')

# MERGING QIWI DATASET WITH REGISTRATIONAL DATA
df_usg_qiwi_train = pd.merge(df_usg_train, qiwi, left_on = 'user_id', right_on = 'ID', how = 'left')
df_usg_qiwi_test = pd.merge(df_usg_test, qiwi, left_on = 'user_id', right_on = 'ID', how = 'left')
df_usg_qiwi_train = df_usg_qiwi_train.loc[(pd.isnull(df_usg_qiwi_train['ID']) == False)&(df_usg_qiwi_train.tariff != 'сказка банк')]
df_usg_qiwi_test = df_usg_qiwi_test.loc[(pd.isnull(df_usg_qiwi_test['ID']) == False)&(df_usg_qiwi_test.tariff != 'сказка банк')]

# DEFINING X AND Y
X_train = df_usg_qiwi_train[['mobile_operator', 'brand', 'gen_grouped', 'age', 'exp', 'bp_region_group_detailed', 'sex',\
                 'license_category_grouped', 'kbm_grouped', 'psp_region', 'years_since_activation', 'years_since_last_ride',\
                 'rents_count', 'bill_total','last_month_ride', 'avg_week_rents', 'tariff']]
X_train = X_train.fillna('NaN')

X_test = df_usg_qiwi_test[['mobile_operator', 'brand', 'gen_grouped', 'age', 'exp', 'bp_region_group_detailed', 'sex',\
                 'license_category_grouped', 'kbm_grouped', 'psp_region', 'years_since_activation', 'years_since_last_ride',\
                 'rents_count', 'bill_total','last_month_ride', 'avg_week_rents', 'tariff']]
X_test = X_test.fillna('NaN')

Xq_train = df_usg_qiwi_train[['mobile_operator', 'brand', 'gen_grouped', 'age', 'exp', 'bp_region_group_detailed', 'sex',\
                  'license_category_grouped', 'kbm_grouped', 'psp_region', 'years_since_activation', 'years_since_last_ride',\
                  'rents_count', 'bill_total','last_month_ride', 'avg_week_rents', 'tariff', 'pd_basis_60_v3_4']]
Xq_train = Xq_train.fillna('NaN')

Xq_test = df_usg_qiwi_test[['mobile_operator', 'brand', 'gen_grouped', 'age', 'exp', 'bp_region_group_detailed', 'sex',\
                  'license_category_grouped', 'kbm_grouped', 'psp_region', 'years_since_activation', 'years_since_last_ride',\
                  'rents_count', 'bill_total','last_month_ride', 'avg_week_rents', 'tariff', 'pd_basis_60_v3_4']]
Xq_test = Xq_test.fillna('NaN')

for target in ['%_total_paid_invoices_sum','%_paid_rent_invoices_sum','%_paid_camera_invoices_sum',\
               '%_paid_agreement_invoices_sum','%_paid_accident_invoices_sum','%_paid_other_invoices_sum',\
               '%_paid_agreement_accident_invoices_sum','%_paid_agreement_STD_invoices_sum',\
               '%_paid_agreement_evacuation_invoices_sum','%_paid_agreement_new_injuries_invoices_sum',\
               '%_paid_agreement_other_invoices_sum']:
    
    # CONVERTING % VALUES INTO BOOLEAN
    df_usg_qiwi_train['default_user'] = np.where(df_usg_qiwi_train[target] == 0.0,1,0) 
    Y_train = df_usg_qiwi_train['default_user']
    df_usg_qiwi_test['default_user'] = np.where(df_usg_qiwi_test[target] == 0.0,1,0) 
    Y_test = df_usg_qiwi_test['default_user']
    
    # RUNNING LOOP
    for x_train, y_train, x_test, y_test, name in [(X_train,Y_train,X_test,Y_test,'w/o Qiwi'),(Xq_train,Y_train,Xq_test,Y_test, 'with Qiwi')]:
        
        # SPLITTING DATASET INTO TRAIN AND TEST SAMPLES
#         x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
        
        # ONE-HOT ENCODING VARIABLES
        column_trans = make_column_transformer((StandardScaler(), list(x_train.loc[:, x_train.dtypes == float].columns)),
                                               (OneHotEncoder(), list(x_train.loc[:, x_train.dtypes == object].columns)),\
                                               remainder = 'passthrough')
        # PIPELINE SET UP
        logreg = LogisticRegression(solver = 'lbfgs', class_weight = 'balanced', max_iter = 1000)
        model = make_pipeline(column_trans, logreg)
        _ = model.fit(x_train,y_train)
        y_pred_train = model.predict(x_train)
        y_pred_test = model.predict(x_test)
        roc_auc_train = roc_auc_score(y_train,y_pred_train)
        roc_auc_test = roc_auc_score(y_test,y_pred_test)
        print(target,';',name,';','train:',roc_auc_train,';','test:',roc_auc_test)

%_total_paid_invoices_sum ; w/o Qiwi ; train: 0.7522814349586708 ; test: 0.5994648555043649
%_total_paid_invoices_sum ; with Qiwi ; train: 0.7576429871712687 ; test: 0.6050226764997059
%_paid_rent_invoices_sum ; w/o Qiwi ; train: 0.9193298019738192 ; test: 0.568996918703648
%_paid_rent_invoices_sum ; with Qiwi ; train: 0.9229444369979644 ; test: 0.5923893005856024
%_paid_camera_invoices_sum ; w/o Qiwi ; train: 0.9236030722693052 ; test: 0.668369314990077
%_paid_camera_invoices_sum ; with Qiwi ; train: 0.9308713183480419 ; test: 0.6822827180304165
%_paid_agreement_invoices_sum ; w/o Qiwi ; train: 0.8080870497756558 ; test: 0.6899127193282363
%_paid_agreement_invoices_sum ; with Qiwi ; train: 0.8137597607142847 ; test: 0.6939529810808518
%_paid_accident_invoices_sum ; w/o Qiwi ; train: 0.879028320432273 ; test: 0.7270074284081668
%_paid_accident_invoices_sum ; with Qiwi ; train: 0.8822009417370377 ; test: 0.7495406694358366
%_paid_other_invoices_sum ; w/o Qiwi ; train: 0.7405153871684282

In [35]:
df_usg_train.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 931857 entries, 0 to 931856
Data columns (total 48 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   user_id                                     931857 non-null  int64  
 1   mobile_operator                             931857 non-null  object 
 2   brand                                       931857 non-null  object 
 3   branded_gen_grouped                         931857 non-null  object 
 4   gen_grouped                                 931857 non-null  object 
 5   age                                         931857 non-null  float64
 6   exp                                         931857 non-null  float64
 7   bp_region_group_detailed                    931857 non-null  object 
 8   country                                     931857 non-null  object 
 9   sex                                         931857 non-null  object 
 

## TESTING REG DATA LOGISTIC REGRESSION ON EVERY TYPE OF INVOICE

In [36]:
X = df_reg[['mobile_operator', 'brand', 'gen_grouped', 'age', 'exp', 'bp_region_group_detailed', 'sex',\
            'license_category_grouped', 'kbm_grouped', 'psp_region', 'pd_basis_60_v3_4']]

column_trans = make_column_transformer(
    (OneHotEncoder(), ['mobile_operator', 'brand', 'gen_grouped', 'bp_region_group_detailed', 'sex',\
            'license_category_grouped', 'kbm_grouped', 'psp_region']), remainder='passthrough')

# PIPELINE (BY DATA SCHOOL)
logreg = LogisticRegression(solver = 'lbfgs', class_weight = 'balanced', max_iter = 100, verbose = 1)
pipe = make_pipeline(column_trans, logreg)

# PIPELINE (BY SCIKIT)
model = make_pipeline(column_trans, logreg)

for i in ['camera_invoices_no_payment','agreement_invoices_no_payment','agreement_accident_invoices_no_payment',\
          'agreement_STD_invoices_no_payment','agreement_evacuation_invoices_no_payment',\
          'agreement_new_injuries_invoices_no_payment','agreement_other_invoices_no_payment','accident_invoices_no_payment']:
    y = df_reg[i]
    _ = model.fit(X,y)
    y_pred = model.predict(X)
    mae = mean_absolute_error(y,y_pred)
    roc_auc = roc_auc_score(y,y_pred)
    print(i, mae, roc_auc)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


camera_invoices_no_payment 0.19040282299147634 0.836845981753216


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


agreement_invoices_no_payment 0.29691111055888275 0.6917648614158272


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


agreement_accident_invoices_no_payment 0.2621455729231381 0.7647998529556517


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished


agreement_STD_invoices_no_payment 0.26599736587062944 0.7528276463651161


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


agreement_evacuation_invoices_no_payment 0.16783877140230113 0.8874610043987748


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


agreement_new_injuries_invoices_no_payment 0.27258268929698565 0.7723138817177527


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


agreement_other_invoices_no_payment 0.2696503566014761 0.7484853751611168


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


accident_invoices_no_payment 0.24216595015034417 0.8040271864953341


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


## TESTING USAGE DATA LOGISTICAL REGRESSION ON EVERY TYPE OF INVOICE

In [91]:
# CONVERTING OBJECT TYPES TO FLOAT
for i in ['years_since_last_ride','rents_count', 'bill_total','last_month_ride', 'avg_week_rents']:
    df_usg[i] = df_usg[i].replace('NaN',np.nan)
df_usg = df_usg.dropna(axis = 'rows', subset = ['years_since_activation','years_since_last_ride','rents_count',\
                                                'bill_total','last_month_ride', 'avg_week_rents'])
df_usg.reset_index(drop = True, inplace = True)

# RUNNING THE MODEL
X = df_usg[['mobile_operator', 'brand', 'gen_grouped', 'age', 'exp', 'bp_region_group_detailed', 'sex',\
            'license_category_grouped', 'kbm_grouped', 'psp_region', 'years_since_activation', 'years_since_last_ride',\
            'rents_count', 'bill_total','last_month_ride', 'avg_week_rents', 'tariff']]

column_trans = make_column_transformer(
    (OneHotEncoder(), ['mobile_operator', 'brand', 'gen_grouped', 'bp_region_group_detailed', 'sex',\
                       'license_category_grouped', 'kbm_grouped', 'psp_region','tariff']), remainder='passthrough')

# PIPELINE (BY DATA SCHOOL)
logreg = LogisticRegression(solver = 'lbfgs', class_weight = 'balanced', max_iter = 100, verbose = 1)
pipe = make_pipeline(column_trans, logreg)

# PIPELINE (BY SCIKIT)
model = make_pipeline(column_trans, logreg)

for i in ['total_invoices_no_payment', 'rent_invoices_no_payment','camera_invoices_no_payment','agreement_invoices_no_payment',\
          'accident_invoices_no_payment', 'other_invoices_no_payment']:
    y = df_usg[i]
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model.fit(X_train,y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    mae_train = mean_absolute_error(y_train,y_train_pred)
    roc_auc_train = roc_auc_score(y_train,y_train_pred)
    mae_test = mean_absolute_error(y_test,y_test_pred)
    roc_auc_test = roc_auc_score(y_test,y_test_pred)
#     save_df = pd.concat([X, pd.Series(y), pd.Series(y_pred)], axis=1)
#     save_df.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v2/'+i+'.csv', index = False, encoding='utf-8-sig')
    print(i, mae_train, mae_test, roc_auc_train, roc_auc_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.9s finished


total_invoices_no_payment 0.3062220170490476 0.30678167606809503 0.7309764698539204 0.730773836633257


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.5s finished


rent_invoices_no_payment 0.49933705174579884 0.5006766498874358 0.6919890983802548 0.688949966983652


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.7s finished


camera_invoices_no_payment 0.16104688909687265 0.1611501783320264 0.808926667639014 0.8116478684253392


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.8s finished


agreement_invoices_no_payment 0.19738362043524818 0.1959533807199049 0.7422764960024931 0.7509037824632497


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.9s finished


accident_invoices_no_payment 0.3001005489085068 0.29956302329699236 0.7617959970424053 0.7708363454196085


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.2s finished


other_invoices_no_payment 0.3743981821094613 0.3731629271747654 0.6957219928638533 0.6979292430646642


In [18]:
df.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1265056 entries, 0 to 1265055
Data columns (total 121 columns):
 #   Column                                      Non-Null Count    Dtype  
---  ------                                      --------------    -----  
 0   user_id                                     1265056 non-null  int64  
 1   mobile_code                                 1265056 non-null  int64  
 2   age                                         1265056 non-null  float64
 3   exp                                         1265056 non-null  float64
 4   birth_place                                 1265056 non-null  object 
 5   city                                        1265056 non-null  object 
 6   country                                     1265056 non-null  object 
 7   sex                                         1265056 non-null  object 
 8   region_name_en                              1265056 non-null  object 
 9   license_category                            1265056 non-