In [1]:
import pandas as pd
import numpy as np
import random
import string

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_curve, auc, roc_auc_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from numpy import arange, argmax
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve


import matplotlib.pyplot as plt
%matplotlib inline


pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 1500)
pd.set_option("display.max_colwidth", None)

# Load data

In [2]:
# Read the full extract and discard its "Unnamed: 0" column
# (presumably an index saved by an earlier to_csv call -- confirm).
df = pd.read_csv("all_files.csv").drop(columns="Unnamed: 0")

In [3]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1269126 entries, 0 to 1269125
Data columns (total 30 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   reporting_date        1269126 non-null  object 
 1   customer_id           1269126 non-null  object 
 2   total_cust_exposure   1269126 non-null  int64  
 3   overdraft             1269126 non-null  int64  
 4   consumer_loan         1269126 non-null  int64  
 5   credit_card           1269126 non-null  int64  
 6   customer_rating       1269126 non-null  object 
 7   account_id            1269126 non-null  object 
 8   open_date             1269126 non-null  object 
 9   maturity_date         1269126 non-null  object 
 10  original_principal    1269126 non-null  int64  
 11  current_principal     1269126 non-null  int64  
 12  interest_rate         1269126 non-null  float64
 13  prepayment_status     1269126 non-null  object 
 14  l_period              1269126 non-

## towns_transforming

In [4]:
# Town tiers consumed by new_town_labels; "Sofia" is handled there as its own tier.
major_towns = [
    "Plovdiv",
    "Varna",
]
big_towns = [
    "Burgas",
    "Ruse",
    "Stara Zagora",
    "Pleven",
]
medium_towns = [
    "Sliven", "Dobrich", "Shumen", "Pernik", "Haskovo", "Yambol",
    "Pazardzhik", "Blagoevgrad", "Veliko Tarnovo", "Vratsa", "Gabrovo",
    "Asenovgrad", "Vidin", "Kazanlak", "Kyustendil", "Kardzhali", "Montana",
    "Dimitrovgrad", "Targovishte", "Lovech", "Silistra", "Razgrad",
    "Dupnitsa", "Gorna Oryahovitsa", "Smolyan", "Svishtov",
]



def new_town_labels(row):
    """Map a town name to its size tier.

    Returns "Capital" for "Sofia", the tier label of whichever of
    major_towns / big_towns / medium_towns contains the name, and np.nan
    for any value found in none of them.
    """
    if row == "Sofia":
        return "Capital"

    # NOTE(review): "MediumS Town" looks like a typo for "Medium Town"; it is
    # kept verbatim because the label ends up in the dummy column names.
    tiers = (
        ("Major Town", major_towns),
        ("Big Town", big_towns),
        ("MediumS Town", medium_towns),
    )
    for tier_label, tier_members in tiers:
        if row in tier_members:
            return tier_label

    return np.nan

In [5]:
# Replace the raw town names in both town columns with their size tier
# (values not covered by new_town_labels become NaN).
df["town_x"] = df["town_x"].map(new_town_labels)
df["town_y"] = df["town_y"].map(new_town_labels)

## deducted_principal 

In [8]:
# Principal deducted relative to the previous row, computed in one vectorised
# pass instead of a per-row .loc loop (which is extremely slow on ~1.27M rows).
# Like the loop it replaces, this relies on the default 0..N-1 RangeIndex so
# that "previous row" means the row directly above.
previous_principal = df["current_principal"].shift(1)
if not df.empty:
    # The first row has no predecessor: measure it against its original principal.
    previous_principal.iloc[0] = df["original_principal"].iloc[0]
df["deducted_principal"] = previous_principal - df["current_principal"]
# NOTE(review): the difference is taken across consecutive rows regardless of
# account_id, so the first row of each account is compared with the last row of
# a different account -- confirm this is intended, or group by account_id.


## income_vs_payment_ratio: deducted_principal / tot_mnth_income

In [9]:
# Ratio of deducted_principal to tot_mnth_income (the payment is the
# numerator, despite the word order in the column name).
principal_deducted = df["deducted_principal"]
monthly_income = df["tot_mnth_income"]
df["income_vs_payment_ratio"] = principal_deducted / monthly_income

## debt_towards_bank

In [10]:
# total_cust_exposure minus current_principal
# (presumably the customer's exposure beyond this loan -- confirm).
total_exposure = df["total_cust_exposure"]
outstanding_principal = df["current_principal"]
df["debt_towards_bank"] = total_exposure - outstanding_principal

In [13]:
def debt_towards_bank_labels(row):
    """Return 0 when the value is zero or negative, otherwise 1.

    A NaN input also yields 1, because NaN <= 0 evaluates to False.
    """
    return 0 if row <= 0 else 1

In [15]:
# 0/1 indicator of whether debt_towards_bank is strictly positive.
df["debt_towards_bank_bool"] = df["debt_towards_bank"].map(debt_towards_bank_labels)

## convert town to dummy variables

In [21]:
# One-hot encode each town tier column and attach the indicator columns,
# prefixed with the source column name, next to the existing ones.
for town_col in ("town_x", "town_y"):
    df = df.join(pd.get_dummies(df[town_col], prefix=town_col), how="left")

## rating

In [23]:
def new_rating(row):
    """Convert a rating string to a float, encoding a "+"/"-" modifier as a fraction.

    - contains "+": first character followed by ".5"   ("4+" -> 4.5)
    - contains "-": first character followed by ".9", minus 1   ("5-" -> ~4.9)
    - otherwise:    the whole string parsed with float()

    NOTE(review): only the first character is used for modified ratings, so a
    multi-digit grade such as "10+" would be read as 1.5.
    """
    if "+" in row:
        return float(row[0] + ".5")
    if "-" in row:
        return float(row[0] + ".9") - 1
    return float(row)

In [26]:
# Numeric rating in which the "+"/"-" modifier is kept as a fractional part.
df["customer_rating_adj"] = df["customer_rating"].map(new_rating)

In [27]:
# Inspect the numeric ratings produced by new_rating.
df["customer_rating_adj"]

0          4.0
1          5.0
2          4.0
3          4.5
4          4.5
          ... 
1269121    5.5
1269122    4.9
1269123    5.5
1269124    3.9
1269125    3.9
Name: customer_rating_adj, Length: 1269126, dtype: float64

In [30]:
def new_rating_int(row):
    """Return the integer grade of a rating string, ignoring any "+"/"-" modifier.

    When the string contains "+" or "-", only its first character is parsed;
    otherwise the whole string is passed to int().
    """
    has_modifier = "+" in row or "-" in row
    return int(row[0]) if has_modifier else int(row)

In [31]:
# Integer grade of the rating with the "+"/"-" modifier discarded.
df["customer_rating_int"] = df["customer_rating"].map(new_rating_int)

## remove null rows

In [34]:
# Discard every row that holds a missing value in any column. Rows whose town
# was mapped to NaN by new_town_labels are removed here as well.
df.dropna(axis="index", how="any", inplace=True)

## convert target

In [56]:
def new_labels(row):
    """Binarise a prepayment status: "no" maps to 0, any other value to 1."""
    return int(row != "no")

In [55]:
# Frequency of each raw prepayment_status value.
df["prepayment_status"].value_counts()

no                                        884026
own                                        97670
refinanced other banks                     36092
refinanced-UC                               8842
refinanced-UC & own                         1530
refinanced-UC & refinanced other banks        64
refinanced other banks & own                  63
Name: prepayment_status, dtype: int64

In [57]:
# Binary target: 0 for "no", 1 for every other prepayment status.
df["prepayment_status_unique"] = df["prepayment_status"].map(new_labels)

## to csv

In [62]:
# Remove the source columns that were re-encoded above, plus customer_rating_adj.
df.drop(
    columns=[
        "customer_rating",
        "prepayment_status",
        "customer_rating_adj",
        "town_x",
        "town_y",
    ],
    inplace=True,
)

In [63]:
# Persist the full engineered frame; the index is written as the first column
# (pandas default, spelled out here).
df.to_csv("all_files_v3.csv", index=True)

In [64]:
# Draw a 10% random sample and export it. The seed is fixed so that the sample,
# the file written from it and every figure derived from it are identical on
# each Restart & Run All; without it each run produced a different sample.
SAMPLE_SEED = 42
sample_df = df.sample(frac=0.1, random_state=SAMPLE_SEED)
sample_df.to_csv("sample_all_v3.csv")

In [65]:
# Class balance of the binary target within the sample.
sample_df["prepayment_status_unique"].value_counts()

0    88407
1    14422
Name: prepayment_status_unique, dtype: int64

In [66]:
# Column overview (names, non-null counts, dtypes) of the sampled frame.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102829 entries, 439400 to 295318
Data columns (total 44 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   reporting_date            102829 non-null  object 
 1   customer_id               102829 non-null  object 
 2   total_cust_exposure       102829 non-null  int64  
 3   overdraft                 102829 non-null  int64  
 4   consumer_loan             102829 non-null  int64  
 5   credit_card               102829 non-null  int64  
 6   account_id                102829 non-null  object 
 7   open_date                 102829 non-null  object 
 8   maturity_date             102829 non-null  object 
 9   original_principal        102829 non-null  int64  
 10  current_principal         102829 non-null  int64  
 11  interest_rate             102829 non-null  float64
 12  l_period                  102829 non-null  int64  
 13  exp_monthly_payments      102829 non-nu