In [1]:
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

In [2]:
# Load the raw dataset from a CSV file given by a path relative to the working directory
df = pd.read_csv("sample_data_intw.csv")

### Data Exploration and Preprocessing 

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0.1,Unnamed: 0,label,aon,daily_decr30,daily_decr90,rental30,rental90,last_rech_date_ma,last_rech_date_da,last_rech_amt_ma,...,cnt_loans30,amnt_loans30,maxamnt_loans30,medianamnt_loans30,cnt_loans90,amnt_loans90,maxamnt_loans90,medianamnt_loans90,payback30,payback90
count,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0,...,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0,209593.0
mean,104797.0,0.875177,8112.343445,5381.402289,6082.515068,2692.58191,3483.406534,3755.8478,3712.202921,2064.452797,...,2.758981,17.952021,274.658747,0.054029,18.520919,23.645398,6.703134,0.046077,3.398826,4.321485
std,60504.431823,0.330519,75696.082531,9220.6234,10918.812767,4308.586781,5770.461279,53905.89223,53374.83343,2370.786034,...,2.554502,17.379741,4245.264648,0.218039,224.797423,26.469861,2.103864,0.200692,8.813729,10.308108
min,1.0,0.0,-48.0,-93.012667,-93.012667,-23737.14,-24720.58,-29.0,-29.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,52399.0,1.0,246.0,42.44,42.692,280.42,300.26,1.0,0.0,770.0,...,1.0,6.0,6.0,0.0,1.0,6.0,6.0,0.0,0.0,0.0
50%,104797.0,1.0,527.0,1469.175667,1500.0,1083.57,1334.0,3.0,0.0,1539.0,...,2.0,12.0,6.0,0.0,2.0,12.0,6.0,0.0,0.0,1.666667
75%,157195.0,1.0,982.0,7244.0,7802.79,3356.94,4201.79,7.0,0.0,2309.0,...,4.0,24.0,6.0,0.0,5.0,30.0,6.0,0.0,3.75,4.5
max,209593.0,1.0,999860.755168,265926.0,320630.0,198926.11,200148.11,998650.377733,999171.80941,55000.0,...,50.0,306.0,99864.560864,3.0,4997.517944,438.0,12.0,3.0,171.5,171.5


In [4]:
# Total number of missing cells across the whole frame
print(df.isna().sum().sum())

0


In [5]:
# checking for correlation
# Restrict to numeric columns explicitly: the frame still holds non-numeric
# columns at this point (e.g. msisdn), and pandas >= 2.0 raises on them in
# DataFrame.corr() instead of silently dropping them as older versions did.
df.select_dtypes(include="number").corr()

Unnamed: 0.1,Unnamed: 0,label,aon,daily_decr30,daily_decr90,rental30,rental90,last_rech_date_ma,last_rech_date_da,last_rech_amt_ma,...,cnt_loans30,amnt_loans30,maxamnt_loans30,medianamnt_loans30,cnt_loans90,amnt_loans90,maxamnt_loans90,medianamnt_loans90,payback30,payback90
Unnamed: 0,1.0,0.000403,-0.002048,0.002739,0.003077,-0.003906,-0.003459,-0.001853,-0.001133,-0.001064,...,0.001725,0.002387,0.000698,-0.002005,0.002241,0.000781,0.001742,-0.002615,-4e-05,0.002411
label,0.000403,1.0,-0.003785,0.168298,0.16615,0.058085,0.075521,0.003728,0.001711,0.131804,...,0.196283,0.197272,0.000248,0.044589,0.004733,0.199788,0.084144,0.035747,0.048336,0.049183
aon,-0.002048,-0.003785,1.0,0.001104,0.000374,-0.00096,-0.00079,0.001692,-0.001693,0.004256,...,-0.001826,-0.001726,-0.002764,0.004664,-0.000611,-0.002319,-0.001191,0.002771,0.00194,0.002203
daily_decr30,0.002739,0.168298,0.001104,1.0,0.977704,0.442066,0.458977,0.000487,-0.001636,0.275837,...,0.366116,0.471492,-2.8e-05,-0.01161,0.008962,0.563496,0.400199,-0.037305,0.026915,0.047175
daily_decr90,0.003077,0.16615,0.000374,0.977704,1.0,0.434685,0.47173,0.000908,-0.001886,0.264131,...,0.340387,0.447869,2.5e-05,-0.005591,0.009446,0.567204,0.397251,-0.034686,0.0194,0.0408
rental30,-0.003906,0.058085,-0.00096,0.442066,0.434685,1.0,0.955237,-0.001095,0.003261,0.127271,...,0.180203,0.233453,-0.000864,-0.016482,0.004012,0.298943,0.234211,-0.035489,0.072974,0.095147
rental90,-0.003459,0.075521,-0.00079,0.458977,0.47173,0.955237,1.0,-0.001688,0.002794,0.121416,...,0.171595,0.231906,-0.001411,-0.009467,0.005141,0.327436,0.251029,-0.034122,0.06711,0.099501
last_rech_date_ma,-0.001853,0.003728,0.001692,0.000487,0.000908,-0.001095,-0.001688,1.0,0.00179,-0.000147,...,0.001193,0.000903,0.000928,0.001835,-0.000225,0.00087,-0.001123,0.002771,-0.002233,-0.001583
last_rech_date_da,-0.001133,0.001711,-0.001693,-0.001636,-0.001886,0.003261,0.002794,0.00179,1.0,-0.000149,...,0.00038,0.000536,0.000503,6.1e-05,-0.000972,0.000519,0.001524,-0.002239,7.7e-05,0.000417
last_rech_amt_ma,-0.001064,0.131804,0.004256,0.275837,0.264131,0.127271,0.121416,-0.000147,-0.000149,1.0,...,-0.027612,0.008502,0.001,0.02837,9.3e-05,0.014067,0.14846,0.021004,-0.027369,-0.01426


In [6]:
# Number of distinct values in pcircle
df["pcircle"].nunique()

1

In [7]:
# Number of distinct values in msisdn
df['msisdn'].nunique()

186243

In [8]:
# Drop the columns that will not be used any further
df = df.drop(columns=['Unnamed: 0', 'pcircle', 'pdate'])

In [9]:
# Remaining column names after the drop
df.columns

Index(['label', 'msisdn', 'aon', 'daily_decr30', 'daily_decr90', 'rental30',
       'rental90', 'last_rech_date_ma', 'last_rech_date_da',
       'last_rech_amt_ma', 'cnt_ma_rech30', 'fr_ma_rech30',
       'sumamnt_ma_rech30', 'medianamnt_ma_rech30', 'medianmarechprebal30',
       'cnt_ma_rech90', 'fr_ma_rech90', 'sumamnt_ma_rech90',
       'medianamnt_ma_rech90', 'medianmarechprebal90', 'cnt_da_rech30',
       'fr_da_rech30', 'cnt_da_rech90', 'fr_da_rech90', 'cnt_loans30',
       'amnt_loans30', 'maxamnt_loans30', 'medianamnt_loans30', 'cnt_loans90',
       'amnt_loans90', 'maxamnt_loans90', 'medianamnt_loans90', 'payback30',
       'payback90'],
      dtype='object')

In [10]:
# (rows, columns) of the frame
df.shape

(209593, 34)

In [11]:
# Row count per class of the target, to see how unbalanced the labels are
df['label'].value_counts()

1    183431
0     26162
Name: label, dtype: int64

In [12]:
# Predictors: every column except the target and the msisdn column
X = df.drop(columns=['label', 'msisdn'])
# Target vector
y = df['label']

In [13]:
# Scale every feature column to unit L2 norm (axis=0 normalizes column-wise).
# `normalize` is imported explicitly from sklearn.preprocessing: a bare
# `import sklearn` does not guarantee that the submodule attribute is loaded,
# so `sklearn.preprocessing.normalize` only worked as a side effect of other imports.
# NOTE(review): the column norms are computed over all rows, i.e. before the
# train/test split, so the test rows influence the scaling — consider
# computing the scaling from the training rows only.
X = normalize(X, norm='l2', axis=0)

### Splitting into Train and Test Data

In [14]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=101,
)

In [15]:
# Shapes of the four split parts, printed on one line
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(140427, 32) (69166, 32) (140427,) (69166,)


### Logistic Regression : Training and Evaluation

In [16]:
# Fit a logistic regression with default hyper-parameters on the training split
# (fit returns the estimator itself, so construction and fitting can be chained)
model = LogisticRegression().fit(X_train, y_train)

In [17]:
# Predict labels for the held-out test rows and display the resulting array
y_predict=model.predict(X_test)
y_predict

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [18]:
# Mean accuracy of the logistic regression on the training split
model.score(X_train, y_train)

0.8758073589836712

In [19]:
# Mean accuracy of the logistic regression on the held-out test split
model.score(X_test, y_test)

0.8738975797357083

### RandomForestClassifier : Training and Evaluation

In [20]:
# Fit a random forest. min_samples_split=10 requires at least 10 samples in a
# node before it may be split, which limits how far the trees can overfit.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest and its scores are reproducible across kernel restarts and re-runs.
model2 = RandomForestClassifier(min_samples_split=10, random_state=101).fit(X_train, y_train)
y_predict = model2.predict(X_test)
# Distinct classes the forest predicts on the test rows
set(y_predict)

{0, 1}

In [21]:
# Training accuracy of the random forest; compare with the test accuracy to gauge overfitting
model2.score(X_train, y_train)

0.9533707905174931

In [22]:
# Mean accuracy of the random forest on the held-out test split
model2.score(X_test, y_test)

0.9101003383165139

### Explanation and Conclusion

Steps followed to achieve the task
1. Data cleaning and data preprocessing were done
2. Split the data into training data and testing data (33% held out for testing)
3. Tried out different ML Models 
     a. Logistic Regression : The test accuracy was 87.4%. The model was not powerful enough because of the unbalanced data: the label '1' is present 87.5% of the time whereas the label '0' is present 12.5% of the time, so this accuracy is roughly what always predicting '1' would score.
     b. Random Forest Classifier : The model worked well; I tuned the parameters to reduce overfitting by increasing the minimum samples split (min_samples_split=10). It achieved a test accuracy of 91% (train accuracy 95.3%).
4. Final Conclusion: Created a Delinquency Model with a Random Forest Classifier, with a test accuracy of 91%