In [37]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import precision_recall_fscore_support

In [2]:
#load train data
train_data=[pd.read_csv('C:/Users/Ruchita/Desktop/loan dataset- ephesoft/train_v2.csv')]

print(train_data[0].describe())

target=[int(train_data[0]['loss'][i]!=0) for i in range(len(train_data[0]))]
train_data[0]['target']=target

#number of instances of each class
print('...')
print('Zero-loss instances : ',target.count(0))
print()
print('Non-zero loss instances : ',target.count(1))
print()

  interactivity=interactivity, compiler=compiler, result=result)


                  id             f1             f2             f3  \
count  105471.000000  105471.000000  105471.000000  105471.000000   
mean    52736.000000     134.603171       8.246883       0.499066   
std     30446.999458      14.725467       1.691535       0.288752   
min         1.000000     103.000000       1.000000       0.000006   
25%     26368.500000     124.000000       8.000000       0.248950   
50%     52736.000000     129.000000       9.000000       0.498267   
75%     79103.500000     148.000000       9.000000       0.749494   
max    105471.000000     176.000000      11.000000       0.999994   

                  f4             f5             f6             f7  \
count  105471.000000  105471.000000  105471.000000  105289.000000   
mean     2678.488874       7.354533   47993.704317    2974.336018   
std      1401.010943       5.151112   35677.136048    2546.551085   
min      1100.000000       1.000000       0.000000       1.000000   
25%      1500.000000       4.0000

In [3]:
# There are significant missing values in the data as well as class imbalance
# Also certain columns have mixed datatypes.. These have to be converted to integer values

#function to map all values to int
def map_int(x):
    try:
        y=int(x)
    except:
        y=x #in case of nan
    return y

columns = list(train_data[0].columns)
print('..ALL COLUMNS.. ',len(columns))
print()

# columns 135,204,274,417,462 have mixed datatypes i.e. numeric entries but int/str datatypes
# since these columns contain numeric entries we will map these to int datatype
mixed_datatypes=[135,204,274,417,462]
mixed_cols_names=[columns[i] for i in mixed_datatypes]
for col_name in mixed_cols_names:
    train_data[0][col_name]=train_data[0][col_name].map(lambda x:map_int(x))

#1. number of columns with missing values
missing_cols = [column for column in columns if train_data[0][column].count()!=len(train_data[0])]
print('..COLUMNS WITH MISSING VALUES.. ',len(missing_cols))
print()

# all datatypes
datatypes = list(set([type(train_data[0][column][0]) for column in columns]))
print('..UNIQUE DATATYPES..', len(datatypes))
print(datatypes)
print()

#2. types of features- continuous or categorical, numeric or string
numeric = [column for column in columns if type(train_data[0][column][0])!=str]
print('..Numerical columns.. ',len(numeric))
print()
string = [column for column in columns if type(train_data[0][column][0])==str]
print('..String datatype columns.. ',len(string))
print()
## string categorical or continuous
string_continuous=[column for column in string if len(train_data[0][column].unique())>50]
string_categorical=list(set(string).difference(set(string_continuous)))
print('..string continuous.. ', len(string_continuous))
print('..string categorical..', len(string_categorical))

print(train_data[0][string].head())

#all string datatype columns can be mapped to their integer values

for col_name in string:
    train_data[0][col_name]=train_data[0][col_name].map(lambda x:map_int(x))

#thus all columns now comprise of integers/floats with some missing values

print(train_data[0].describe())


..ALL COLUMNS..  772

..COLUMNS WITH MISSING VALUES..  525

..UNIQUE DATATYPES.. 4
[<class 'str'>, <class 'numpy.int64'>, <class 'int'>, <class 'numpy.float64'>]

..Numerical columns..  757

..String datatype columns..  15

..string continuous..  15
..string categorical.. 0
                      f138                f207                   f277  \
0    754485076006959972352   38600000000000000  683091368180479950848   
1        15300000000000000    1690000000000000    2140000000000000000   
2      6910365323840000000  389000000000000000      69200000000000000   
3  11225194901267999096832      35000000000000     295000000000000000   
4          108000000000000    1870000000000000      23100000000000000   

            f338                            f390  \
0  7610000000000   10370164393071999997033054208   
1      796594176    5098137566366599989877014528   
2   461000000000   26400269714792999161039945728   
3  1330000000000    9333818143939599917454983168   
4  2240000000000  19600466



                  id             f1             f2             f3  \
count  105471.000000  105471.000000  105471.000000  105471.000000   
mean    52736.000000     134.603171       8.246883       0.499066   
std     30446.999458      14.725467       1.691535       0.288752   
min         1.000000     103.000000       1.000000       0.000006   
25%     26368.500000     124.000000       8.000000       0.248950   
50%     52736.000000     129.000000       9.000000       0.498267   
75%     79103.500000     148.000000       9.000000       0.749494   
max    105471.000000     176.000000      11.000000       0.999994   

                  f4             f5             f6             f7  \
count  105471.000000  105471.000000  105471.000000  105289.000000   
mean     2678.488874       7.354533   47993.704317    2974.336018   
std      1401.010943       5.151112   35677.136048    2546.551085   
min      1100.000000       1.000000       0.000000       1.000000   
25%      1500.000000       4.0000

In [4]:
#dealing with missing values using mean substitution

for col in missing_cols:
    train_data[0].loc[train_data[0][col].isnull()==True,col]=train_data[0][col].mean()

In [5]:
# Class imbalance can be tackled using Oversampling Techniques

#step 1: divide the data into 10 smaller datasets maintaining the class imbalance

train_data[0]=train_data[0].iloc[np.random.permutation(range(len(train_data[0])))]
train_data[0]=train_data[0].reset_index(drop=True)

majority=[train_data[0][train_data[0]['target']==0]]
minority=[train_data[0][train_data[0]['target']==1]]

del train_data

k_maj=int(len(majority[0])/10)  #part size
k_min=int(len(minority[0])/10)  #part size

train_data_10=[pd.concat([majority[0][i*k_maj:(i+1)*k_maj],minority[0][i*k_min:(i+1)*k_min]]) for i in range(10)]
train_data_10.append(pd.concat([majority[0][9*k_maj:],minority[0][9*k_min:]]))

del majority
del minority

def shuffle_and_resetindex(df):
    df=df.iloc[np.random.permutation(range(len(df)))]
    df=df.iloc[np.random.permutation(range(len(df)))]
    df=df.reset_index(drop=True)
    return df




In [6]:
train_data_10=[shuffle_and_resetindex(x) for x in train_data_10]
train_data = train_data_10[:6]
test_data  = train_data_10[6:]
del train_data_10

## Instead of oversampling the entire data we will resample a fraction of the data (6/11) and test the models on the original unsampled data. 

In [7]:
#oversampling 

def oversample(X,Y,sampler):
    s=['smote','smoteb1','smoteb2','smotesvm','random','adasyn']
    print(sampler)
    if sampler==s[0]:
        sm=SMOTE(kind='regular')
    elif sampler==s[1]:
        sm=SMOTE(kind='borderline1')
    elif sampler==s[2]:
        sm=SMOTE(kind='borderline2')
    elif sampler==s[3]:
        sm=SMOTE(kind='svm')
    elif sampler==s[4]:
        sm=RandomOverSampler()
    else:
        sm=ADASYN()
    X_resampled,Y_resampled = sm.fit_sample(X,Y)
    return X_resampled,Y_resampled

features=list(train_data[0].columns)
features.remove('loss')
features.remove('target')
'''
.......
.......
CHANGE CODE HERE TO CHOOSE SAMPLING TECHNIQUE

samplers=['smote','smoteb1','smoteb2','random','adasyn','smote','smoteb1','smoteb2','random','adasyn','smotesvm']
xy_resampled=[oversample(train_data[i][features],train_data[i]['target'],samplers[i]) for i in range(len(train_data))]

.......
.......
'''
del train_data

smote
smoteb1
smoteb2
random
adasyn
smote


In [14]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def feature_select(x,y,k):
    kb = SelectKBest(f_classif,k)
    kb.fit(x,y)
    x_new= kb.fit_transform(x,y)
    params=kb.get_params()
    scores=kb.scores_
    return x_new,scores


In [24]:
#kbest_xy=[(feature_select(a,b,100),b) for (a,b) in xy_resampled]
import operator
(a,b)=xy_resampled[0]
x_new,scores=feature_select(a,b,100)
dct=dict([[i,scores[i]] for i in range(len(scores))])
sorted_dct = sorted(dct.items(), key=operator.itemgetter(1))
c=sorted_dct[-100:]
top_features=[features[c[i][0]] for i in range(len(c))]
print(type(top_features))
(jx,jy)=xy_resampled[1]
jx=pd.DataFrame(jx,columns=features).head()
xx=jx[top_features]
kbest_train=[(pd.DataFrame(a,columns=features)[top_features],b) for (a,b) in xy_resampled]
kbest_test =[(a[top_features],a['target']) for a in test_data]
del test_data
del xy_resampled



<class 'list'>


In [25]:
#del xy_resampled
clfs=[RandomForestClassifier(n_estimators=50),DecisionTreeClassifier(),AdaBoostClassifier(),KNeighborsClassifier(),RandomForestClassifier(n_estimators=50),LogisticRegression()]
clf_names=['RANDOM FOREST','DECISION TREE','ADA BOOST','KNN','RANDOM FOREST','Logistic Regression']

def train(kbest_train,clfs,clf_names):
    for i in range(len(kbest_train)):
        (x,y)=kbest_train[i]
        clfs[i].fit(x,y)
    return clfs

trained_cls=train(kbest_train,clfs,clf_names)

In [44]:

def test(kbest_test,trained_cls,clf_names):
    for i in range(len(trained_cls)):
        print('.....')
        print(clf_names[i])
        scores=[]
        f_scores=[]
        for j in range(len(kbest_test)):
            (x,y)=kbest_test[j]
            prec,recall,f_score,d=precision_recall_fscore_support(y,trained_cls[i].predict(x),pos_label=1,average='binary')
            print('Dataset : ',j)
            print('Precision : ',prec,' ,Recall : ',recall,'F_score : ',f_score)
            print()     
            f_scores.append(f_score)
            scores.append(trained_cls[i].score(x,y))
        print('Avg. performance : ',sum(scores)/len(scores))
        print('Avg. f_score : ',sum(f_scores)/len(f_scores))
        print()
        print('.....')


In [45]:
test(kbest_test,trained_cls,clf_names)

.....
RANDOM FOREST
Dataset :  0
Precision :  0.181818181818  ,Recall :  0.0122699386503 F_score :  0.0229885057471

Dataset :  1
Precision :  0.132075471698  ,Recall :  0.00715746421268 F_score :  0.0135790494665

Dataset :  2
Precision :  0.0961538461538  ,Recall :  0.00511247443763 F_score :  0.00970873786408

Dataset :  3
Precision :  0.0877192982456  ,Recall :  0.00511247443763 F_score :  0.00966183574879

Dataset :  4
Precision :  0.0877192982456  ,Recall :  0.00509683995923 F_score :  0.00963391136802

Avg. performance :  0.903111533048
Avg. f_score :  0.0131144080389

.....
.....
DECISION TREE
Dataset :  0
Precision :  0.113584036838  ,Recall :  0.151329243354 F_score :  0.129767645769

Dataset :  1
Precision :  0.112915699923  ,Recall :  0.149284253579 F_score :  0.128577719066

Dataset :  2
Precision :  0.102290076336  ,Recall :  0.137014314928 F_score :  0.117132867133

Dataset :  3
Precision :  0.112547528517  ,Recall :  0.151329243354 F_score :  0.12908853031

Dataset :  4