In [2]:
import pandas as pd 
import numpy as np
import statsmodels.formula.api as smf

data_filename = 'C:/Users/User/Desktop/Practical_DS_Leon/Assign_3/data/creditcard.csv'
df = pd.read_csv(data_filename)
#There are not 'NaNs' in rows
df = df.dropna()

In [3]:
#As we can see we have highly imbalanced dataset
from collections import Counter
print('Dataset normal transactions vs frauds {}'.format(Counter(df['Class'])))

Dataset normal transactions vs frauds Counter({0: 284315, 1: 492})


Some general discussion:

Owing to such imbalance in data, an algorithm that does no feature analysis and predicts all the transactions as non-frauds will  achieve an accuracy of 99.828%. Hence, accuracy is not a correct measurement of efficiency in this case.

'Time' feature does not indicate the actual time of the transaction but is listing the data in a chronological order. So we assume that 'Time' feature has no significance in classifying a transaction as a fraud. Hence, we eliminate this column from the analysis

In [5]:
from sklearn.preprocessing import StandardScaler
# Removing 'Time' column from the data as it has no significance in model
df = df.drop('Time', axis = 1, errors = 'ignore')

#standardize the 'Ammount' feuture because all the other feutures have values arround 0
df['Amount']= StandardScaler().fit_transform(df['Amount'].values.reshape(-1,1))

Resampling the dataset:

These are techniques that will process the data to have an approximate 50-50 ratio.

One way to achieve this is OVER-sampling, this technique adds copies of the under-represented class(better when you have little data).

Another is UNDER-sampling, this technique deletes instances from the over-represented class (better when we have a lot of data).
In our analysis we will use both balancing techniques so we can compare the results

In [6]:
#UNDER-SAMPLE DATAFRAME
# Number of data points in the minority class
number_records_fraud = len(df[df['Class'] == 1])
fraud_indices = df[df['Class'] == 1].index

# Picking the indices of the normal classes
normal_indices = df[df['Class'] == 0].index

# Out of the indices we picked, randomly select "x" number (number_records_fraud)
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace = False)

# Appending the 2 indices
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])

# Under sample dataset
df_under_sample = df.iloc[under_sample_indices,:]

# Showing ratio
print("Percentage of normal transactions: ", len(df_under_sample[df_under_sample['Class'] == 0])/len(df_under_sample))
print("Percentage of fraud transactions: ", len(df_under_sample[df_under_sample['Class'] == 1])/len(df_under_sample))
print("Total number of transactions in resampled data: ", len(df_under_sample))

Percentage of normal transactions:  0.5
Percentage of fraud transactions:  0.5
Total number of transactions in resampled data:  984


In [7]:
#OVER-SAMPLE DATAFRAME

# Number of data points in the majority class
number_records_not_fraud = len(df[df['Class'] == 0])

fraud_indices = df[df['Class'] == 1].index
normal_indices = df[df['Class'] == 0].index

# Picking the indices of the fraud classes
fraud_indices = df[df['Class'] == 1].index

# Out of the indices we picked, randomly select "x" number (number_records_not_fraud)
random_fraud_indices = np.random.choice(fraud_indices, number_records_not_fraud, replace = True)

# Appending the 2 indices
over_sample_indices = np.concatenate([normal_indices, random_fraud_indices])

# Over sample dataset
df_over_sample = df.iloc[over_sample_indices, :]

# Showing ratio
print("Percentage of normal transactions: ", len(df_over_sample[df_over_sample['Class'] == 0])/len(df_over_sample))
print("Percentage of fraud transactions: ", len(df_over_sample[df_over_sample['Class'] == 1])/len(df_over_sample))
print("Total number of transactions in resampled data: ", len(df_over_sample))

Percentage of normal transactions:  0.5
Percentage of fraud transactions:  0.5
Total number of transactions in resampled data:  568630


MODEL: LOGISTIC REGRESSION

Logistic regression calculate the odd ratio of a transaction between the amount of frauds to non-frauds.
Consequently, we have to set a threshold that a transcaction will be considered as a fraud.

We will try logistic regression on Imbalance, Under Sampled and Over Sampled dataframes so we can compare the results.

In [11]:
# Inbalanced model
formula = ('Class ~  V4 + V5 + V8 + V10 + \
       V13 + V14 + V16 + V20 + \
       V21 + V22 + V23 + Amount')
model = smf.logit(formula, data=df)
results = model.fit()
odds = results.fittedvalues.apply(lambda x: np.exp(x)).to_frame()
df['odds'] = odds
results.summary()

Optimization terminated successfully.
         Current function value: 0.003988
         Iterations 12


0,1,2,3
Dep. Variable:,Class,No. Observations:,284807.0
Model:,Logit,Df Residuals:,284794.0
Method:,MLE,Df Model:,12.0
Date:,"Sat, 13 Jan 2018",Pseudo R-squ.:,0.6863
Time:,04:16:00,Log-Likelihood:,-1135.9
converged:,True,LL-Null:,-3621.2
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-8.5209,0.123,-69.282,0.000,-8.762,-8.280
V4,0.6240,0.044,14.292,0.000,0.538,0.710
V5,0.0830,0.026,3.144,0.002,0.031,0.135
V8,-0.1957,0.020,-9.878,0.000,-0.235,-0.157
V10,-0.4916,0.049,-10.085,0.000,-0.587,-0.396
V13,-0.3037,0.078,-3.914,0.000,-0.456,-0.152
V14,-0.6903,0.036,-19.023,0.000,-0.761,-0.619
V16,-0.3108,0.053,-5.877,0.000,-0.414,-0.207
V20,-0.0579,0.033,-1.734,0.083,-0.123,0.008


In [9]:
#Over-Sample logistic-regression
formula = ('Class ~  V1 + V2+ V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + \
       V11 + V12 + V13 + V14 + V15 + V16 + V17 + V18 + V19 + V20 + \
       V21 + V22 + V23 + V24 + V25 + V26 + V28 + Amount')

#We used all the feutures and we observed that V27 wasn't statistically signicant correlated with the dependend variable
#We removed V27 and we run the model again
#Now all the variables are statistically significant for our model
#It is possible sometimes when we remove one variables the p-values of other feutures to change so we have to rexamine our model!
model = smf.logit(formula, data=df_over_sample)
results = model.fit()
#'Un-logarithm' the results so we can intepret them better
odds = results.fittedvalues.apply(lambda x: np.exp(x)).to_frame()

df_over_sample['odds'] = odds
results.summary()

Optimization terminated successfully.
         Current function value: 0.137045
         Iterations 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


0,1,2,3
Dep. Variable:,Class,No. Observations:,568630.0
Model:,Logit,Df Residuals:,568601.0
Method:,MLE,Df Model:,28.0
Date:,"Sat, 13 Jan 2018",Pseudo R-squ.:,0.8023
Time:,04:12:34,Log-Likelihood:,-77928.0
converged:,True,LL-Null:,-394140.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.4683,0.013,-271.871,0.000,-3.493,-3.443
V1,0.6363,0.014,44.495,0.000,0.608,0.664
V2,0.6202,0.020,31.064,0.000,0.581,0.659
V3,0.4030,0.011,37.572,0.000,0.382,0.424
V4,0.7693,0.007,111.723,0.000,0.756,0.783
V5,0.7101,0.016,43.873,0.000,0.678,0.742
V6,-0.5642,0.011,-50.628,0.000,-0.586,-0.542
V7,-0.6031,0.019,-31.706,0.000,-0.640,-0.566
V8,-0.3979,0.007,-58.417,0.000,-0.411,-0.385


In [12]:
#Under-Sample logistic-regression
#after some trial and error we kept only the feutures that their correlations are statistically significant in our model.
formula = ('Class ~  V4 + V5 + V8 + V10 + \
       V13 + V14 + V20 + \
       V22 + V23 + Amount')

model = smf.logit(formula, data=df_under_sample)
results = model.fit()
#'Un-logarithm' the results so we can intepret them better
odds = results.fittedvalues.apply(lambda x: np.exp(x)).to_frame()
df_under_sample['odds'] = odds
results.summary()

Optimization terminated successfully.
         Current function value: 0.159195
         Iterations 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


0,1,2,3
Dep. Variable:,Class,No. Observations:,984.0
Model:,Logit,Df Residuals:,973.0
Method:,MLE,Df Model:,10.0
Date:,"Sat, 13 Jan 2018",Pseudo R-squ.:,0.7703
Time:,04:17:44,Log-Likelihood:,-156.65
converged:,True,LL-Null:,-682.06
,,LLR p-value:,2.1029999999999998e-219

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.0567,0.229,-13.350,0.000,-3.505,-2.608
V4,0.9062,0.108,8.373,0.000,0.694,1.118
V5,0.0668,0.088,0.759,0.448,-0.106,0.239
V8,-0.2787,0.081,-3.436,0.001,-0.438,-0.120
V10,-0.5243,0.167,-3.144,0.002,-0.851,-0.197
V13,-0.3577,0.161,-2.222,0.026,-0.673,-0.042
V14,-0.8245,0.137,-6.031,0.000,-1.092,-0.557
V20,-0.4134,0.216,-1.916,0.055,-0.836,0.010
V22,0.2578,0.203,1.272,0.203,-0.140,0.655


MODELS EVALUATION

Acuracy Score:

As mentioned above, accuracy score is a useless evaluation for our imbalance dataframe but it may be useful on the resampled dfs.

Precision and Recall:

Precision denotes the probability that a transaction that is classified as fraud is truly a fraud.
Recall (aka. True Positive Rate) is the probability that a true fraud is recognized by the classifier.

Precision: Out of the frauds we found how many are real frauds. Frauds identified / Actual frauds indentified 

Recall : How many of the real frauds we found. Actual frauds identidied / Total Frauds

It is obvious that there is a tradeoff between those two definition.
If we want recall = 1 then probably we have to increase the number of transactions predicted as frauds.

In our case, because we are predicting credit frauds, recall maybe is more important because we want to avoid frauds.
On the other hand, we do not want to block too many transactions that we are not sure about their fraudment and make user's life difficult.

In [24]:
#Custom function to calculate precision and recalls for different thresholds
def model_evaluation(df , odd_threshold):
    #we determine the transactions' prediction as fraud or non-fraud depending on the threshold that have been given.
    df['class_predict'] = df['odds'].apply(lambda odd: 1 if odd > odd_threshold else 0)
    #count the number of transactions that have been predicted as frauds
    frauds_identified = df['class_predict'].sum()
    #count the number of frauds that have been predicted correctly (True-Positive) 
    actual_frauds_ident = len(df.loc[(df['Class'] == 1) & (df['class_predict'] == 1), :])
    #count the number of all the actual frauds
    total_actual_frauds = df['Class'].sum()
    #count the false_positives
    false_positive = len(df.loc[(df['Class'] == 0) & (df['class_predict'] == 1), :])
    #count the false_negatives
    false_negative = len(df.loc[(df['Class'] == 1) & (df['class_predict'] == 0), :])
    
    
    Accuracy = len(df.loc[df['Class'] == df['class_predict'], :])/ len(df)
    Precision = actual_frauds_ident / frauds_identified
    Recall = actual_frauds_ident / total_actual_frauds
    
    print('Precision: ',format(Precision, '.3f'))
    print('Recall:    ', format(Recall, '.3f'))
    print('Accuracy:  ', format(Accuracy, '.3f'))
    print('False-Positive: ', false_positive)
    print('False-Negative: ', false_negative)

In [29]:
#print the results so we can compare them
#give different thresholds to compare the tradeoffs between precision/recall-false_positive/false_negative
print('Imbalanced Dataframe')
model_evaluation(df , 1.5)
print()
print('Over Sampled Dataframe')
model_evaluation(df_over_sample , 1.5)
print()
print('Under Sample Dataframe')
model_evaluation(df_under_sample, 1.5)

Imbalanced Dataframe
Precision:  0.891
Recall:     0.600
Accuracy:   0.999
False-Positive:  36
False-Negative:  197

Over Sampled Dataframe
Precision:  0.983
Recall:     0.913
Accuracy:   0.949
False-Positive:  4470
False-Negative:  24795

Under Sample Dataframe
Precision:  0.984
Recall:     0.902
Accuracy:   0.944
False-Positive:  7
False-Negative:  48


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


As we explained above, on the imbalanced df the accuracy is extremly hight but the recall and precision are bad!!!

On the other two dataframes the results look really good. We can tune the threshold to find the balance point between precision and recall that we prefer.

We should remember that in this exercise we use the same df for trainig and testing. This is not a good practise because of overfitting.