**fixing drive files issues**

In [16]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

**Import data to Google colab**

In [0]:
!pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [18]:
file_list = drive.ListFile({'q': "'1Id1XeWx9UkqwMMc3kcanhLTpb8kCT5eb' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))

title: CMPE 255 Group project.ipynb, id: 1UIBZMa3VLusVL-jylNPAX_OWhHtuakNW
title: CMPE255_Group3_CreditCardFraudDetection.pptx, id: 1NOIB4zmKrngOsoItdtlI0siQQLDeE8V1
title: data, id: 1r59iFHOBBz0-snPQZAEwFUB5v6llLJfj
title: train_transaction.csv, id: 1-ozFH9YKaNQwMnUFN8cZ0F1W1zDych01
title: sample_submission.csv, id: 1Pdi-XbWTYCRugsfvD6DP6O-Qo6xwrOsb
title: test_identity.csv, id: 1NyZqEME34-SQqejDgK526djGxeSNlOwh
title: train_identity.csv, id: 1xTCIWmmFA8IThN9uyEo4NGs21yOTkxIT
title: test_transaction.csv, id: 1Ap7C4MeEHVDTp0xVmohF0v7NzovS-27K


In [0]:
train_downloaded = drive.CreateFile({'id': '1-ozFH9YKaNQwMnUFN8cZ0F1W1zDych01'})
train_downloaded.GetContentFile('train_transaction.csv')
test_downloaded = drive.CreateFile({'id': '1xTCIWmmFA8IThN9uyEo4NGs21yOTkxIT'})
test_downloaded.GetContentFile('train_identity.csv')
test_downloaded = drive.CreateFile({'id': '1Ap7C4MeEHVDTp0xVmohF0v7NzovS-27K'})
test_downloaded.GetContentFile('test_transaction.csv')
test_downloaded = drive.CreateFile({'id': '1NyZqEME34-SQqejDgK526djGxeSNlOwh'})
test_downloaded.GetContentFile('test_identity.csv')



```
# This is formatted as code
```

**Import training set and test set into dataframe**

In [0]:
import pandas
import numpy as np
train_transaction = pandas.read_csv('train_transaction.csv')
train_identity = pandas.read_csv('train_identity.csv')
test_transaction = pandas.read_csv('test_transaction.csv')
test_identity = pandas.read_csv('test_identity.csv')
train_set = train_transaction.merge(train_identity,on='TransactionID',how='left')
test_set = test_transaction.merge(test_identity,on='TransactionID',how='left')
train_set.append(test_set)
df = train_set
df.describe().transpose()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


**Function to reduce memoery**

In [0]:
def reduce_mem(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

**Applying memory reduction to our dataset**

In [8]:
reduce_mem(df)

Mem. usage decreased from 1959.88 Mb to 650.48 Mb (66.8% reduction)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.500000,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2987001,0,86401,29.000000,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2987002,0,86469,59.000000,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2987003,0,86499,50.000000,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2987004,0,86506,50.000000,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,...,0.0,70787.0,,,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,,,,,,,,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.000000,W,6550,,150.0,visa,226.0,debit,272.0,87.0,48.0,,,,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,3.0,2.0,29.0,29.0,30.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
590536,3577536,0,15811049,39.500000,W,10444,225.0,150.0,mastercard,224.0,debit,204.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
590537,3577537,0,15811079,30.953125,W,12037,595.0,150.0,mastercard,224.0,debit,231.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
590538,3577538,0,15811088,117.000000,W,7826,481.0,150.0,mastercard,224.0,debit,387.0,87.0,3.0,,aol.com,,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,1.0,1.0,5.0,1.0,22.0,22.0,0.0,22.0,0.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
df.head()

**Replace all null values**

In [10]:
df.isnull().any().sum()

414

In [12]:
df = df.fillna(0)
df.isnull().any().sum()

0

**Transform categorical features to numerical**

In [0]:
categorical_data = df.select_dtypes(include=[object])

In [14]:
print(categorical_data.head())

Empty DataFrame
Columns: [ProductCD, card4, card6, P_emaildomain, R_emaildomain, M1, M2, M3, M4, M5, M6, M7, M8, M9, id_12, id_15, id_16, id_23, id_27, id_28, id_29, id_30, id_31, id_33, id_34, id_35, id_36, id_37, id_38, DeviceType, DeviceInfo]
Index: []


In [0]:
df = df.drop(categorical_data.columns,axis = 1)

In [0]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,...,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_13,id_14,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
0,2987000,0,86400,68.5,13926,0.0,150.0,142.0,315.0,87.0,19.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,315.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,0.0,0.0,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,0.0,0.0,0.0,0.0,84.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,-480.0,166.0,0.0,542.0,144.0,0.0,0.0,0.0,0.0,0.0,32.0


In [0]:
from sklearn import preprocessing
encoding = preprocessing.OneHotEncoder()
categorical_data.replace(0, "0", inplace=True)

le = preprocessing.LabelEncoder()
le_cat_data = categorical_data.apply(le.fit_transform)

le_cat_data.head()

Unnamed: 0,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,M6,M7,M8,M9,id_12,id_15,id_16,id_23,id_27,id_28,id_29,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,4,2,2,0,0,2,2,2,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4,3,2,17,0,0,0,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4,4,3,36,0,2,2,2,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,3,3,54,0,0,0,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,3,2,17,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,2,2,8,124,165,4,2,1,2,2,2,955


In [0]:
df = pandas.concat([df, le_cat_data], axis=1)

**Standardize some of our features**

In [0]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

df['amount'] = StandardScaler().fit_transform(df['TransactionAmt'].values.reshape(-1,1))
df['time'] = StandardScaler().fit_transform(df['TransactionDT'].values.reshape(-1,1))

df = df.drop(['TransactionDT','TransactionAmt'], axis = 1)
df.head()

  sqr = np.multiply(arr, arr, out=arr)


Unnamed: 0,TransactionID,isFraud,card1,card2,card3,card5,addr1,addr2,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,V1,...,id_20,id_21,id_22,id_24,id_25,id_26,id_32,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,M6,M7,M8,M9,id_12,id_15,id_16,id_23,id_27,id_28,id_29,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,amount,time
0,2987000,0,13926,0.0,150.0,142.0,315.0,87.0,19.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,2,2,0,0,2,2,2,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577987
1,2987001,0,2755,404.0,150.0,102.0,325.0,87.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,3,2,17,0,0,0,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577986
2,2987002,0,4663,490.0,150.0,166.0,330.0,87.0,287.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,315.0,0.0,0.0,0.0,315.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4,3,36,0,2,2,2,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577972
3,2987003,0,18132,567.0,150.0,117.0,476.0,87.0,0.0,0.0,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,0.0,0.0,0.0,0.0,84.0,0.0,0.0,0.0,0.0,111.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,3,3,54,0,0,0,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577965
4,2987004,0,4497,514.0,150.0,102.0,420.0,87.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,144.0,0.0,0.0,0.0,0.0,0.0,32.0,1,3,2,17,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,2,2,8,124,165,4,2,1,2,2,2,955,-0.0,-1.577964


In [0]:
X = df.drop(['isFraud'], axis = 1)
Y = df['isFraud']

**Applying PCA to get all features**

In [0]:
pca = PCA(n_components=432)
pComponents = pca.fit_transform(X.values)

In [0]:
eigenvalues = pca.explained_variance_
total_sum = eigenvalues.sum()

Calculating the top 50 PCA to understand the transformed features

In [0]:
for i in range(1, 50):
  total = eigenvalues[:i].sum()
  variance = total/total_sum
  print('total variance for {} PCA is : {}'.format(i, variance))

total variance for 1 PCA is : 0.6629360723761234
total variance for 2 PCA is : 0.9264209258326431
total variance for 3 PCA is : 0.9941266719977593
total variance for 4 PCA is : 0.9989472836005472
total variance for 5 PCA is : 0.9994913388371168
total variance for 6 PCA is : 0.9996313547957322
total variance for 7 PCA is : 0.9997203894034415
total variance for 8 PCA is : 0.9997873776749229
total variance for 9 PCA is : 0.9998431481518449
total variance for 10 PCA is : 0.9998898766453447
total variance for 11 PCA is : 0.9999155475881404
total variance for 12 PCA is : 0.9999290058566933
total variance for 13 PCA is : 0.9999395991924115
total variance for 14 PCA is : 0.9999474499751945
total variance for 15 PCA is : 0.9999540350811208
total variance for 16 PCA is : 0.9999599556503607
total variance for 17 PCA is : 0.9999655278624116
total variance for 18 PCA is : 0.9999698340001026
total variance for 19 PCA is : 0.9999733693783566
total variance for 20 PCA is : 0.9999759972513599
total var

The top 2 elements represent for 99% of data. 

In order to test our hypothesis, applied 30 pca to our dataset.

In [0]:
pca = PCA(n_components=30)
pComponents = pca.fit_transform(X.values)
pDf = pandas.DataFrame(data = pComponents
             , columns = ['pc_1', 'pc_2', 'pc_3', 'pc_4', 'pc_5','pc_6', 'pc_7','pc_8', 'pc_9', 'pc_10', 'pc_11', 'pc_12', 'pc_13', 'pc_14', 'pc_15', 'pc_16', 'pc_17', 'pc_18', 'pc_19', 'pc_20', 'pc_21', 'pc_22', 'pc_23', 'pc_24', 'pc_25', 'pc_26', 'pc_27', 'pc_28', 'pc_29', 'pca_30'])
finalDf = pandas.concat([pDf, Y], axis = 1)
finalDf.head()

Unnamed: 0,TransactionID,card1,card2,card3,card5,addr1,addr2,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,V1,V2,...,id_21,id_22,id_24,id_25,id_26,id_32,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,M6,M7,M8,M9,id_12,id_15,id_16,id_23,id_27,id_28,id_29,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,amount,time,isFraud
0,2987000,13926,0.0,150.0,142.0,315.0,87.0,19.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,2,2,0,0,2,2,2,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577987,0
1,2987001,2755,404.0,150.0,102.0,325.0,87.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,3,2,17,0,0,0,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577986,0
2,2987002,4663,490.0,150.0,166.0,330.0,87.0,287.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,315.0,0.0,0.0,0.0,315.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,4,3,36,0,2,2,2,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577972,0
3,2987003,18132,567.0,150.0,117.0,476.0,87.0,0.0,0.0,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,0.0,0.0,0.0,0.0,84.0,0.0,0.0,0.0,0.0,111.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,3,3,54,0,0,0,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577965,0
4,2987004,4497,514.0,150.0,102.0,420.0,87.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,32.0,1,3,2,17,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,2,2,8,124,165,4,2,1,2,2,2,955,-0.0,-1.577964,0


**Visualizing the distribution of top 10 pca**

In [0]:
import matplotlib.gridspec as gridspec
from matplotlib.pyplot import figure
from matplotlib import pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D 
plt.figure(figsize=(12,28*4))
plt.style.use('ggplot')
final_features = finalDf.iloc[:,0:10].columns
grid = gridspec.GridSpec(10, 1)
for i, cn in enumerate(final_features):
    axis = plt.subplot(grid[i])
    sns.distplot(finalDf[cn][finalDf.isFraud == 1], bins=50)
    sns.distplot(finalDf[cn][finalDf.isFraud == 0], bins=50)
    axis.set_xlabel('histogram of feature: ' + str(cn))
plt.show()

In [0]:
#Resampling
finalDf.shape
finalDf = pandas.concat([X, Y], axis = 1)
finalDf.head()

(590540, 434)

**Splitting the data into test set and train set**

In [1]:
finalDf.describe().transpose()

NameError: ignored

In [0]:
finalDf_copy = finalDf.copy()
#Spliting the data set into train and test set.

train_set = finalDf_copy.sample(frac=0.80, random_state=0)
test_set = finalDf_copy.drop(train_set.index)
train_set.shape

In [0]:
#Class Count after resampling
class_count_resample=train_set['isFraud'].value_counts().values
print(class_count_resample)
sns.barplot(['Genuine','Fraud'],class_count_resample)

In [0]:
del finalDf_copy
del finalDf

In [0]:
#Now we have traing set lets count how many samples of each class do we have
df_isFraud = train_set[train_set.isFraud == 1]
df_notFraud = train_set[train_set.isFraud == 0]

In [0]:
df_isFraud.shape


In [0]:
df_notFraud.shape

**Down Sampling**

In [0]:
from sklearn.utils import resample
df_downsampled = resample(df_notFraud,
                                      replace=False,
                                      n_samples=df_isFraud.shape[0])    # to match minority class
print("After Down Sampling:\n",df_downsampled.shape,df_isFraud.shape)

**Upsampling**

In [0]:
from sklearn.utils import resample
df_upsampled = resample(df_isFraud, replace=True, n_samples=df_notFraud.shape[0], random_state=12 )    # to match majority class
print("After Up Sampling:\n",df_upsampled.shape,df_isFraud.shape)

In [0]:
from sklearn.utils import shuffle
df_train = shuffle(pandas.concat([df_downsampled,df_isFraud]))

In [0]:
df_train_up = shuffle(pandas.concat([df_upsampled,df_notFraud]))

In [0]:
#Class Count after resampling
class_count_resample=df_train['isFraud'].value_counts().values
print(class_count_resample)
sns.barplot(['Genuine','Fraud'],class_count_resample)

In [0]:
#Class Count after resampling
class_count_resample=df_train_up['isFraud'].value_counts().values
print(class_count_resample)
sns.barplot(['Genuine','Fraud'],class_count_resample)

In [0]:
df_train

In [0]:
#Now we have the balanced traing data set we can 
#Divide the data set into X and y
y_train = df_train.isFraud
X_train = df_train.drop('isFraud',axis=1)

y_test = test_set.isFraud
X_test = test_set.drop('isFraud',axis=1)


**Using Logistic Regression**
traning our Model

In [0]:

from sklearn.linear_model import LogisticRegression as logReg
model_LR = logReg().fit(X_train,y_train)

In [0]:
prediction_LR = model_LR.predict(X_test)

from sklearn.metrics import accuracy_score

#Testing the model for accuracy on test data set
print(accuracy_score(y_test,prediction_LR))

In [0]:
from sklearn import metrics
confusion_matrix_LR = metrics.confusion_matrix(y_test, prediction_LR)
confusion_matrix_LR

In [0]:
print("Accuracy:",metrics.accuracy_score(y_test, prediction_LR))
print("Precision:",metrics.precision_score(y_test, prediction_LR))
print("Recall:",metrics.recall_score(y_test, prediction_LR))

**Using Naive Bayes**
traning our Model

In [0]:

from sklearn.naive_bayes import GaussianNB
model_NB = GaussianNB().fit(X_train,y_train)
prediction_NB = model_NB.predict(X_test)
print(accuracy_score(y_test,prediction_NB))

**Random Forest**

In [0]:
from sklearn.ensemble import RandomForestRegressor

In [0]:
model_RF = RandomForestRegressor(
    n_estimators=400, max_features=0.3,
    min_samples_leaf=20, n_jobs=-1, verbose=1)

In [0]:
model_RF.fit(X_train,y_train)

In [0]:
prediction_RF = model_RF.predict(X_test)

In [0]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, prediction_RF)

**SVM**

In [0]:
from sklearn.svm import LinearSVC
model_svm = LinearSVC()
model_svm.fit(X_train, y_train)

In [0]:
predict_SVM = model_svm.predict(X_test)

**KMeans**

In [0]:
from sklearn.cluster import KMeans

# Number of clusters
kmeans = KMeans(n_clusters=2)
# Fitting the input data
kmeans = kmeans.fit(X_train)
# Getting the cluster labels
predict_kmeans = kmeans.predict(X_test)
# Centroid values
centroids = kmeans.cluster_centers_

print(centroids) # From sci-kit learn

In [0]:
print(accuracy_score(y_test,predict_kmeans ))