# FRAUD TRANSACTION DETECTION
- Objective: To build a machine learning model to detect potentially fraudulent transactions.
- Data: Credit card transaction data from Kaggle.

### 1. Import required modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datetime import time
from sklearn.model_selection import train_test_split
from random import sample, seed
from sklearn.ensemble import RandomForestClassifier

### 2. Create a few functions for later use

In [2]:
# A function to encode the error messages under
def encode_error(error_message, errors_list):
    if type(error_message)==float:
        return 0
    else:
        code = 0
        for i in errors_list:
            if i in error_message:
                code += 10**(errors_list.index(i))
        return code

In [3]:
# A function to convert data stored as currency into numerical
def strip_dollar_sign(string):
    return float(string.replace('$', ''))

In [4]:
# A function to convert time into minutes (i.e., minutes from midnight). E.g. 01:10 would be 70
def time_to_min(string):
    return 60*int(string[:2]) + int(string[-2:])

### 3. Import raw data

In [5]:
transactions = pd.read_csv('credit_card_transactions-ibm_v2.csv')
print(transactions.shape)
transactions.head()

(24386900, 15)


Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [6]:
cards = pd.read_csv('sd254_cards.csv')
print(cards.shape)
cards.head()

(6146, 13)


Unnamed: 0,User,CARD INDEX,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No


In [7]:
users = pd.read_csv('sd254_users.csv')
print(users.shape)
users.head()

(2000, 18)


Unnamed: 0,Person,Current Age,Retirement Age,Birth Year,Birth Month,Gender,Address,Apartment,City,State,Zipcode,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards
0,Hazel Robinson,53,66,1966,11,Female,462 Rose Lane,,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
1,Sasha Sadr,53,68,1966,12,Female,3606 Federal Boulevard,,Little Neck,NY,11363,40.76,-73.74,$37891,$77254,$191349,701,5
2,Saanvi Lee,81,67,1938,11,Female,766 Third Drive,,West Covina,CA,91792,34.02,-117.89,$22681,$33483,$196,698,5
3,Everlee Clark,63,63,1957,1,Female,3 Madison Street,,New York,NY,10069,40.71,-73.99,$163145,$249925,$202328,722,4
4,Kyle Peterson,43,70,1976,9,Male,9620 Valley Stream Drive,,San Francisco,CA,94117,37.76,-122.44,$53797,$109687,$183855,675,1


### 4. Defining variables

In [8]:
X = transactions.drop(columns='Is Fraud?')
y = transactions['Is Fraud?']

### 5. Data pre-processing
#### 5.1 Checking for missing values

In [9]:
print('NUMBER OF UNIQUE ENTRIES')
for col in X.columns:
    print(col, f'{len(X[col].unique()):,}')

NUMBER OF UNIQUE ENTRIES
User 2,000
Card 9
Year 30
Month 12
Day 31
Time 1,440
Amount 98,953
Use Chip 3
Merchant Name 100,343
Merchant City 13,429
Merchant State 224
Zip 27,322
MCC 109
Errors? 24


In [10]:
# Starting with columns with few unique values; that can easily be observed visually
print('UNIQUE ENTRIES')
for col in X.columns:
    if (len(X[col].unique()) < 25):
        print(col, f'{X[col].unique()}')

UNIQUE ENTRIES
Card [0 1 2 3 4 5 6 7 8]
Month [ 9 10 11 12  1  2  3  4  5  6  7  8]
Use Chip ['Swipe Transaction' 'Online Transaction' 'Chip Transaction']
Errors? [nan 'Technical Glitch' 'Insufficient Balance' 'Bad PIN'
 'Bad PIN,Insufficient Balance' 'Bad Expiration'
 'Bad PIN,Technical Glitch' 'Bad Card Number' 'Bad CVV' 'Bad Zipcode'
 'Insufficient Balance,Technical Glitch'
 'Bad Card Number,Insufficient Balance' 'Bad Card Number,Bad CVV'
 'Bad CVV,Insufficient Balance' 'Bad Card Number,Bad Expiration'
 'Bad Expiration,Bad CVV' 'Bad Expiration,Insufficient Balance'
 'Bad Expiration,Technical Glitch'
 'Bad Card Number,Bad Expiration,Technical Glitch'
 'Bad CVV,Technical Glitch' 'Bad Card Number,Technical Glitch'
 'Bad Zipcode,Insufficient Balance' 'Bad Zipcode,Technical Glitch'
 'Bad Card Number,Bad Expiration,Insufficient Balance']


<div class="alert alert-block alert-info">

#### Notes/Observations
- Columns **`Card`**, **`Month`**, and **`User Chip`**, are okay
- Column **`Errors?`** has missing values but this could mean there were no errors for those entries.
    - No further action is needed regarding missing values.
    - We'll encode the error messages later.

In [11]:
# transactions['Errors'] = transactions['Errors?'].apply(lambda x: 'None' if type(x)==float else x)
# transactions.drop(columns='Errors?', inplace=True)
# transactions.head()

- [ ] User
- [x] Card
- [ ] Year
- [x] Month
- [ ] Day
- [ ] Time
- [ ] Amount
- [x] Use Chip
- [ ] Merchant Name
- [ ] Merchant City
- [ ] Merchant State
- [ ] Zip
- [ ] MCC
- [x] Errors?

In [12]:
# Column "User"
assert sum([i>=0 and i<=2000 for i in X['User'].unique()]) == users.shape[0]
# Okay. No further action needed

In [13]:
# Column "Year"
X['Year'].unique()
# Okay. No further action needed

array([2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 1999, 2000, 2001,
       1998, 1996, 1997, 1995, 1994, 1991, 1992, 1993], dtype=int64)

In [14]:
# Column "Day"
X['Day'].unique()
# Okay. No further action needed

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
      dtype=int64)

In [15]:
# Column "Time"
assert sum([type(i)==str for i in X['Time'].unique()]) == len(X['Time'].unique())
# Okay. No further action needed

In [16]:
# Column "Amount"
assert sum([type(i)==str for i in X['Amount'].unique()]) == len(X['Amount'].unique())
# Okay. No further action needed

<div class="alert alert-block alert-info">
    
#### Notes/Observations
- We'll later convert entries in this column **`Amount`** from string into datetime elements.

- [x] User
- [x] Card
- [x] Year
- [x] Month
- [x] Day
- [x] Time
- [x] Amount
- [x] Use Chip
- [ ] Merchant Name
- [ ] Merchant City
- [ ] Merchant State
- [ ] Zip
- [ ] MCC
- [x] Errors?

In [17]:
# Column "Merchant Name"
assert len(X['Merchant Name'].unique()) == sum([type(i)==np.int64 for i in X['Merchant Name'].unique()])
# Okay. No further action needed

In [18]:
# Column "Merchant City"
assert sum([type(i)==str for i in X['Merchant City'].unique()]) == len(X['Merchant City'].unique())
# Okay. No further action needed

In [19]:
# Column "Merchant State"
# assert len(X['Merchant State'].unique()) == sum([type(i)==str for i in X['Merchant State'].unique()])
len(X['Merchant State'].unique()) == sum([type(i)==str for i in X['Merchant State'].unique()])

False

- There seems to be missing values in column **`Merchant State`**. Let's investigate

In [20]:
len(X['Merchant State'][[type(i)!=str for i in X['Merchant State']]])

2720821

- There are 2,720,821 missing state names.
- We fill them in by matching with city names.

In [21]:
X['Merchant City'][[type(i)!=str for i in X['Merchant State']]].unique()

array(['ONLINE'], dtype=object)

- Upon further checking, we realize that all orders that are missing State name are online orders.
- Let's just fill them in with "ONLINE"

In [22]:
X['Merchant_State'] = X['Merchant State'].apply(lambda x: 'ONLINE' if type(x)!=str else x)
X.drop(columns='Merchant State', inplace=True)
assert len(X['Merchant_State'].unique()) == sum([type(i)==str for i in X['Merchant_State'].unique()])
# All good now.

- [x] User
- [x] Card
- [x] Year
- [x] Month
- [x] Day
- [x] Time
- [x] Amount
- [x] Use Chip
- [x] Merchant Name
- [x] Merchant City
- [x] Merchant State
- [ ] Zip
- [ ] MCC
- [x] Errors?

In [23]:
# Column "Zip"
assert len(X['Zip'].unique()) == sum([type(i)==np.float64 for i in X['Zip'].unique()])
# Okay. No further action needed

In [24]:
# Column "MMC"
assert len(X['MCC'].unique()) == sum([type(i)==np.int64 for i in X['MCC'].unique()])
# Okay. No further action needed

In [25]:
# Checking the y variable.
y.unique()
# Okay. No further action needed

array(['No', 'Yes'], dtype=object)

- [x] User
- [x] Card
- [x] Year
- [x] Month
- [x] Day
- [x] Time
- [x] Amount
- [x] Use Chip
- [x] Merchant Name
- [x] Merchant City
- [x] Merchant State
- [x] Zip
- [x] MCC
- [x] Errors?

#### 5.2 Encoding nominal variables

In [26]:
le = LabelEncoder()

In [27]:
# y variable
y_fit = le.fit_transform(y)
print({y_fit[i]:y[i] for i in range(len(y))})
y = y_fit.copy()

{0: 'No', 1: 'Yes'}


- No : 0
- Yes : 1

In [28]:
# Column "Use Chip"
use_chip = le.fit_transform(X['Use Chip'])
print({use_chip[i]:X['Use Chip'][i] for i in range(X.shape[0])})
X['Use_Chip'] = use_chip
X.drop(columns='Use Chip', inplace=True)
X.head()

{2: 'Swipe Transaction', 1: 'Online Transaction', 0: 'Chip Transaction'}


Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Merchant Name,Merchant City,Zip,MCC,Errors?,Merchant_State,Use_Chip
0,0,0,2002,9,1,06:21,$134.09,3527213246127876953,La Verne,91750.0,5300,,CA,2
1,0,0,2002,9,1,06:42,$38.48,-727612092139916043,Monterey Park,91754.0,5411,,CA,2
2,0,0,2002,9,2,06:22,$120.34,-727612092139916043,Monterey Park,91754.0,5411,,CA,2
3,0,0,2002,9,2,17:45,$128.95,3414527459579106770,Monterey Park,91754.0,5651,,CA,2
4,0,0,2002,9,3,06:23,$104.71,5817218446178736267,La Verne,91750.0,5912,,CA,2


In [29]:
# Column "Errors"
errors = ','.join(X['Errors?'].unique()[1:])
errors = sorted(list(set(errors.split(',')))) # List of all error messages, exclusing nan
X['Error_Code'] = X['Errors?'].apply(encode_error, errors_list=errors)
print(X[['Error_Code', 'Errors?']].drop_duplicates().sort_values(by='Error_Code').reset_index().drop(columns='index'))
X.drop(columns = 'Errors?', inplace=True)
X.head()

    Error_Code                                            Errors?
0            0                                                NaN
1            1                                            Bad CVV
2           10                                    Bad Card Number
3           11                            Bad Card Number,Bad CVV
4          100                                     Bad Expiration
5          101                             Bad Expiration,Bad CVV
6          110                     Bad Card Number,Bad Expiration
7         1000                                            Bad PIN
8        10000                                        Bad Zipcode
9       100000                               Insufficient Balance
10      100001                       Bad CVV,Insufficient Balance
11      100010               Bad Card Number,Insufficient Balance
12      100100                Bad Expiration,Insufficient Balance
13      100110  Bad Card Number,Bad Expiration,Insufficient Ba...
14      10

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Merchant Name,Merchant City,Zip,MCC,Merchant_State,Use_Chip,Error_Code
0,0,0,2002,9,1,06:21,$134.09,3527213246127876953,La Verne,91750.0,5300,CA,2,0
1,0,0,2002,9,1,06:42,$38.48,-727612092139916043,Monterey Park,91754.0,5411,CA,2,0
2,0,0,2002,9,2,06:22,$120.34,-727612092139916043,Monterey Park,91754.0,5411,CA,2,0
3,0,0,2002,9,2,17:45,$128.95,3414527459579106770,Monterey Park,91754.0,5651,CA,2,0
4,0,0,2002,9,3,06:23,$104.71,5817218446178736267,La Verne,91750.0,5912,CA,2,0


#### 5.3 Time conversion

In [30]:
'''

# Confirming that all recorded times have length 5, i.e., they're stored in the format "HH:MM".
assert sum([len(i)==5 for i in X['Time'].unique()]) == len(X['Time'].unique())

# The conversion
X['Time_1'] = X['Time'].apply(time.fromisoformat)
X.drop(columns='Time', inplace=True)
X.rename(columns={'Time_1':'Time'}, inplace=True)
X.head()

'''

'\n\n# Confirming that all recorded times have length 5, i.e., they\'re stored in the format "HH:MM".\nassert sum([len(i)==5 for i in X[\'Time\'].unique()]) == len(X[\'Time\'].unique())\n\n# The conversion\nX[\'Time_1\'] = X[\'Time\'].apply(time.fromisoformat)\nX.drop(columns=\'Time\', inplace=True)\nX.rename(columns={\'Time_1\':\'Time\'}, inplace=True)\nX.head()\n\n'

In [31]:
# Confirming that all recorded times have length 5, i.e., they're stored in the format "HH:MM".
assert sum([len(i)==5 for i in X['Time'].unique()]) == len(X['Time'].unique())

# The conversion
X['Time_min'] = X['Time'].apply(time_to_min)
X.drop(columns='Time', inplace=True)
X.head()

Unnamed: 0,User,Card,Year,Month,Day,Amount,Merchant Name,Merchant City,Zip,MCC,Merchant_State,Use_Chip,Error_Code,Time_min
0,0,0,2002,9,1,$134.09,3527213246127876953,La Verne,91750.0,5300,CA,2,0,381
1,0,0,2002,9,1,$38.48,-727612092139916043,Monterey Park,91754.0,5411,CA,2,0,402
2,0,0,2002,9,2,$120.34,-727612092139916043,Monterey Park,91754.0,5411,CA,2,0,382
3,0,0,2002,9,2,$128.95,3414527459579106770,Monterey Park,91754.0,5651,CA,2,0,1065
4,0,0,2002,9,3,$104.71,5817218446178736267,La Verne,91750.0,5912,CA,2,0,383


#### 5.4 Currency format conversion

In [32]:
X['Amount_$'] = X['Amount'].apply(strip_dollar_sign)
X.drop(columns='Amount', inplace=True)
X.head()

Unnamed: 0,User,Card,Year,Month,Day,Merchant Name,Merchant City,Zip,MCC,Merchant_State,Use_Chip,Error_Code,Time_min,Amount_$
0,0,0,2002,9,1,3527213246127876953,La Verne,91750.0,5300,CA,2,0,381,134.09
1,0,0,2002,9,1,-727612092139916043,Monterey Park,91754.0,5411,CA,2,0,402,38.48
2,0,0,2002,9,2,-727612092139916043,Monterey Park,91754.0,5411,CA,2,0,382,120.34
3,0,0,2002,9,2,3414527459579106770,Monterey Park,91754.0,5651,CA,2,0,1065,128.95
4,0,0,2002,9,3,5817218446178736267,La Verne,91750.0,5912,CA,2,0,383,104.71


#### 5.5 Add some more variables that I think can contribute to the probability of having a fraudelent transaction
- Whether card has chip. (**`Has Chip`** from `cards` dataset)
- The number of cards that have ever been issued under the same card number. (**`Cards Issued`** from `cards` dataset)
- Whether the card is on the dark web (**`Card on Dark Web`** from `cards` dataset)
- Whether the transaction is taking place out outside the owner's state of residence. (compare **`State`** from `users` dataset that in the transactions dataset).

In [33]:
cards.head()

Unnamed: 0,User,CARD INDEX,Card Brand,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,1,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,0,2,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,0,3,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,0,4,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No


In [34]:
print('Primary Key for CARDS dataset:')
[col for col in cards.columns if len(cards[col].unique())==cards.shape[0]]

Primary Key for CARDS dataset:


['Card Number']

In [35]:
users.head()

Unnamed: 0,Person,Current Age,Retirement Age,Birth Year,Birth Month,Gender,Address,Apartment,City,State,Zipcode,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards
0,Hazel Robinson,53,66,1966,11,Female,462 Rose Lane,,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
1,Sasha Sadr,53,68,1966,12,Female,3606 Federal Boulevard,,Little Neck,NY,11363,40.76,-73.74,$37891,$77254,$191349,701,5
2,Saanvi Lee,81,67,1938,11,Female,766 Third Drive,,West Covina,CA,91792,34.02,-117.89,$22681,$33483,$196,698,5
3,Everlee Clark,63,63,1957,1,Female,3 Madison Street,,New York,NY,10069,40.71,-73.99,$163145,$249925,$202328,722,4
4,Kyle Peterson,43,70,1976,9,Male,9620 Valley Stream Drive,,San Francisco,CA,94117,37.76,-122.44,$53797,$109687,$183855,675,1


In [36]:
print('Primary Key for USERS dataset:')
[col for col in users.columns if len(users[col].unique())==users.shape[0]]

Primary Key for USERS dataset:


[]

- Just realized it's not gonna be possible to add the extra variables because there are no primary keys in `cards` and `users` that match a column in the transaction data.
- Proceed to trim `X` so as to have only the required features.

In [37]:
X.drop(columns=['Merchant City', 'Merchant_State'], inplace=True)
X.head()

Unnamed: 0,User,Card,Year,Month,Day,Merchant Name,Zip,MCC,Use_Chip,Error_Code,Time_min,Amount_$
0,0,0,2002,9,1,3527213246127876953,91750.0,5300,2,0,381,134.09
1,0,0,2002,9,1,-727612092139916043,91754.0,5411,2,0,402,38.48
2,0,0,2002,9,2,-727612092139916043,91754.0,5411,2,0,382,120.34
3,0,0,2002,9,2,3414527459579106770,91754.0,5651,2,0,1065,128.95
4,0,0,2002,9,3,5817218446178736267,91750.0,5912,2,0,383,104.71


### 6. Train-test split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(17070830, 12) (7316070, 12) (17070830,) (7316070,)


### 7. Handling imbalanced classes

In [39]:
print(f'Fraudulent :{sum(y==1)/len(y):.2%} \nNot Fraudulent :{sum(y==0)/len(y):.2%}')

Fraudulent :0.12% 
Not Fraudulent :99.88%


In [40]:
print(f'Train Set:\nFraudulent:{sum(y_train==1):,}\nNot Fraudulent:{sum(y_train==0):,}')

Train Set:
Fraudulent:20,848
Not Fraudulent:17,049,982


<div class="alert alert-block alert-info">

### Notes
- Sample about 21,000 of non-fraudulent transactions and combine these with the 20,848 fraudulent ones.
- Repeat 5 times with different non-fraudulent samples, but combined with the same fraudulent set.
- Select the best performing model tested against the full training and test sets.

In [41]:
# Get the sample row numbers/indeces
X_fraud = X_train[y_train==1]
X_non_fraud = X_train[y_train==0]
X_indices = X_non_fraud.index.tolist()
for i in range(5):
    seed(i+1)
    globals()['sample%s'%(i+1)] = sample(X_indices, k=21000)

sample1

[10094879,
 9721988,
 2028951,
 14948479,
 11001830,
 4536676,
 3255707,
 21543611,
 806416,
 4334374,
 4961889,
 9387124,
 17496705,
 4926503,
 12949129,
 17534719,
 12252014,
 4546370,
 8174456,
 4956772,
 11210244,
 16105594,
 8376381,
 15165518,
 16011068,
 9826806,
 21849483,
 9023451,
 3993737,
 18714993,
 624824,
 11802501,
 4112213,
 2954387,
 13725450,
 8970329,
 21457117,
 24085573,
 3180991,
 16393358,
 21794956,
 21899135,
 23958775,
 12146324,
 19530966,
 8280028,
 15744645,
 6028625,
 10822242,
 16462360,
 6503700,
 16733087,
 3952891,
 14050984,
 4059682,
 16270559,
 20371686,
 3871735,
 11146614,
 12095797,
 21048398,
 11771868,
 9995240,
 23328843,
 4266943,
 6334828,
 1299878,
 19587871,
 10441624,
 6807453,
 22443005,
 10710448,
 15532439,
 19405084,
 19858411,
 13806295,
 13900002,
 17426396,
 9076247,
 10218299,
 11337652,
 5996128,
 10795086,
 16152408,
 18043526,
 18552543,
 10452699,
 13110007,
 9364905,
 17851191,
 13344356,
 21283990,
 11715711,
 18984783,
 44

In [42]:
# Get the non-fraud samples
for i in range(5):
    globals()['X_non_fraud_%s'%(i+1)] = X_train.loc[globals()['sample'+str(i+1)]]

X_non_fraud_1.head()

Unnamed: 0,User,Card,Year,Month,Day,Merchant Name,Zip,MCC,Use_Chip,Error_Code,Time_min,Amount_$
10094879,849,3,2006,5,24,6061375928776353564,91913.0,4121,2,0,1110,41.1
9721988,820,0,2015,5,20,-4282466774399734331,49548.0,4829,0,0,480,40.0
2028951,166,1,2013,8,9,1799189980464955940,27819.0,5499,2,0,561,37.23
14948479,1225,1,2011,7,22,3812161635890104827,33193.0,4111,2,0,470,142.97
11001830,912,0,2016,12,22,6666504894937430109,20724.0,5499,0,0,1279,1.33


In [43]:
# Combine with X-fraud
for i in range(5):
    globals()['X_train_'+str(i+1)] = pd.concat([globals()['X_non_fraud_'+str(i+1)], X_fraud])

X_train_1.head()

Unnamed: 0,User,Card,Year,Month,Day,Merchant Name,Zip,MCC,Use_Chip,Error_Code,Time_min,Amount_$
10094879,849,3,2006,5,24,6061375928776353564,91913.0,4121,2,0,1110,41.1
9721988,820,0,2015,5,20,-4282466774399734331,49548.0,4829,0,0,480,40.0
2028951,166,1,2013,8,9,1799189980464955940,27819.0,5499,2,0,561,37.23
14948479,1225,1,2011,7,22,3812161635890104827,33193.0,4111,2,0,470,142.97
11001830,912,0,2016,12,22,6666504894937430109,20724.0,5499,0,0,1279,1.33


In [44]:
# Get the corresponding y_train sets
for i in range(5):
    globals()['y_train_'+str(i+1)] = y[globals()['X_train_'+str(i+1)].index]

y_train_1

array([0, 0, 0, ..., 1, 1, 1])

<div class="alert alert-block alert-success">

### ALL DONE WITH DATA PRE-PROCESSING YEEYYY !!!

### 8. Model training and evaluation

In [45]:
rfc = RandomForestClassifier()
model1 = rfc.fit(X_train_1, y_train_1)
print(f'Train Score:{model1.score(X_train_1, y_train_1):.2%} \nFull Train Score:{model1.score(X_train, y_train):.2%} \nTest Score:{model1.score(X_test, y_test):.2%}')

Train Score:100.00% 
Full Train Score:96.26% 
Test Score:96.26%


In [46]:
model2 = rfc.fit(X_train_2, y_train_2)
print(f'Train Score:{model2.score(X_train_2, y_train_2):.2%} \nFull Train Score:{model2.score(X_train, y_train):.2%} \nTest Score:{model2.score(X_test, y_test):.2%}')

Train Score:100.00% 
Full Train Score:96.31% 
Test Score:96.30%


In [47]:
model3 = rfc.fit(X_train_3, y_train_3)
print(f'Train Score:{model3.score(X_train_3, y_train_3):.2%} \nFull Train Score:{model3.score(X_train, y_train):.2%} \nTest Score:{model3.score(X_test, y_test):.2%}')

Train Score:100.00% 
Full Train Score:96.34% 
Test Score:96.33%


In [48]:
model4 = rfc.fit(X_train_4, y_train_4)
print(f'Train Score:{model4.score(X_train_4, y_train_4):.2%} \nFull Train Score:{model4.score(X_train, y_train):.2%} \nTest Score:{model4.score(X_test, y_test):.2%}')

Train Score:100.00% 
Full Train Score:96.29% 
Test Score:96.28%


In [49]:
model5 = rfc.fit(X_train_5, y_train_5)
print(f'Train Score:{model5.score(X_train_5, y_train_5):.2%} \nFull Train Score:{model5.score(X_train, y_train):.2%} \nTest Score:{model5.score(X_test, y_test):.2%}')

Train Score:100.00% 
Full Train Score:96.44% 
Test Score:96.43%


### Conclusion
- Use model5