In [1]:
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import pickle

In [2]:
df_train = pd.read_csv("fraudTrain.csv")

df_test = pd.read_csv("fraudTest.csv")

In [3]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
fraudulent_transactions = df_train[df_train['is_fraud'] == 1]
fraudulent_transactions = fraudulent_transactions.drop(['first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time'],axis=1)
print(fraudulent_transactions[1000:1005])

        Unnamed: 0          cc_num                             merchant  \
100507      100507    630425673344     fraud_Rohan, White and Aufderhar   
100517      100517    630425673344  fraud_Bernhard, Grant and Langworth   
100548      100548    630425673344  fraud_Johnston, Nikolaus and Maggio   
100591      100591  30074693890476               fraud_Denesik and Sons   
100635      100635  30074693890476                     fraud_Jewess LLC   

            category     amt gender      lat      long  city_pop  \
100507      misc_net  862.09      M  38.3880  -79.9906       365   
100517  shopping_pos  880.99      M  38.3880  -79.9906       365   
100548   grocery_net   14.69      M  38.3880  -79.9906       365   
100591  shopping_pos  788.90      F  37.9931 -100.9893      2691   
100635  shopping_pos  755.85      F  37.9931 -100.9893      2691   

                           job   unix_time  merch_lat  merch_long  is_fraud  
100507  Special effects artist  1330472643  38.760167  -79.202

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [6]:
df_train.shape, df_test.shape

((1296675, 23), (555719, 23))

In [7]:
df = pd.concat([df_train, df_test],ignore_index=True)

In [8]:
df.shape

(1852394, 23)

In [9]:
df.is_fraud.value_counts()

is_fraud
0    1842743
1       9651
Name: count, dtype: int64

In [12]:
fraud_transactions = df[df['is_fraud'] == 1].sample(n=12000, replace=True)
non_fraud_transactions = df[df['is_fraud'] == 0].sample(n=12000, replace=True)
data = pd.concat([fraud_transactions, non_fraud_transactions])

In [13]:
data.is_fraud.value_counts()

is_fraud
1    12000
0    12000
Name: count, dtype: int64

In [14]:
data.to_csv('balance_data.csv')

In [27]:
def clean_df(df):
    return df.drop(['first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time'],axis=1)

df = clean_df(data)

In [28]:
df.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
52632,52632,4952583804639909,fraud_Wuckert-Walter,grocery_net,13.13,F,41.2639,-80.8164,75903,Magazine features editor,1328062061,41.423442,-81.329235,1
603575,603575,3598215285024754,fraud_Langworth LLC,personal_care,18.38,F,40.6729,-73.5365,34496,"Librarian, public",1347575165,41.058226,-73.124452,1
412130,412130,4467456915153312231,"fraud_Reichert, Rowe and Mraz",shopping_net,980.72,F,28.06,-82.4079,717255,Waste management officer,1341527376,28.162554,-83.33185,1
466302,466302,375848982312810,fraud_Kuphal-Predovic,misc_net,1112.91,M,31.4841,-97.9903,258,Web designer,1343158696,30.54581,-98.564395,1
1065775,1065775,213154573301411,"fraud_Weimann, Kuhic and Beahan",shopping_pos,650.33,M,38.1981,-86.6821,965,Horticultural therapist,1363562425,37.558937,-87.308698,1


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24000 entries, 52632 to 1773244
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  24000 non-null  int64  
 1   cc_num      24000 non-null  int64  
 2   merchant    24000 non-null  object 
 3   category    24000 non-null  object 
 4   amt         24000 non-null  float64
 5   gender      24000 non-null  object 
 6   lat         24000 non-null  float64
 7   long        24000 non-null  float64
 8   city_pop    24000 non-null  int64  
 9   job         24000 non-null  object 
 10  unix_time   24000 non-null  int64  
 11  merch_lat   24000 non-null  float64
 12  merch_long  24000 non-null  float64
 13  is_fraud    24000 non-null  int64  
dtypes: float64(5), int64(5), object(4)
memory usage: 2.7+ MB


In [30]:
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((19200, 14), (4800, 14))

In [31]:
def encode(df):
    df_obj = df.select_dtypes(include=['object'])
    encoders = {}
    for col in df_obj.columns:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        encoders[col] = encoder
    with open('LE_mdl_v1.pkl', 'wb') as f:
        pickle.dump(encoders, f)
    return df

train = encode(train)

In [32]:
train.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,18509,3573386445588324,586,0,614.72,0,45.7205,-98.5534,63,447,1372373713,44.920705,-99.264799,1
1,1020673,4158008602445,236,0,11.12,0,44.5274,-93.0196,1201,44,1361829795,44.768648,-94.01211,0
2,223215,4710826438164847414,380,5,10.19,1,48.8328,-108.3961,192,206,1378652065,49.755405,-108.714337,0
3,805079,3518669219150142,296,10,31.9,0,39.1657,-84.233,31394,232,1354729671,39.951145,-83.574851,0
4,41710,6011149206456997,111,6,88.43,1,39.2667,-77.5101,100,362,1373062301,39.933154,-77.2541,0


In [33]:
x = train.drop(columns=['is_fraud'])
y = train['is_fraud']

In [34]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=42)

In [35]:
model = RandomForestClassifier()

In [36]:
def model_train(model, x_train, y_train, x_test, y_test):
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    print('Accuracy Score: ',accuracy_score(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    with open(str(model)[:3] + '_mdl.pkl', 'wb') as f:
        pickle.dump(model,f)

In [37]:
x_val

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long
3234,64712,180017442990269,94,4,282.12,0,42.6853,-73.8253,151022,129,1328663517,43.211087,-73.696975
9459,280616,675990301623,335,13,9.94,0,41.0767,-74.5982,2456,421,1337379144,40.286563,-75.047746
14698,86148,4497913965512794052,94,4,316.40,1,34.6902,-79.1834,14783,232,1329694660,34.569072,-79.171089
4347,905692,4715741951931168360,217,4,279.46,1,36.6966,-96.7869,471,439,1356686183,36.312266,-95.826075
1512,949292,213173753804333,147,4,106.66,1,41.2244,-86.6966,5791,134,1358236840,41.006179,-87.095555
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3851,831540,344342339068828,470,0,58.89,0,31.3826,-81.4312,5989,449,1355207720,31.572986,-82.205331
12082,203311,30357372465631,102,5,62.78,1,43.6718,-89.1918,1628,84,1334409343,42.902728,-89.832897
1372,163575,375904527651269,336,4,335.02,1,40.4591,-98.5551,463,313,1332885655,40.051279,-99.186630
984,468976,375848982312810,434,8,822.94,1,31.4841,-97.9903,258,494,1343257659,30.551803,-98.761695


In [38]:
model_train(model, x_train, y_train, x_val, y_val)

Accuracy Score:  0.96875
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1918
           1       0.97      0.97      0.97      1922

    accuracy                           0.97      3840
   macro avg       0.97      0.97      0.97      3840
weighted avg       0.97      0.97      0.97      3840

