In [1]:
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import pickle

In [2]:
df_train = pd.read_csv("fraudTrain.csv")

df_test = pd.read_csv("fraudTest.csv")

In [3]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [5]:
df_train.shape, df_test.shape

((1296675, 23), (555719, 23))

In [6]:
df = pd.concat([df_train, df_test],ignore_index=True)

In [7]:
df.shape

(1852394, 23)

In [8]:
df.is_fraud.value_counts()

is_fraud
0    1842743
1       9651
Name: count, dtype: int64

In [9]:
def clean_df(df):
    return df.drop(['cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time'],axis=1)

df = clean_df(df)

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0
1,1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0
2,2,fraud_Lind-Buckridge,entertainment,220.11,M,42.1808,-112.262,4154,Nature conservation officer,1325376051,43.150704,-112.154481,0
3,3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,46.2306,-112.1138,1939,Patent attorney,1325376076,47.034331,-112.561071,0
4,4,fraud_Keeling-Crist,misc_pos,41.96,M,38.4207,-79.4629,99,Dance movement psychotherapist,1325376186,38.674999,-78.632459,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 13 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   merchant    object 
 2   category    object 
 3   amt         float64
 4   gender      object 
 5   lat         float64
 6   long        float64
 7   city_pop    int64  
 8   job         object 
 9   unix_time   int64  
 10  merch_lat   float64
 11  merch_long  float64
 12  is_fraud    int64  
dtypes: float64(5), int64(4), object(4)
memory usage: 183.7+ MB


In [12]:
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((1481915, 13), (370479, 13))

In [13]:
def encode(df):
    df_obj = df.select_dtypes(include=['object'])
    encoders = {}
    for col in df_obj.columns:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        encoders[col] = encoder
    with open('LE_mdl_v1.pkl', 'wb') as f:
        pickle.dump(encoders, f)
    return df

train = encode(train)

In [14]:
train.head()

Unnamed: 0.1,Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,1273644,29,4,166.8,1,39.3426,-114.8859,450,254,1371108903,40.088507,-113.895268,0
1,601398,536,11,28.86,0,34.3795,-118.523,34882,219,1347476946,35.356925,-119.348148,0
2,999645,153,2,37.93,0,40.3207,-110.436,302,406,1360819865,40.422976,-110.786285,0
3,1180310,677,9,18.7,1,41.2244,-86.6966,5791,134,1367973195,40.254936,-85.751919,0
4,213847,688,8,33.54,0,31.929,-97.6443,2526,342,1378348271,32.397579,-97.395488,0


In [17]:
x = train.drop(columns=['is_fraud'])
y = train['is_fraud']

In [18]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=42)

In [19]:
model = RandomForestClassifier()

In [20]:
def model_train(model, x_train, y_train, x_test, y_test):
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    print('Accuracy Score: ',accuracy_score(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    with open(str(model)[:3] + '_mdl.pkl', 'wb') as f:
        pickle.dump(model,f)

In [66]:
x_val

Unnamed: 0.1,Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long
1165976,388942,364,6,73.91,1,44.2378,-95.2739,1507,261,1384969991,43.682717,-95.424644
581187,199224,142,8,7.11,1,34.6689,-86.2296,3395,140,1334274747,35.361077,-86.669313
645131,366327,396,7,5.96,1,33.9778,-86.5598,3996,17,1340056180,34.315110,-87.039543
822699,183883,540,5,5.58,0,30.5920,-97.2893,1766,351,1377369405,29.641292,-97.759870
348376,322884,575,12,3.63,0,38.8089,-78.7776,863,169,1382445381,39.238935,-79.092097
...,...,...,...,...,...,...,...,...,...,...,...,...
715219,188696,493,5,17.69,0,41.6414,-96.5232,1745,155,1333830823,42.338071,-95.805159
381599,150207,82,9,226.58,0,33.6150,-117.7114,45303,405,1376307363,33.831666,-117.180153
475246,458409,498,2,70.99,0,36.0244,-90.9288,7155,162,1386672295,36.204443,-91.747666
10365,114251,395,4,104.44,1,31.0000,-82.8555,419,11,1375258360,30.340172,-81.877640


In [21]:
model_train(model, x_train, y_train, x_val, y_val)

Accuracy Score:  0.9977529075554266
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    294805
           1       0.88      0.67      0.76      1578

    accuracy                           1.00    296383
   macro avg       0.94      0.83      0.88    296383
weighted avg       1.00      1.00      1.00    296383

