In [82]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score 

In [None]:
raw_df = pd.read_csv('../data/raw/fraudTrain.csv')
raw_df.drop(columns=['Unnamed: 0'], inplace=True)
raw_df.head(5)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [None]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop          

In [None]:
raw_df["trans_date_trans_time"] = pd.to_datetime(raw_df["trans_date_trans_time"])
raw_df["dob"] = pd.to_datetime(raw_df["dob"])
raw_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   trans_date_trans_time  1296675 non-null  datetime64[ns]
 1   cc_num                 1296675 non-null  int64         
 2   merchant               1296675 non-null  object        
 3   category               1296675 non-null  object        
 4   amt                    1296675 non-null  float64       
 5   first                  1296675 non-null  object        
 6   last                   1296675 non-null  object        
 7   gender                 1296675 non-null  object        
 8   street                 1296675 non-null  object        
 9   city                   1296675 non-null  object        
 10  state                  1296675 non-null  object        
 11  zip                    1296675 non-null  int64         
 12  lat                    12966

In [None]:
print(f"Row drop: {len(raw_df) - len(raw_df.dropna())}")
raw_df.dropna(ignore_index=True)

duplicate = raw_df.duplicated().sum()
print("Row duplicate:", duplicate)

Row drop: 0
Row duplicate: 0


In [55]:
# Chia datetime thành các đặc trưng
raw_df['trans_year'] = raw_df['trans_date_trans_time'].dt.year
raw_df['trans_month'] = raw_df['trans_date_trans_time'].dt.month
raw_df['trans_day'] = raw_df['trans_date_trans_time'].dt.day
raw_df['trans_hour'] = raw_df['trans_date_trans_time'].dt.hour
raw_df['trans_minute'] = raw_df['trans_date_trans_time'].dt.minute
raw_df['trans_second'] = raw_df['trans_date_trans_time'].dt.second

raw_df['dob_year'] = raw_df['dob'].dt.year
raw_df['dob_month'] = raw_df['dob'].dt.month
raw_df['dob_day'] = raw_df['dob'].dt.day


In [56]:
raw_df['amt'].describe()

count    1.296675e+06
mean     7.035104e+01
std      1.603160e+02
min      1.000000e+00
25%      9.650000e+00
50%      4.752000e+01
75%      8.314000e+01
max      2.894890e+04
Name: amt, dtype: float64

In [57]:
encoder = LabelEncoder()
raw_df["merchant"] = encoder.fit_transform(raw_df["merchant"])
raw_df["category"] = encoder.fit_transform(raw_df["category"])
raw_df["gender"] = encoder.fit_transform(raw_df["gender"])
raw_df["job"] = encoder.fit_transform(raw_df["job"])

In [58]:
raw_df.drop(columns=['cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time'],inplace=True)

In [59]:
X = raw_df.drop(columns=["is_fraud"], inplace = False)
Y = raw_df["is_fraud"]

In [60]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 21 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   merchant      1296675 non-null  int32  
 1   category      1296675 non-null  int32  
 2   amt           1296675 non-null  float64
 3   gender        1296675 non-null  int32  
 4   lat           1296675 non-null  float64
 5   long          1296675 non-null  float64
 6   city_pop      1296675 non-null  int64  
 7   job           1296675 non-null  int32  
 8   unix_time     1296675 non-null  int64  
 9   merch_lat     1296675 non-null  float64
 10  merch_long    1296675 non-null  float64
 11  is_fraud      1296675 non-null  int64  
 12  trans_year    1296675 non-null  int32  
 13  trans_month   1296675 non-null  int32  
 14  trans_day     1296675 non-null  int32  
 15  trans_hour    1296675 non-null  int32  
 16  trans_minute  1296675 non-null  int32  
 17  trans_second  1296675 non-n

In [61]:
model = SVC()
model.fit(X, Y)

In [71]:
test_data = pd.read_csv("../data/raw/fraudTest.csv")

In [73]:
test_data["trans_date_trans_time"] = pd.to_datetime(test_data["trans_date_trans_time"])
test_data["dob"] = pd.to_datetime(test_data["dob"])

In [75]:
# Chia datetime thành các đặc trưng
test_data['trans_year'] = test_data['trans_date_trans_time'].dt.year
test_data['trans_month'] = test_data['trans_date_trans_time'].dt.month
test_data['trans_day'] = test_data['trans_date_trans_time'].dt.day
test_data['trans_hour'] = test_data['trans_date_trans_time'].dt.hour
test_data['trans_minute'] = test_data['trans_date_trans_time'].dt.minute
test_data['trans_second'] = test_data['trans_date_trans_time'].dt.second

test_data['dob_year'] = test_data['dob'].dt.year
test_data['dob_month'] = test_data['dob'].dt.month
test_data['dob_day'] = test_data['dob'].dt.day


In [76]:
test_data.drop(columns=['Unnamed: 0','cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time'],inplace=True)

In [77]:
encoder = LabelEncoder()
test_data["merchant"] = encoder.fit_transform(test_data["merchant"])
test_data["category"] = encoder.fit_transform(test_data["category"])
test_data["gender"] = encoder.fit_transform(test_data["gender"])
test_data["job"] = encoder.fit_transform(test_data["job"])

In [78]:
X_test = test_data.drop(columns=["is_fraud"], inplace = False)
Y_test = test_data["is_fraud"]

In [81]:
y_pred = model.predict(X_test)
y_pred

KeyboardInterrupt: 

In [83]:
accuracy = accuracy_score(test_data['is_fraud'],y_pred)
accuracy

0.9961401355721147