In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
import hashlib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score,classification_report
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the transactions CSV into a DataFrame. The path is relative, so the
# file has to be in the notebook's working directory.
df=pd.read_csv('fraudTrain.csv')

In [3]:
# Preview the first rows of the raw frame before any transformation.
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
def transform(X):
    """Encode the raw transaction columns as numeric features.

    The input frame is not modified; a new frame is returned.

    Steps, in order:
      * ``trans_date_trans_time`` is parsed and replaced by ``tran_yr``,
        ``tran_m``, ``tran_day``, ``tran_hr``, ``tran_min`` and ``tran_sec``.
      * ``merchant``, ``first``, ``last``, ``street``, ``city`` and ``job``
        are dropped.
      * ``gender`` is mapped to 0 (``'M'``) / 1 (``'F'``).
      * ``state`` is one-hot encoded into ``state_*`` columns.
      * ``category`` is label-encoded into ``category_label_encoded``.
      * ``dob`` is replaced by ``int_dob`` (whole seconds since 1970-01-01).
      * ``trans_num`` is replaced by one count column per character
        (``CountVectorizer(analyzer='char')``).
      * any boolean columns are converted to integers and moved to the end.

    Columns not listed above pass through unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain every column named above.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same index as ``X``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy. The previous version dropped columns from the caller's
    # frame in place, leaving the argument half-transformed after the call.
    X = X.copy()

    # --- transaction timestamp -> numeric parts --------------------------
    # Parsing with pandas replaces the earlier string slicing, which kept only
    # the last digit of the hour and only the first digit of the minute.
    stamp = pd.to_datetime(X['trans_date_trans_time'])
    X = X.drop(columns=['trans_date_trans_time'])
    timestamp_parts = (
        ('tran_yr', stamp.dt.year),
        ('tran_m', stamp.dt.month),
        ('tran_day', stamp.dt.day),
        ('tran_hr', stamp.dt.hour),
        ('tran_min', stamp.dt.minute),
        ('tran_sec', stamp.dt.second),
    )
    for part_name, part_values in timestamp_parts:
        X[part_name] = part_values.astype(int)

    # --- free-text / identifying columns that are not used as features ---
    X = X.drop(columns=['merchant', 'first', 'last', 'street', 'city', 'job'])

    # --- categorical encodings -------------------------------------------
    X['gender'] = X['gender'].map({'M': 0, 'F': 1})

    state_dummies = pd.get_dummies(X['state'], prefix='state')
    X = pd.concat([X.drop(columns=['state']), state_dummies], axis=1)

    label_encoder = LabelEncoder()
    X['category_label_encoded'] = (
        label_encoder.fit_transform(X['category']).astype(int)
    )
    X = X.drop(columns=['category'])

    # --- date of birth -> whole seconds since the Unix epoch -------------
    # Dividing a timedelta by one second gives seconds whatever resolution the
    # parsed datetimes use; the old ``astype('int64') // 10**9`` was only
    # correct for nanosecond-resolution values.
    dob = pd.to_datetime(X['dob'])
    X['int_dob'] = (dob - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    X = X.drop(columns=['dob'])

    # --- transaction id -> per-character counts --------------------------
    # (The SHA-256 of trans_num that used to be computed here was dropped
    # again without ever being used, so it has been removed.)
    vectorizer = CountVectorizer(analyzer='char')
    char_counts = vectorizer.fit_transform(X['trans_num'])
    char_frame = pd.DataFrame(
        char_counts.toarray(),
        columns=vectorizer.get_feature_names_out(),
        # Reuse X's index: a fresh RangeIndex would misalign the rows in the
        # concat below whenever X is not indexed 0..n-1.
        index=X.index,
    )
    X = pd.concat([X.drop(columns=['trans_num']), char_frame], axis=1)

    # --- booleans -> integers (they end up as the last columns) ----------
    bool_names = X.select_dtypes(include=bool).columns
    bool_as_int = X[bool_names].astype(int)
    X = pd.concat([X.drop(columns=bool_names), bool_as_int], axis=1)
    return X

In [6]:
# Replace the raw frame with its transformed version. Because `df` is rebound
# here, the raw data is no longer reachable afterwards; re-running this cell
# without first re-running the load cell fails, since the columns transform()
# expects have already been removed.
df=transform(df)

In [7]:
# List column names, non-null counts and dtypes of the transformed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890000 entries, 0 to 889999
Data columns (total 87 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0              890000 non-null  int64  
 1   cc_num                  890000 non-null  int64  
 2   amt                     890000 non-null  float64
 3   gender                  890000 non-null  int64  
 4   zip                     890000 non-null  int64  
 5   lat                     890000 non-null  float64
 6   long                    890000 non-null  float64
 7   city_pop                890000 non-null  int64  
 8   unix_time               890000 non-null  int64  
 9   merch_lat               890000 non-null  float64
 10  merch_long              890000 non-null  float64
 11  is_fraud                890000 non-null  int64  
 12  tran_yr                 890000 non-null  int32  
 13  tran_m                  890000 non-null  int32  
 14  tran_day            

In [8]:
# Separate the target column from the predictors, then hold out 33% of the rows
# for evaluation. The fixed random_state makes the split reproducible.
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [9]:
# Fit the scaler on the training split only and reuse its statistics for the
# test split. Fitting a second scaler on X_test (as was done before)
# standardises the two splits with different means and variances, so the test
# features are not on the scale the model was trained on.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_Scaled = scaler.transform(X_test)

In [10]:
# Append the training labels as the last column of the scaled feature matrix.
train_arr = np.column_stack((X_train_scaled, np.asarray(y_train)))

In [12]:
# Sanity check: number of rows in the scaled training features.
len(X_train_scaled)

596300

In [13]:
# Sanity check: number of training labels; should equal the feature row count.
len(y_train)

596300

In [21]:
# Total number of rows in the transformed frame (before the train/test split).
len(df)

890000