In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer, fbeta_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [3]:
# Load the dataset
data = pd.read_csv('fraudTrain.csv')

In [4]:
# Display the first few rows of the dataset and the column names
data.head(10)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495.0,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149.0,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154.0,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939.0,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99.0,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0
5,5,2019-01-01 00:04:08,4767265376804500,"fraud_Stroman, Hudson and Erdman",gas_transport,94.63,Jennifer,Conner,F,4655 David Island,...,40.375,-75.2045,2158.0,Transport planner,1961-06-19,189a841a0a8ba03058526bcfe566aab5,1325376000.0,40.653382,-76.152667,0.0
6,6,2019-01-01 00:04:42,30074693890476,fraud_Rowe-Vandervort,grocery_net,44.54,Kelsey,Richards,F,889 Sarah Station Suite 624,...,37.9931,-100.9893,2691.0,Arboriculturist,1993-08-16,83ec1cc84142af6e2acf10c44949e720,1325376000.0,37.162705,-100.15337,0.0
7,7,2019-01-01 00:05:08,6011360759745864,fraud_Corwin-Collins,gas_transport,71.65,Steven,Williams,M,231 Flores Pass Suite 720,...,38.8432,-78.6003,6018.0,"Designer, multimedia",1947-08-21,6d294ed2cc447d2c71c7171a3d54967c,1325376000.0,38.948089,-78.540296,0.0
8,8,2019-01-01 00:05:18,4922710831011201,fraud_Herzog Ltd,misc_pos,4.27,Heather,Chase,F,6888 Hicks Stream Suite 954,...,40.3359,-79.6607,1472.0,Public affairs consultant,1941-03-07,fc28024ce480f8ef21a32d64c93a29f5,1325376000.0,40.351813,-79.958146,0.0
9,9,2019-01-01 00:06:01,2720830304681674,"fraud_Schoen, Kuphal and Nitzsche",grocery_pos,198.39,Melissa,Aguilar,F,21326 Taylor Squares Suite 708,...,36.522,-87.349,151785.0,Pathologist,1974-03-28,3b9014ea8fb80bd65de0b1463b00b00e,1325376000.0,37.179198,-87.485381,0.0


In [5]:
# Clean the column names by stripping whitespace
data.columns = data.columns.str.strip()

In [6]:
# Data preprocessing
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')
data['trans_year'] = data['trans_date_trans_time'].dt.year
data['trans_month'] = data['trans_date_trans_time'].dt.month
data['trans_day'] = data['trans_date_trans_time'].dt.day
data['trans_hour'] = data['trans_date_trans_time'].dt.hour
data['trans_minute'] = data['trans_date_trans_time'].dt.minute
data['trans_second'] = data['trans_date_trans_time'].dt.second

In [7]:
# Display the current columns after feature extraction
print("Current columns after feature extraction:", data.columns.tolist())

Current columns after feature extraction: ['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'trans_year', 'trans_month', 'trans_day', 'trans_hour', 'trans_minute', 'trans_second']


In [8]:
# Drop unnecessary columns
data.drop(['trans_num', 'unix_time', 'first', 'last', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'dob', 'trans_date_trans_time'], axis=1, inplace=True)

In [9]:
# Convert categorical variables to dummy/indicator variables
data = pd.get_dummies(data, columns=['merchant', 'category', 'gender', 'job'], drop_first=True)

In [10]:
# Features and target variable
X = data.drop('is_fraud', axis=1)  # Features
y = data['is_fraud']  # Target variable

In [11]:
# Handle missing values in y
if y.isnull().any():
    print("Missing values found in target variable 'y'. Dropping these rows.")
    # Drop rows with missing target values
    valid_indices = y.notnull()
    X = X[valid_indices]
    y = y[valid_indices]

Missing values found in target variable 'y'. Dropping these rows.


In [12]:
# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

In [13]:
# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [14]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [15]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)

In [17]:
# Decision Tree
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

In [21]:
# Define a custom F2 scorer
f2_scorer = make_scorer(fbeta_score, beta=2)

In [22]:
# Random Forest with Hyperparameter Tuning
rf_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

In [23]:
# Use the custom F2 scorer in GridSearchCV
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, scoring=f2_scorer, cv=3, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

In [24]:
# Get the best Random Forest model
best_rf_model = rf_grid_search.best_estimator_
y_pred_forest = best_rf_model.predict(X_test)

In [25]:
# Evaluate and print classification reports
print("Logistic Regression:")
print(classification_report(y_test, y_pred_logistic))

print("Decision Tree:")
print(classification_report(y_test, y_pred_tree))

print("Random Forest (Best Model):")
print(classification_report(y_test, y_pred_forest))

Logistic Regression:
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      1562
         1.0       0.99      1.00      1.00      1546

    accuracy                           1.00      3108
   macro avg       1.00      1.00      1.00      3108
weighted avg       1.00      1.00      1.00      3108

Decision Tree:
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      1562
         1.0       0.99      1.00      1.00      1546

    accuracy                           1.00      3108
   macro avg       1.00      1.00      1.00      3108
weighted avg       1.00      1.00      1.00      3108

Random Forest (Best Model):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1562
         1.0       1.00      1.00      1.00      1546

    accuracy                           1.00      3108
   macro avg       1.00      1.00      1.00      3108
weighted a