<a href="https://colab.research.google.com/github/Mishra-0709/C-program/blob/cat/Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load the Data

In [1]:
import pandas as pd

# Load the data
df = pd.read_csv('fraudTest.csv')

# Display the first few rows of the dataframe
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Convert 'trans_date_trans_time' and 'dob' to datetime
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Drop unnecessary columns
df.drop(['cc_num', 'first', 'last', 'street', 'city', 'state',
         'zip', 'lat', 'long', 'job', 'trans_num', 'unix_time', 'merch_lat', 'merch_long'], axis=1, inplace=True)

# Check the cleaned dataframe
print(df.head())

   Unnamed: 0 trans_date_trans_time        cc_num  \
0           0   2020-06-21 12:14:25  2.291164e+15   
1           1   2020-06-21 12:14:33  3.573030e+15   
2           2   2020-06-21 12:14:53  3.598215e+15   
3           3   2020-06-21 12:15:15  3.591920e+15   
4           4   2020-06-21 12:15:17  3.526826e+15   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F             3638 Marsh Union  ...  40.3207 -110.4360   
2     Lopez

Feature Engineering

In [2]:
# Extract features from datetime columns
df['trans_year'] = df['trans_date_trans_time'].dt.year
df['trans_month'] = df['trans_date_trans_time'].dt.month
df['trans_day'] = df['trans_date_trans_time'].dt.day
df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year

# Drop the processed datetime columns
df.drop(['trans_date_trans_time', 'dob'], axis=1, inplace=True)

# Handle missing values for 'age'
df['age'].fillna(df['age'].median(), inplace=True)

# Check the dataframe after feature engineering
print(df.head())

   Unnamed: 0                              merchant        category    amt  \
0           0                 fraud_Kirlin and Sons   personal_care   2.86   
1           1                  fraud_Sporer-Keebler   personal_care  29.84   
2           2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28   
3           3                     fraud_Haley Group        misc_pos  60.05   
4           4                 fraud_Johnston-Casper          travel   3.19   

  gender  city_pop  is_fraud  trans_year  trans_month  trans_day  age  
0      M    333497         0        2020            6         21   52  
1      F       302         0        2020            6         21   30  
2      F     34496         0        2020            6         21   50  
3      M     54767         0        2020            6         21   33  
4      M      1126         0        2020            6         21   65  


Split the Data

In [3]:
from sklearn.model_selection import train_test_split

# Features and target variable
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Check the shapes of the datasets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(45874, 10) (19661, 10) (45874,) (19661,)


Model Building

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Preprocessing for numerical and categorical features
numeric_features = ['amt', 'city_pop', 'trans_year', 'trans_month', 'trans_day', 'age']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['merchant', 'category', 'gender']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Logistic Regression Model
clf_lr = Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', LogisticRegression(max_iter=1000))])

# Decision Tree Model
clf_dt = Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', DecisionTreeClassifier(random_state=42))])

# Random Forest Model
clf_rf = Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', RandomForestClassifier(random_state=42))])

Model Evaluation

In [7]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Train and evaluate models
models = {'Logistic Regression': clf_lr, 'Decision Tree': clf_dt, 'Random Forest': clf_rf}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("\n")

# Assuming Random Forest performed the best
best_model = clf_rf
y_pred_best = best_model.predict(X_test)

# Confusion Matrix for the best model
conf_matrix = confusion_matrix(y_test, y_pred_best)

# Plotting the confusion matrix as a heatmap
plt.figure(figsize=(10, 7))

Model: Logistic Regression
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19565
           1       0.00      0.00      0.00        96

    accuracy                           0.99     19661
   macro avg       0.50      0.50      0.50     19661
weighted avg       0.99      0.99      0.99     19661

[[19550    15]
 [   96     0]]


Model: Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19565
           1       0.54      0.46      0.50        96

    accuracy                           1.00     19661
   macro avg       0.77      0.73      0.75     19661
weighted avg       1.00      1.00      1.00     19661

[[19528    37]
 [   52    44]]


Model: Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19565
           1       0.94      0.17      0.28        96

    accuracy                           1.00    

<Figure size 1000x700 with 0 Axes>

<Figure size 1000x700 with 0 Axes>