In [2]:

import pandas as pd
import numpy as np


random_seed = 42
np.random.seed(random_seed)


df = pd.read_csv('/content/FastagFraudDetection.csv')


print("First few rows of the dataset:")
print(df.head())


print("\nDataset Info:")
df.info()


print("\nDataset Description:")
print(df.describe())


df = df.drop(columns=['Transaction_ID', 'Timestamp', 'FastagID', 'TollBoothID'])


categorical_columns = ['Vehicle_Type', 'Lane_Type', 'Geographical_Location', 'Vehicle_Plate_Number']
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()


First few rows of the dataset:
   Transaction_ID       Timestamp Vehicle_Type         FastagID TollBoothID  \
0               1  1/6/2023 11:20         Bus   FTG-001-ABC-121       A-101   
1               2  1/7/2023 14:55          Car  FTG-002-XYZ-451       B-102   
2               3  1/8/2023 18:25   Motorcycle              NaN       D-104   
3               4   1/9/2023 2:05        Truck  FTG-044-LMN-322       C-103   
4               5  1/10/2023 6:35          Van  FTG-505-DEF-652       B-102   

  Lane_Type Vehicle_Dimensions  Transaction_Amount  Amount_paid  \
0   Express              Large                 350          120   
1   Regular              Small                 120          100   
2   Regular              Small                   0            0   
3   Regular              Large                 350          120   
4   Express             Medium                 140          100   

                   Geographical_Location  Vehicle_Speed Vehicle_Plate_Number  \
0  13.05981

In [3]:

df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())


for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])


print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values after handling:
Vehicle_Type             0
Lane_Type                0
Vehicle_Dimensions       0
Transaction_Amount       0
Amount_paid              0
Geographical_Location    0
Vehicle_Speed            0
Vehicle_Plate_Number     0
Fraud_indicator          0
dtype: int64


In [4]:

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score


X = df.drop(columns=['Fraud_indicator'])
y = df['Fraud_indicator']


numerical_transformer = StandardScaler()


categorical_transformer = OneHotEncoder(handle_unknown='ignore')


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

In [5]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

models = {
    'Random Forest': RandomForestClassifier(random_state=random_seed),
    'Gradient Boosting': GradientBoostingClassifier(random_state=random_seed),
    'Support Vector Machine': SVC(random_state=random_seed),
    'Logistic Regression': LogisticRegression(random_state=random_seed),
    'Neural Network': MLPClassifier(random_state=random_seed)
}


model_performance = {}

for name, model in models.items():
    print(f"\n{name}:")


    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label='Fraud')
    recall = recall_score(y_test, y_pred, pos_label='Fraud')
    f1 = f1_score(y_test, y_pred, pos_label='Fraud')

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    model_performance[name] = {
        'model': clf,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }



Random Forest:
Accuracy: 0.98
Precision: 1.0
Recall: 0.9078341013824884
F1 Score: 0.9516908212560387

Classification Report:
              precision    recall  f1-score   support

       Fraud       1.00      0.91      0.95       217
   Not Fraud       0.98      1.00      0.99       783

    accuracy                           0.98      1000
   macro avg       0.99      0.95      0.97      1000
weighted avg       0.98      0.98      0.98      1000


Gradient Boosting:
Accuracy: 0.966
Precision: 1.0
Recall: 0.8433179723502304
F1 Score: 0.915

Classification Report:
              precision    recall  f1-score   support

       Fraud       1.00      0.84      0.92       217
   Not Fraud       0.96      1.00      0.98       783

    accuracy                           0.97      1000
   macro avg       0.98      0.92      0.95      1000
weighted avg       0.97      0.97      0.96      1000


Support Vector Machine:
Accuracy: 0.972
Precision: 1.0
Recall: 0.8709677419354839
F1 Score: 0.9310344

In [6]:

best_model_name = max(model_performance, key=lambda x: model_performance[x]['f1'])
best_model = model_performance[best_model_name]['model']
print(f"\nBest Model: {best_model_name} with F1 Score: {model_performance[best_model_name]['f1']}")




Best Model: Random Forest with F1 Score: 0.9516908212560387


In [7]:

import joblib


joblib.dump(best_model, 'best_model.pkl')


np.save('random_seed.npy', random_seed)



In [8]:

new_data = {
    'Vehicle_Type': 'Car',
    'Lane_Type': 'Normal',
    'Vehicle_Dimensions': 3.0,
    'Transaction_Amount': 150.0,
    'Amount_paid': 150.0,
    'Geographical_Location': 'Location1',
    'Vehicle_Speed': 60.0,
    'Vehicle_Plate_Number': 'ABC1234'
}


new_data_df = pd.DataFrame([new_data])


prediction = best_model.predict(new_data_df)
print(f'\nPrediction for new data using {best_model_name}:')
print('Fraudulent Transaction' if prediction[0] == 'Fraud' else 'Non-Fraudulent Transaction')


Prediction for new data using Random Forest:
Non-Fraudulent Transaction
