In [45]:
#Madhavan M
#Sriram V
#siddarth
#auston



#Data preprocessing is done as the ratio of no:yes is 5:1 and saved as cleaned dataset

import pandas as pd

# Load the dataset from CSV
df = pd.read_csv('/kaggle/input/datathon/DATATHON_EVENT_DATASET.csv')

# Drop the first 30,000 rows from the dataframe as it contains no
df_dropped = df.iloc[30000:].reset_index(drop=True)

# Display the shape of the updated dataframe
print("Shape of dataframe after dropping 30,000 rows:", df_dropped.shape)

# Save the cleaned dataframe to a new CSV file
df_dropped.to_csv('cleaned_dataset.csv', index=False)

print("Cleaned dataset saved successfully.")


Shape of dataframe after dropping 30,000 rows: (28150, 11)
Cleaned dataset saved successfully.


In [47]:
# Import libraries
#training and testing the model
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer
import numpy as np

# Load the dataset from cleaned data CSV
df = pd.read_csv('/kaggle/input/cleaned-dataset/cleaned_dataset.csv')

# ---- Data Cleaning ----
# Use SimpleImputer for filling missing values based on the column type
# Filling numerical columns with median and categorical with mode
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute missing values
df[numerical_cols] = SimpleImputer(strategy='median').fit_transform(df[numerical_cols])
df[categorical_cols] = SimpleImputer(strategy='most_frequent').fit_transform(df[categorical_cols])

# ---- Label Conversion ----
# Convert 'Yes'/'No' in 'Fraud' column to 1 and 0
df['Fraud'] = df['Fraud'].map({'Yes': 1, 'No': 0})

# ---- Define Features (X) and Target (y) ----
X = df.drop(columns=['Fraud'])  # Features (drop the target column)
y = df['Fraud']  # Target (fraud label)

# ---- Handling High Cardinality Columns ----
# Label encode high cardinality columns (like Origin_ID and Destination_ID)
label_encoder = LabelEncoder()
X['Origin_ID'] = label_encoder.fit_transform(X['Origin_ID'])
X['Destination_ID'] = label_encoder.fit_transform(X['Destination_ID'])

# ---- One-Hot Encoding for Categorical Columns ----
# One-Hot Encode 'Transaction_Type' and 'Expected_Fraud' columns
categorical_cols = ['Transaction_Type', 'Expected_Fraud']
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(drop='first'), categorical_cols)],  # One-Hot Encoding
    remainder='passthrough'  # Leave other columns as is (numerical ones)
)

# ---- Create the Pipeline ----
# The pipeline includes preprocessing steps (encoding + scaling) and the model (RandomForestClassifier)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing (One-Hot Encoding and pass-through other features)
    ('scaler', StandardScaler()),  # Scale numerical features
    ('classifier', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42))  # Random Forest with parameters
])

# ---- Splitting the Data ----
# Split the data into training and testing sets (60% train, 40% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# ---- Train the Model ----
# Train the model using the training set
pipeline.fit(X_train, y_train)

# ---- Predictions and Evaluation ----
# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate F1 Score, Confusion Matrix, and Accuracy
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the results
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'F1-Score: {f1}')
print('Confusion Matrix:')
print(cm)

# ---- Save the Trained Model ----
# Save the model pipeline using joblib
joblib.dump(pipeline, 'fraud_detection_pipeline.pkl')
print("Model saved successfully.")


Accuracy: 98.13%
F1-Score: 0.967451952882827
Confusion Matrix:
[[7929   71]
 [ 139 3121]]
Model saved successfully.
