In [23]:
#Madhavan M
#Sriram V
#Sid
#Auston

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix

# Load the data
df = pd.read_csv('/kaggle/input/datathon/DATATHON_EVENT_DATASET.csv')

# Fill missing values (if any)
df.fillna(method='ffill', inplace=True)

# Convert 'Yes'/'No' in 'Fraud' column to 1 and 0
df['Fraud'] = df['Fraud'].map({'Yes': 1, 'No': 0})

# Define feature columns and target
X = df.drop(columns=['Fraud'])  # Features (drop the target column)
y = df['Fraud']  # Target

# Label encode columns with high cardinality (like Origin_ID and Destination_ID)
label_encoder = LabelEncoder()

X['Origin_ID'] = label_encoder.fit_transform(X['Origin_ID'])
X['Destination_ID'] = label_encoder.fit_transform(X['Destination_ID'])

# Categorical columns with few unique values, we can apply One-Hot Encoding (e.g., Transaction_Type, Expected_Fraud)
categorical_cols = ['Transaction_Type', 'Expected_Fraud']

# One-Hot Encoding for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols)],
    remainder='passthrough'
)

# Create pipeline with preprocessing and classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),  # Standardize numerical features
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the pipeline model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the results
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'F1-Score: {f1}')
print('Confusion Matrix:')
print(cm)




  df.fillna(method='ffill', inplace=True)


Accuracy: 98.91%
F1-Score: 0.9604976671850699
Confusion Matrix:
[[9959   41]
 [  86 1544]]


In [24]:
# Team Members:
# Madhavan M
# Sriram V
# Sid
# Auston

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import numpy as np

# ---------------------- PART 1: Training and Saving the Model ---------------------- #

# Load the data
df = pd.read_csv('/kaggle/input/datathon/DATATHON_EVENT_DATASET.csv')

# Fill missing values (if any)
df.fillna(method='ffill', inplace=True)

# Convert 'Yes'/'No' in 'Fraud' column to 1 and 0
df['Fraud'] = df['Fraud'].map({'Yes': 1, 'No': 0})

# Define feature columns and target
X = df.drop(columns=['Fraud'])  # Features (drop the target column)
y = df['Fraud']  # Target

# Label encode columns with high cardinality (like Origin_ID and Destination_ID)
label_encoder = LabelEncoder()

X['Origin_ID'] = label_encoder.fit_transform(X['Origin_ID'])
X['Destination_ID'] = label_encoder.fit_transform(X['Destination_ID'])

# Categorical columns with few unique values, we can apply One-Hot Encoding (e.g., Transaction_Type, Expected_Fraud)
categorical_cols = ['Transaction_Type', 'Expected_Fraud']

# One-Hot Encoding for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols)],
    remainder='passthrough'
)

# Create pipeline with preprocessing and classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),  # Standardize numerical features
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the pipeline model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Calculate F1 Score, Confusion Matrix, and Accuracy
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the results
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'F1-Score: {f1}')
print('Confusion Matrix:')
print(cm)

# Save the entire pipeline (preprocessor + model)
joblib.dump(pipeline, 'fraud_detection_pipeline.pkl')
print("Model saved successfully.")

# ---------------------- PART 2: Loading the Model and Classifying New Data ---------------------- #

import pandas as pd
import joblib
import numpy as np

# Load the saved pipeline
pipeline = joblib.load('fraud_detection_pipeline.pkl')
print("Model loaded successfully.")

# Function to handle unseen labels in the custom data
def label_encode_custom(label_encoder, series):
    existing_classes = label_encoder.classes_
    new_classes = series[~series.isin(existing_classes)]

    # Add new classes to the LabelEncoder
    if len(new_classes) > 0:
        label_encoder.classes_ = np.append(existing_classes, new_classes)
    
    return label_encoder.transform(series)

# Sample data for a new transaction (make sure it matches the structure of the training data)
data = {
    'Time': [123],  # the time in real-world units
    'Transaction_Type': ['TRANSFER'],  # type of transaction
    'Amount': [5000.00],  # transaction amount
    'Origin_ID': ['C123456789'],  # customer who initiated the transaction
    'Initial_Origin_Balance': [10000.00],  # balance before transaction
    'Final_Origin_Balance': [5000.00],  # balance after transaction
    'Destination_ID': ['C987654321'],  # recipient of the transaction
    'Initial_Destination_Balance': [2000.00],  # recipient's balance before transaction
    'Final_Destination_Balance': [7000.00],  # recipient's balance after transaction
    'Expected_Fraud': ['No']  # if the transaction is expected to be fraudulent based on business rules
}

# Convert dictionary to DataFrame
new_transaction = pd.DataFrame(data)

# Apply the same preprocessing as done during model training
# Label Encoding for IDs, handling unseen labels
new_transaction['Origin_ID'] = label_encode_custom(label_encoder, new_transaction['Origin_ID'])
new_transaction['Destination_ID'] = label_encode_custom(label_encoder, new_transaction['Destination_ID'])

# Ensure that the columns of the new transaction match the original training data structure
new_transaction = new_transaction.reindex(columns=X_train.columns, fill_value=0)

# One-Hot Encoding for categorical features
# Assuming 'preprocessor' is your preprocessing pipeline that was fitted with your training data
new_transaction_processed = pipeline.named_steps['preprocessor'].transform(new_transaction)

# Predict using your trained pipeline
predicted_class = pipeline.predict(new_transaction_processed)

# Output whether the transaction is legit (0) or fraudulent (1)
if predicted_class[0] == 0:
    print("The transaction is legitimate.")
else:
    print("The transaction is fraudulent.")



  df.fillna(method='ffill', inplace=True)


Accuracy: 98.91%
F1-Score: 0.9604976671850699
Confusion Matrix:
[[9959   41]
 [  86 1544]]
Model saved successfully.
Model loaded successfully.


ValueError: X has 15 features, but ColumnTransformer is expecting 10 features as input.