In [None]:
import pandas as pd
# Read the synthetic fraud dataset (a CSV in the working directory) into a
# DataFrame, then preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="synthetic_fraud_dataset.csv")
df.head(n=5)

In [None]:
# 1. Count how many transactions are fraud vs. not fraud
fraud_counts = df['Fraud_Label'].value_counts()


In [None]:
# 2. Show the count results
print(fraud_counts)


In [None]:
# 3. Plot it visually as a bar chart
fraud_counts.plot(kind='bar', title='Fraud vs Non-Fraud Transactions')


In [None]:
# Group the data by Device_Type and Fraud_Label, then count each group
device_fraud_counts = df.groupby(['Device_Type', 'Fraud_Label']).size()

# Preview the grouped result
print(device_fraud_counts)


In [None]:
device_fraud_counts = device_fraud_counts.unstack()


In [None]:
device_fraud_counts.plot(kind='bar', stacked=True, title='Fraud vs Non-Fraud by Device Type')


In [None]:
# Count transactions per (Location, Fraud_Label) pair and pivot the label
# level into columns, giving one row per location.
location_fraud_counts = (
    df.groupby(['Location', 'Fraud_Label'])
    .size()
    .unstack()
)

# Draw the per-location counts as a stacked bar chart.
location_fraud_counts.plot.bar(
    stacked=True,
    figsize=(10, 6),
    title='Fraud vs Non-Fraud by Location',
)


In [None]:
# Step 1: Count transactions per (Location, Fraud_Label) and pivot the label
# into columns. fill_value=0 makes a location that has no rows for one of the
# labels show a count of 0 rather than NaN, so its fraud rate stays defined.
location_fraud = df.groupby(['Location', 'Fraud_Label']).size().unstack(fill_value=0)

# Step 2: Rename columns for clarity (0 = Not Fraud, 1 = Fraud). The mapping is
# keyed on the label value, so it does not depend on the column order.
location_fraud = location_fraud.rename(columns={0: 'Not_Fraud', 1: 'Fraud'})

# Step 3: Fraud rate = fraud transactions / all transactions, as a percentage
location_total = location_fraud['Fraud'] + location_fraud['Not_Fraud']
location_fraud['Fraud_Rate_%'] = (location_fraud['Fraud'] / location_total) * 100

# Step 4: Sort by fraud rate, highest first (optional, for insight)
location_fraud_sorted = location_fraud.sort_values(by='Fraud_Rate_%', ascending=False)

# Step 5: Show only the rate column of the sorted table
print(location_fraud_sorted[['Fraud_Rate_%']])


In [None]:
import pandas as pd  # For working with data
from sklearn.model_selection import train_test_split  # To split data


In [None]:
# Features - the input columns the model is allowed to look at
feature_columns = [
    'Transaction_Type',
    'Device_Type',
    'Location',
    'Is_Weekend',
    'Transaction_Amount',
]
X = df[feature_columns]

# Target - the label to predict (0 = Not Fraud, 1 = Fraud)
y = df['Fraud_Label']


In [None]:
# One-hot encode the categorical feature columns into indicator columns.
# drop_first=True omits the first level of each encoded column, because it is
# implied when all of that column's other indicators are 0.
X_encoded = pd.get_dummies(data=X, drop_first=True)


In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the shuffle, and so the split, repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# 80/20 train/test split of the encoded features and the target.
# This cell repeats the split made in an earlier cell. The arguments and seed
# are the same, so it rebinds the four names to an identical partition.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [None]:
# Accuracy: How many total predictions were correct
accuracy = accuracy_score(y_test, y_pred)

# Precision: Of predicted frauds, how many were actually fraud
precision = precision_score(y_test, y_pred)

# Recall: Of all actual frauds, how many were caught
recall = recall_score(y_test, y_pred)

# F1 Score: Balance between precision & recall
f1 = f1_score(y_test, y_pred)

# Confusion Matrix: TP, TN, FP, FN breakdown
cm = confusion_matrix(y_test, y_pred)



In [None]:
y_pred = model.predict(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [None]:
y_pred = model.predict(X_test)


In [None]:
# Score the test-set predictions against the true labels:
#   accuracy  - share of all predictions that are correct
#   precision - of the rows predicted as fraud, the share that really are fraud
#   recall    - of the rows that really are fraud, the share that were caught
#   f1        - harmonic mean of precision and recall
accuracy, precision, recall, f1 = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# Confusion matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)

# Report every metric
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", cm)


In [None]:
!pip install imbalanced-learn


In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampler with a fixed seed so the synthetic rows are reproducible.
smote = SMOTE(random_state=42)

# Resample the training split only. The test split is left untouched so that
# evaluation still runs on real rows.
X_train_balanced, y_train_balanced = smote.fit_resample(X=X_train, y=y_train)


In [None]:
import pandas as pd

# Rows per class in the resampled training target, to confirm the effect of SMOTE.
balanced_class_counts = pd.Series(y_train_balanced).value_counts()
print(balanced_class_counts)


In [None]:
# Re-fit logistic regression, this time on the SMOTE-resampled training data.
# This rebinds `model`, replacing the estimator fitted on the original split.
# fit() returns the estimator itself, so the assignment keeps the fitted model.
model = LogisticRegression(max_iter=1000).fit(X_train_balanced, y_train_balanced)
model


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Step 1: Label the test split with the current `model`
y_pred = model.predict(X_test)

# Step 2: Score the predictions. zero_division=0 makes an undefined metric
# (for example, no row predicted as fraud) evaluate to 0.
undefined_as_zero = {"zero_division": 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **undefined_as_zero)
recall = recall_score(y_test, y_pred, **undefined_as_zero)
f1 = f1_score(y_test, y_pred, **undefined_as_zero)
cm = confusion_matrix(y_test, y_pred)

# Step 3: Report
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", cm)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Tick labels, in the order of the confusion matrix's rows and columns (0, 1)
labels = ['Not Fraud', 'Fraud']

# Draw `cm` as an annotated heatmap on an explicit 6x4 inch figure.
# fmt='d' prints each cell as an integer count.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)

# Axis labels and title
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix - Fraud Detection')
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Fit a random forest (default hyper-parameters, fixed seed) on the
# SMOTE-balanced training data, then label the untouched test split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_balanced, y_train_balanced)
y_pred_rf = rf_model.predict(X_test)

# Score the predictions against the true test labels.
accuracy_rf, precision_rf, recall_rf, f1_rf = (
    scorer(y_test, y_pred_rf)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
cm_rf = confusion_matrix(y_test, y_pred_rf)

# Report
print("Random Forest Performance:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)
print("Confusion Matrix:\n", cm_rf)


In [None]:
# 1. Import the KNN classifier and the tools used to standardise its inputs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 2. Create the model (we'll use 5 neighbors to start).
# KNN classifies by distance between rows, so a feature on a larger numeric
# scale (presumably Transaction_Amount - confirm against the data) would
# outweigh the 0/1 indicator columns. Standardising first puts every feature on
# a comparable scale. The pipeline fits the scaler on the training data only
# and re-applies that same transform whenever knn_model predicts.
knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))

# 3. Train the model using the SMOTE-balanced training set
knn_model.fit(X_train_balanced, y_train_balanced)

# 4. Predict using the test set (scaled inside the pipeline)
y_pred_knn = knn_model.predict(X_test)

# 5. Evaluate performance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
cm_knn = confusion_matrix(y_test, y_pred_knn)

# 6. Print the results
print("K-Nearest Neighbors Performance:")
print("Accuracy:", accuracy_knn)
print("Precision:", precision_knn)
print("Recall:", recall_knn)
print("F1 Score:", f1_knn)
print("Confusion Matrix:\n", cm_knn)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Linear-kernel support vector classifier, trained on the SMOTE-balanced data.
# NOTE(review): the features are passed in unscaled; an SVM is usually
# sensitive to feature scale and may train slowly here - consider standardising.
svc_model = SVC(kernel='linear', random_state=42).fit(X_train_balanced, y_train_balanced)

# Label the test split and score the predictions.
y_pred_svc = svc_model.predict(X_test)
svc_truth_and_prediction = (y_test, y_pred_svc)
accuracy_svc = accuracy_score(*svc_truth_and_prediction)
precision_svc = precision_score(*svc_truth_and_prediction)
recall_svc = recall_score(*svc_truth_and_prediction)
f1_svc = f1_score(*svc_truth_and_prediction)
cm_svc = confusion_matrix(*svc_truth_and_prediction)

# Report
print("Support Vector Machine (SVM) Performance:")
print("Accuracy:", accuracy_svc)
print("Precision:", precision_svc)
print("Recall:", recall_svc)
print("F1 Score:", f1_svc)
print("Confusion Matrix:\n", cm_svc)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Re-score the already fitted svc_model on the test split. Nothing is
# retrained; the *_svc names are recomputed from a fresh prediction.
y_pred_svc = svc_model.predict(X_test)

accuracy_svc, precision_svc, recall_svc, f1_svc = (
    metric(y_test, y_pred_svc)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
cm_svc = confusion_matrix(y_test, y_pred_svc)

# Report
print("Support Vector Machine (SVM) Performance:")
print("Accuracy:", accuracy_svc)
print("Precision:", precision_svc)
print("Recall:", recall_svc)
print("F1 Score:", f1_svc)
print("Confusion Matrix:\n", cm_svc)


In [None]:
import pandas as pd

# Build the comparison table from the scores computed in this notebook rather
# than from hand-copied numbers, so it always matches the current run.
# The logistic-regression row is recomputed from y_pred (the test predictions
# of the SMOTE-trained logistic regression), so it does not rely on the generic
# names accuracy / precision / recall / f1 still holding that model's scores.
model_results = {
    "Model": ["Logistic Regression", "Random Forest", "K-Nearest Neighbors", "Support Vector Machine"],
    "Accuracy": [accuracy_score(y_test, y_pred), accuracy_rf, accuracy_knn, accuracy_svc],
    "Precision": [precision_score(y_test, y_pred, zero_division=0), precision_rf, precision_knn, precision_svc],
    "Recall": [recall_score(y_test, y_pred, zero_division=0), recall_rf, recall_knn, recall_svc],
    "F1 Score": [f1_score(y_test, y_pred, zero_division=0), f1_rf, f1_knn, f1_svc]
}

# Convert to DataFrame (one row per model, one column per metric)
results_df = pd.DataFrame(model_results)

# Display with the best F1 score first
print("📌 Model Performance Comparison:")
display(results_df.sort_values(by="F1 Score", ascending=False))


In [None]:
import matplotlib.pyplot as plt

# One bar per model showing its F1 score. The legend is dropped because there
# is only a single series.
f1_axes = results_df.plot.bar(x='Model', y='F1 Score', legend=False)
f1_axes.set_title('Model Comparison - F1 Score')
f1_axes.set_ylabel('F1 Score')
f1_axes.set_xlabel('Model')
plt.show()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Predicted probability of the positive class (label 1 = fraud) for each test
# row, computed once and shared by the curve and the AUC. `model` is whichever
# estimator was last bound to that name; on a top-to-bottom run that is the
# SMOTE-trained logistic regression.
fraud_proba = model.predict_proba(X_test)[:, 1]

# Calculate ROC curve (false/true positive rate at each probability threshold)
fpr, tpr, thresholds = roc_curve(y_test, fraud_proba)

# Calculate AUC score (area under that curve)
auc_score = roc_auc_score(y_test, fraud_proba)

# Plot ROC curve; the dashed red diagonal is the chance-level reference
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, color='blue', label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

# Predicted probability of the positive class (label 1 = fraud) per test row.
pr_fraud_proba = model.predict_proba(X_test)[:, 1]

# Get precision-recall curve data. The arrays get curve-specific names so they
# do not overwrite the scalar `precision` / `recall` scores computed when the
# model was evaluated.
pr_precision, pr_recall, _ = precision_recall_curve(y_test, pr_fraud_proba)

# Calculate AUC for the Precision-Recall curve (recall on x, precision on y)
pr_auc = auc(pr_recall, pr_precision)

# Plot Precision-Recall curve
plt.figure(figsize=(8,6))
plt.plot(pr_recall, pr_precision, color='blue', label=f'AUC = {pr_auc:.2f}')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()


In [None]:
import joblib

# Persist the fitted estimator bound to `model`, read it back, and predict
# with the restored copy.
# joblib files are pickle-based: only load files from a source you trust.
model_path = 'fraud_detection_model.pkl'
joblib.dump(model, model_path)
model_loaded = joblib.load(model_path)

# Predictions from the reloaded model on the test split
y_pred_loaded = model_loaded.predict(X_test)


In [None]:
# Try importing all the libraries you need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import joblib
