## Activity 4: Data Preparation and Cleaning for Credit Card Fraud Detection

In [None]:
# Read the two pre-split fraud CSV files from the working directory into DataFrames.
import pandas as pd

# Both files are loaded with the default parser settings.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('fraudTrain.csv', 'fraudTest.csv')
)

In [None]:
# Inspect the size of each split, then the training columns and a preview of its rows.
train_shape, test_shape = train_df.shape, test_df.shape
print("Train shape:", train_shape)
print("Test shape:", test_shape)

train_preview = train_df.head()
print("\nTrain columns:\n", train_df.columns)
print("\nSample train data:\n", train_preview)

In [None]:
# Correlation heatmaps of the numeric features, one figure per dataset.
import seaborn as sns
import matplotlib.pyplot as plt

# 'Unnamed: 0' is only dropped in a later cell, so it is still present here.
# It is presumably the saved CSV row index (TODO confirm); it is excluded so it
# does not show up in the correlation matrix. errors='ignore' keeps this cell
# re-runnable after that column has been dropped.
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    numeric_corr = (
        split_df.drop(columns=['Unnamed: 0'], errors='ignore')
        .select_dtypes(include='number')
        .corr()
    )
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title(f"Correlation Heatmap - {split_name} Set")
    plt.show()

In [None]:
# Drop unnecessary columns like 'Unnamed: 0'.
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError. Reassigning instead of inplace=True
# keeps the operation explicit about which frame is replaced.
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
test_df = test_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Report per-column null counts for both splits, then remove exact duplicate rows.
# Note: missing values are only counted here; nothing is imputed or dropped for them.
print("Missing values in train:\n", train_df.isna().sum())
print("\nMissing values in test:\n", test_df.isna().sum())

# Keep the first occurrence of every fully duplicated row (original index labels are kept).
train_df = train_df.drop_duplicates()
test_df = test_df.drop_duplicates()

In [None]:
# Parse 'dob' and 'trans_date_trans_time' as datetimes in both splits and derive a
# date-only 'trans_date' column. Unparseable values become NaT (errors='coerce').
for df in (train_df, test_df):
    for time_col in ('dob', 'trans_date_trans_time'):
        df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
    # .dt.date yields plain datetime.date objects (object dtype), not datetime64.
    df['trans_date'] = df['trans_date_trans_time'].dt.date

In [None]:
# Print how many normal (0) and fraudulent (1) transactions each split contains.
for heading, fraud_labels in (
    ("Train Set:", train_df['is_fraud']),
    ("\nTest Set:", test_df['is_fraud']),
):
    print(heading)
    print(fraud_labels.value_counts())

In [None]:
# Stack the train and test rows into one frame for exploratory analysis.
# ignore_index=True gives the result a fresh 0..n-1 index.
combined_df = pd.concat([train_df, test_df], ignore_index=True)
print("Combined dataset shape:", combined_df.shape)

## Activity 5: Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Subset of fraudulent transactions used by the fraud-specific plots below.
# .copy() makes this an independent frame: a later cell adds a column to
# fraud_df, which on a plain boolean-mask slice triggers SettingWithCopyWarning.
fraud_df = combined_df[combined_df['is_fraud'] == 1].copy()

In [None]:
# Count plot of transactions for the ten most frequent categories in the merged data.
# top_categories is reused by the box-plot cell further down.
top_categories = combined_df['category'].value_counts().nlargest(10)
category_order = top_categories.index
top_category_rows = combined_df[combined_df['category'].isin(category_order)]

sns.countplot(x='category', order=category_order, data=top_category_rows)
plt.title("Top 10 Transaction Categories")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count plot of transactions per gender in the merged data, split by fraud label.
sns.countplot(x='gender', hue='is_fraud', data=combined_df).set_title(
    "Transaction Count by Gender"
)
plt.show()


In [None]:
# Count plot of transactions for the ten merchants with the most transactions.
top_merchants = combined_df['merchant'].value_counts().nlargest(10)
merchant_order = top_merchants.index
top_merchant_rows = combined_df[combined_df['merchant'].isin(merchant_order)]

sns.countplot(x='merchant', order=merchant_order, data=top_merchant_rows)
plt.title("Top 10 Merchants by Transactions")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Box plot of transaction amount per category, limited to the categories in
# top_categories (computed in the top-categories cell above).
plt.figure(figsize=(12, 6))
top_category_mask = combined_df['category'].isin(top_categories.index)
sns.boxplot(x='category', y='amt', data=combined_df[top_category_mask])
plt.title("Transaction Amount Distribution by Category")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Histogram (50 bins, with a KDE overlay) of the transaction amount over all rows.
plt.figure(figsize=(10, 5))
sns.histplot(data=combined_df, x='amt', bins=50, kde=True)
plt.xlabel("Amount")
plt.title("Histogram of Transaction Amounts")
plt.show()

In [None]:
# Bar chart of the ten categories with the most fraudulent transactions.
fraud_by_category = fraud_df['category'].value_counts().nlargest(10)
sns.barplot(x=fraud_by_category.index, y=fraud_by_category.to_numpy())
plt.title("Top Categories with Fraudulent Transactions")
plt.ylabel("Fraud Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar chart of the ten states with the most fraudulent transactions.
fraud_by_state = fraud_df['state'].value_counts().nlargest(10)
sns.barplot(x=fraud_by_state.index, y=fraud_by_state.to_numpy())
plt.ylabel("Fraud Count")
plt.title("Top 10 States with Fraudulent Transactions")
plt.show()

In [None]:
# Bar chart of the ten cities with the most fraudulent transactions.
fraud_by_city = fraud_df['city'].value_counts().nlargest(10)
sns.barplot(x=fraud_by_city.index, y=fraud_by_city.to_numpy())
plt.xticks(rotation=45)
plt.ylabel("Fraud Count")
plt.title("Top 10 Cities with Fraudulent Transactions")
plt.show()

In [None]:
# Bar chart of the ten cardholder jobs with the most fraudulent transactions.
fraud_by_job = fraud_df['job'].value_counts().nlargest(10)
sns.barplot(x=fraud_by_job.index, y=fraud_by_job.to_numpy())
plt.ylabel("Fraud Count")
plt.xticks(rotation=45)
plt.title("Top Jobs with Fraudulent Transactions")
plt.show()

In [None]:
# Number of credit card frauds by age group.
# Age is approximated as REFERENCE_YEAR minus the birth year.
# NOTE(review): 2020 is assumed to be the year the data ends - confirm against
# the range of 'trans_date_trans_time'.
REFERENCE_YEAR = 2020
# pd.cut bins are right-closed by default, so without include_lowest=True an age
# of exactly 18 falls outside the first bin and becomes NaN. The open-ended last
# bin likewise keeps cardholders older than 80 in the chart instead of silently
# dropping them. Ages below 18 still map to NaN.
AGE_BINS = [18, 30, 40, 50, 60, 80, float('inf')]
AGE_LABELS = ["18-30", "31-40", "41-50", "51-60", "61-80", "81+"]

combined_df['age'] = REFERENCE_YEAR - combined_df['dob'].dt.year
combined_df['age_group'] = pd.cut(
    combined_df['age'], bins=AGE_BINS, labels=AGE_LABELS, include_lowest=True
)

# sort_index() orders the bars by age bracket rather than by descending count.
fraud_age_group = (
    combined_df.loc[combined_df['is_fraud'] == 1, 'age_group']
    .value_counts()
    .sort_index()
)

sns.barplot(x=fraud_age_group.index, y=fraud_age_group.values)
plt.title("Fraud Count by Age Group")
plt.ylabel("Fraud Count")
plt.show()

In [None]:
# Count plot of fraudulent transactions per cardholder gender.
sns.countplot(x='gender', data=fraud_df).set_title("Fraud Count by Gender")
plt.show()

In [None]:
# Number of credit card frauds by transaction year.
# assign() returns a new frame, so no column is written into a filtered slice of
# combined_df (the original item assignment raised SettingWithCopyWarning).
# pd.to_datetime is a no-op when the column is already datetime64.
fraud_df = fraud_df.assign(
    year=pd.to_datetime(fraud_df['trans_date_trans_time']).dt.year
)
sns.countplot(data=fraud_df, x='year')
plt.title("Fraud Count by Year")
plt.show()

In [None]:
# Histogram (30 bins, with a KDE overlay) of the 'lat' column for fraudulent transactions.
# Note: this plots the raw latitude values, not a computed distance.
sns.histplot(data=fraud_df, x='lat', bins=30, kde=True)
plt.xlabel("Latitude")
plt.title("Fraud Distribution by Latitude")
plt.show()

In [None]:
# Histogram (30 bins, with a KDE overlay) of the 'long' column for fraudulent transactions.
# Note: this plots the raw longitude values, not a computed distance.
sns.histplot(data=fraud_df, x='long', bins=30, kde=True)
plt.xlabel("Longitude")
plt.title("Fraud Distribution by Longitude")
plt.show()

## Activity 6: Feature Engineering

In [None]:
# One-hot encode the nominal columns; drop_first=True omits one level per column
# to avoid perfectly collinear indicator columns.
# NOTE(review): 'city' and 'merchant' may have many distinct values, which would
# make the encoded frame very wide - confirm the cardinality is acceptable.
one_hot_features = ['category', 'gender', 'state', 'city', 'merchant']

# get_dummies returns a new frame and leaves combined_df untouched, so no
# explicit copy is needed beforehand.
df_encoded = pd.get_dummies(combined_df, columns=one_hot_features, drop_first=True)


In [None]:
# Drop highly correlated or redundant features to reduce dimensionality.
import numpy as np

# The target 'is_fraud' is excluded from the correlation matrix: the filter must
# never remove the label itself, nor discard a feature because it correlates
# with the label.
corr_matrix = (
    df_encoded.drop(columns=['is_fraud'])
    .select_dtypes(include='number')
    .corr()
    .abs()
)

# Keep only the strict upper triangle so every feature pair is examined once;
# of each pair above the threshold, the later column is the one dropped.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
to_drop = upper.columns[(upper > 0.9).any()].tolist()

df_encoded = df_encoded.drop(columns=to_drop)

In [None]:
# Integer-encode 'job', derive calendar features from the transaction date, and
# drop the raw date/time columns that models cannot consume directly.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_encoded['job'] = le.fit_transform(df_encoded['job'])

# 'trans_date' holds plain date objects, so convert back to datetime64 to use .dt.
trans_dates = pd.to_datetime(df_encoded['trans_date'])
df_encoded['year'] = trans_dates.dt.year
df_encoded['month'] = trans_dates.dt.month

df_encoded = df_encoded.drop(columns=['trans_date', 'trans_date_trans_time', 'dob'])

In [None]:
# Standardize numerical features (zero mean, unit variance) to improve model performance.
from sklearn.preprocessing import StandardScaler

# int32/float32 are listed explicitly: on pandas >= 2 the .dt.year / .dt.month
# accessors return int32, so selecting only int64/float64 left 'year' and
# 'month' unscaled depending on the installed pandas version. Narrow integer
# and bool dtypes are deliberately left out so indicator columns are not scaled.
numeric_features = (
    df_encoded.select_dtypes(include=['int32', 'int64', 'float32', 'float64'])
    .drop(columns=['is_fraud'])
    .columns
)

# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into training. Consider fitting on the
# training rows only (e.g. inside a sklearn Pipeline).
scaler = StandardScaler()
df_encoded[numeric_features] = scaler.fit_transform(df_encoded[numeric_features])

In [None]:
# Persist the engineered dataset to CSV (without the index column) and confirm on stdout.
output_path = 'Capstone_Dataset.csv'
df_encoded.to_csv(output_path, index=False)
print(f"Final dataset saved as '{output_path}'")

## Activity 7: Model Training and Evaluation

In [None]:
# Split the data into training and testing sets.
from sklearn.model_selection import train_test_split

y = df_encoded['is_fraud']
# Keep only numeric and boolean columns as features. df_encoded still carries
# non-numeric columns at this point (e.g. the categorical 'age_group' with
# string labels, which is never encoded), and scikit-learn estimators raise on
# them during fit.
X = df_encoded.drop(columns=['is_fraud']).select_dtypes(include=['number', 'bool'])

# Split 80% train, 20% test; stratify=y preserves the fraud ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [None]:
# Candidate classifiers for predicting fraudulent transactions: Logistic Regression,
# Decision Tree, Random Forest, AdaBoost, GaussianNB, KNN and LightGBM.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

# Same seed as the train/test split. Seeding the stochastic estimators makes the
# reported metrics reproducible across runs.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "GaussianNB": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "LightGBM": LGBMClassifier(random_state=RANDOM_STATE)
}


In [None]:
# Fit every candidate model on the training split and print accuracy, precision,
# recall and F1 on the held-out split.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Label -> scoring function, in the order the scores are printed.
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

for name, model in models.items():
    print(f"\n===== {name} =====")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    for metric_label, metric_fn in metric_fns.items():
        print(f"{metric_label}:", metric_fn(y_test, y_pred))

In [None]:
# Classification report and confusion matrix for every model.
# The models were already fitted in the metrics cell above, so they are reused
# here instead of being trained a second time. Refitting doubled the training
# cost and, for unseeded estimators, could yield reports that no longer match
# the metrics printed earlier. Run the metrics cell first.
for name, model in models.items():
    print(f"\n===== {name} =====")
    y_pred = model.predict(X_test)

    # Classification Report
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # Confusion Matrix (rows: actual class, columns: predicted class)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()