## Activity 4: Data Preparation and Cleaning for Credit Card Fraud Detection

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Plot defaults: seaborn "whitegrid" style and a 12x6 default figure size.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [None]:
# Read the train and test CSV files (relative to the working directory)
# into two separate DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("fraudTrain.csv", "fraudTest.csv")
)

In [None]:
# Print the (rows, columns) shape of each dataset, then the first rows of each.
print("Train dataset shape:", train_df.shape)
print("Test dataset shape:", test_df.shape)

for preview_heading, preview_frame in (
    ("\nTrain data preview:", train_df),
    ("\nTest data preview:", test_df),
):
    print(preview_heading)
    print(preview_frame.head())

In [None]:
# Correlation between the numeric columns of each dataset, shown as one
# annotated heatmap per dataset (non-numeric columns are excluded first).
train_corr = train_df.select_dtypes(include=np.number).corr()
test_corr = test_df.select_dtypes(include=np.number).corr()

for heatmap_title, corr_matrix in (
    ("Correlation Heatmap - Train", train_corr),
    ("Correlation Heatmap - Test", test_corr),
):
    plt.title(heatmap_title)
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
    plt.show()

In [None]:
# Drop the leftover 'Unnamed: 0' column from both datasets.
# errors="ignore" keeps the cell safe to re-run: without it a second
# execution (or a CSV that lacks the column) raises KeyError.
for raw_frame in (train_df, test_df):
    raw_frame.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

In [None]:
# Report missing values per column, then remove duplicate rows.
for missing_heading, checked_frame in (
    ("\nMissing values in train dataset:", train_df),
    ("\nMissing values in test dataset:", test_df),
):
    print(missing_heading)
    print(checked_frame.isnull().sum())

# Duplicates are removed in place, so train_df / test_df themselves shrink.
for deduped_frame in (train_df, test_df):
    deduped_frame.drop_duplicates(inplace=True)

In [None]:
# Parse 'dob' and 'trans_date_trans_time' as datetimes in both datasets and
# derive a date-only 'trans_date' column from the transaction timestamp.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
for df in (train_df, test_df):
    for datetime_col in ('dob', 'trans_date_trans_time'):
        df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce')
    df['trans_date'] = df['trans_date_trans_time'].dt.date

In [None]:
# Row counts per 'is_fraud' value, printed separately for train and test.
for balance_heading, labelled_frame in (
    ("\nTrain dataset fraud vs normal:", train_df),
    ("\nTest dataset fraud vs normal:", test_df),
):
    print(balance_heading)
    print(labelled_frame['is_fraud'].value_counts())

In [None]:
# Stack train on top of test into a single frame for the EDA below;
# the combined frame gets a fresh 0..n-1 index.
merged_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
print("\nMerged dataset shape:", merged_df.shape)

## Activity 5: Exploratory Data Analysis (EDA)

In [None]:
# Horizontal count plot of the ten categories with the most transactions
# in the merged data, ordered by descending count.
top_categories = merged_df['category'].value_counts().nlargest(10)
in_top_categories = merged_df['category'].isin(top_categories.index)

sns.countplot(
    data=merged_df[in_top_categories],
    y='category',
    order=top_categories.index,
    palette='viridis',
)
plt.title("Top 10 Categories by Number of Transactions")
plt.xlabel("Count")
plt.ylabel("Category")
plt.show()

In [None]:
# Count plot of all merged transactions split by the 'gender' column.
gender_axes = sns.countplot(data=merged_df, x='gender', palette='Set2')
gender_axes.set_title("Transactions by Gender")
gender_axes.set_xlabel("Gender")
gender_axes.set_ylabel("Count")
plt.show()

In [None]:
# Horizontal count plot of the ten merchants with the most transactions
# in the merged data, ordered by descending count.
top_merchants = merged_df['merchant'].value_counts().nlargest(10)
in_top_merchants = merged_df['merchant'].isin(top_merchants.index)

sns.countplot(
    data=merged_df[in_top_merchants],
    y='merchant',
    order=top_merchants.index,
    palette='mako',
)
plt.title("Top 10 Merchants by Number of Transactions")
plt.xlabel("Count")
plt.ylabel("Merchant")
plt.show()

In [None]:
# Box plot of the transaction amount ('amt') for every category.
sns.boxplot(x='category', y='amt', data=merged_df)
plt.title("Transaction Amount by Category")
plt.ylabel("Transaction Amount ($)")
# Category names are long, so the x tick labels are rotated to vertical.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Histogram (50 bins) of transaction amounts with a KDE curve overlaid.
transaction_amounts = merged_df['amt']
sns.histplot(transaction_amounts, bins=50, kde=True, color='skyblue')
plt.title("Distribution of Transaction Amounts")
plt.xlabel("Amount ($)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Top categories among fraudulent transactions (rows with is_fraud == 1).
# .copy() gives fraud_df its own data: later cells add 'age', 'age_group' and
# 'year' columns to it, and writing to a bare boolean-mask slice of merged_df
# triggers pandas' SettingWithCopyWarning.
fraud_df = merged_df[merged_df['is_fraud'] == 1].copy()
top_fraud_categories = fraud_df['category'].value_counts().nlargest(10)

sns.countplot(data=fraud_df[fraud_df['category'].isin(top_fraud_categories.index)],
              y='category', order=top_fraud_categories.index, palette='flare')
plt.title("Top Fraud Categories")
plt.xlabel("Count")
plt.ylabel("Category")
plt.show()

In [None]:
# Horizontal bar chart of the ten states with the most fraudulent transactions.
fraud_by_state = fraud_df['state'].value_counts().nlargest(10)

state_axes = sns.barplot(
    x=fraud_by_state.values,
    y=fraud_by_state.index,
    palette='magma',
)
state_axes.set_title("Top 10 States with Fraudulent Transactions")
state_axes.set_xlabel("Number of Fraudulent Transactions")
state_axes.set_ylabel("State")
plt.show()


In [None]:
# Horizontal bar chart of the ten cities with the most fraudulent transactions.
fraud_by_city = fraud_df['city'].value_counts().nlargest(10)

city_axes = sns.barplot(
    x=fraud_by_city.values,
    y=fraud_by_city.index,
    palette='rocket',
)
city_axes.set_title("Top 10 Cities with Fraudulent Transactions")
city_axes.set_xlabel("Number of Fraudulent Transactions")
city_axes.set_ylabel("City")
plt.show()


In [None]:
# Horizontal bar chart of the ten job titles with the most fraudulent transactions.
fraud_by_job = fraud_df['job'].value_counts().nlargest(10)

job_axes = sns.barplot(
    x=fraud_by_job.values,
    y=fraud_by_job.index,
    palette='cool',
)
job_axes.set_title("Top 10 Jobs with Fraudulent Transactions")
job_axes.set_xlabel("Number of Frauds")
job_axes.set_ylabel("Job")
plt.show()

In [None]:
# Number of credit card frauds by age group.
# fraud_df comes from a boolean-mask selection on merged_df; rebind it to an
# explicit copy so the new columns below are written to an independent frame
# rather than to a slice (which raises SettingWithCopyWarning).
fraud_df = fraud_df.copy()

# Approximate age in whole years, measured against the fixed date 2020-01-01.
fraud_df['age'] = (pd.to_datetime('2020-01-01') - fraud_df['dob']).dt.days // 365

# Bucket ages into labelled groups. include_lowest=True closes the first
# interval on the left so that age 18 lands in '18-25'; with the default
# left-open interval it would become NaN. Ages outside 18-100 still map to NaN.
fraud_df['age_group'] = pd.cut(fraud_df['age'], bins=[18, 25, 35, 45, 55, 65, 100],
                               labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'],
                               include_lowest=True)

sns.countplot(data=fraud_df, x='age_group', palette='crest')
plt.title("Frauds by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Number of Frauds")
plt.show()

In [None]:
# Count plot of fraudulent transactions split by the 'gender' column.
fraud_gender_axes = sns.countplot(data=fraud_df, x='gender', palette='Set3')
fraud_gender_axes.set_title("Frauds by Gender")
fraud_gender_axes.set_xlabel("Gender")
fraud_gender_axes.set_ylabel("Number of Frauds")
plt.show()


In [None]:
# Number of credit card frauds by calendar year of the transaction.
# assign() returns a new frame with the extra 'year' column, so nothing is
# written into a slice of merged_df (which raises SettingWithCopyWarning).
fraud_df = fraud_df.assign(
    year=pd.to_datetime(fraud_df['trans_date_trans_time']).dt.year
)

sns.countplot(data=fraud_df, x='year', palette='YlGnBu')
plt.title("Frauds by Year")
plt.xlabel("Year")
plt.ylabel("Number of Frauds")
plt.show()

In [None]:
# Histogram (30 bins) of the 'lat' column for fraudulent transactions, with a KDE curve.
fraud_latitudes = fraud_df['lat']
sns.histplot(fraud_latitudes, bins=30, kde=True, color='darkred')
plt.title("Distribution of Fraud by Latitude")
plt.xlabel("Latitude")
plt.ylabel("Number of Frauds")
plt.show()

In [None]:
# Histogram (30 bins) of the 'long' column for fraudulent transactions, with a KDE curve.
fraud_longitudes = fraud_df['long']
sns.histplot(fraud_longitudes, bins=30, kde=True, color='darkblue')
plt.title("Distribution of Fraud by Longitude")
plt.xlabel("Longitude")
plt.ylabel("Number of Frauds")
plt.show()

## Activity 6: Feature Engineering

In [None]:
# One-hot encode the low-cardinality categorical columns.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encode 'category' and 'gender' on a copy of the merged data so merged_df
# stays untouched; drop_first=True omits one indicator column per feature.
df = pd.get_dummies(
    merged_df.copy(),
    columns=['category', 'gender'],
    drop_first=True,
)


In [None]:
# Remove timestamp, identifier and free-text columns before modelling.
redundant_cols = [
    'trans_date_trans_time', 'trans_date', 'dob', 'unix_time',
    'cc_num', 'merchant', 'first', 'last',
    'street', 'city', 'state', 'trans_num',
]
# errors='ignore' skips any listed column that is not present in df.
df.drop(columns=redundant_cols, errors='ignore', inplace=True)

In [None]:
# Label-encode the 'job' column into integer codes.
# Guard on "not numeric" rather than `dtype == 'object'`: text columns may be
# stored with a dedicated string or categorical dtype instead of object, and
# the old check would then silently skip encoding and leave raw strings in
# the feature matrix. Once encoded the column is numeric, so re-running this
# cell is a no-op.
if not pd.api.types.is_numeric_dtype(df['job']):
    le = LabelEncoder()
    df['job'] = le.fit_transform(df['job'])

In [None]:
# Add an 'age' feature and standardise the numeric feature columns.
from sklearn.preprocessing import StandardScaler

# 'dob' has already been dropped from df, so age is derived from merged_df's
# 'dob' column: whole years (days // 365) relative to the fixed date 2020-01-01.
reference_date = pd.to_datetime("2020-01-01")
birth_dates = pd.to_datetime(merged_df['dob'])
df['age'] = (reference_date - birth_dates).dt.days // 365

# Standardise the selected numeric columns in place.
# NOTE(review): the scaler is fit on the full merged data, i.e. before any
# train/test split is made.
numerical_cols = ['amt', 'lat', 'long', 'zip', 'age']
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [None]:
# Write the engineered dataset to 'Capstone_Dataset.csv' without the index column.
df.to_csv(path_or_buf="Capstone_Dataset.csv", index=False)

## Activity 7: Model Training and Evaluation

In [None]:
# Split into train/test sets, scale the features, and oversample the training set.
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Target column and feature matrix
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])

# 80/20 split, stratified on the target so both parts keep the same fraud ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training part only, then apply it to both parts
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE oversampling is applied to the scaled training data only
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)

In [None]:
# Train Logistic Regression, Decision Tree, Random Forest, AdaBoost, GaussianNB,
# KNN and LightGBM classifiers to predict fraudulent transactions.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

# Display name -> unfitted estimator.
# The stochastic estimators are seeded with random_state=42, the same seed the
# train/test split and SMOTE use, so repeated runs give the same fitted models
# and the same metrics.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "GaussianNB": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "LightGBM": LGBMClassifier(random_state=42),
}

# Fit every model on the SMOTE-resampled, scaled training data.
trained_models = {}
for name, model in models.items():
    model.fit(X_train_resampled, y_train_resampled)
    trained_models[name] = model

In [None]:
# Score every trained model on the scaled test set: accuracy, precision,
# recall and F1-score.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("\n Model Evaluation Metrics:\n")

for clf_name, clf in trained_models.items():
    y_pred = clf.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a score is undefined.
    precision, recall, f1 = (
        metric(y_test, y_pred, zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f" {clf_name}")
    print(f"   - Accuracy : {accuracy:.4f}")
    print(f"   - Precision: {precision:.4f}")
    print(f"   - Recall   : {recall:.4f}")
    print(f"   - F1-score : {f1:.4f}\n")

In [None]:
# Print a confusion matrix and a classification report for every trained
# model, each evaluated on the scaled test set.
from sklearn.metrics import confusion_matrix, classification_report

print("\n Confusion Matrices & Classification Reports:\n")

for clf_name, clf in trained_models.items():
    y_pred = clf.predict(X_test_scaled)

    print(f" {clf_name}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(
        "Classification Report:\n",
        classification_report(y_test, y_pred, digits=4, zero_division=0),
    )
    print("-" * 60)
