In [None]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import required libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import plotly.express as px
from imblearn.over_sampling import SMOTE

# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Read data
file = 'drive/My Drive/Imersao_42_AI_and_Machine_Learning/Transactions.csv'
df = pd.read_csv(file)

print(df.head())

   Unnamed: 0.1  Unnamed: 0  TRANSACTION_ID          TX_DATETIME  CUSTOMER_ID  \
0             0           0               0  2023-01-01 00:00:31          596   
1             1           1               1  2023-01-01 00:02:10         4961   
2             2           2               2  2023-01-01 00:07:56            2   
3             3           3               3  2023-01-01 00:09:29         4128   
4             4           4               4  2023-01-01 00:10:34          927   

   TERMINAL_ID  TX_AMOUNT  TX_TIME_SECONDS  TX_TIME_DAYS  \
0         3156     533.07               31             0   
1         3412     808.56              130             0   
2         1365    1442.94              476             0   
3         8737     620.65              569             0   
4         9906     490.66              634             0   

                 TX_FRAUD_SCENARIO  
0           Legitimate Transaction  
1           Legitimate Transaction  
2  Fraudulent Transaction Internet  
3   

In [None]:
# Data exploration
print("Dimention:")
print(df.shape)

print("\nColumns:")
print(df.columns)

print("\nInformations:")
print(df.info())

Dimention:
(1754155, 10)

Columns:
Index(['Unnamed: 0.1', 'Unnamed: 0', 'TRANSACTION_ID', 'TX_DATETIME',
       'CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'TX_TIME_SECONDS',
       'TX_TIME_DAYS', 'TX_FRAUD_SCENARIO'],
      dtype='object')

Informations:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754155 entries, 0 to 1754154
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0.1       int64  
 1   Unnamed: 0         int64  
 2   TRANSACTION_ID     int64  
 3   TX_DATETIME        object 
 4   CUSTOMER_ID        int64  
 5   TERMINAL_ID        int64  
 6   TX_AMOUNT          float64
 7   TX_TIME_SECONDS    int64  
 8   TX_TIME_DAYS       int64  
 9   TX_FRAUD_SCENARIO  object 
dtypes: float64(1), int64(7), object(2)
memory usage: 133.8+ MB
None


In [None]:
def plot_fraud_scenario_counts(data_frame):
    """
    Plots the counts of transactions for different fraud scenarios.

    :param data_frame: The DataFrame containing the data.
    """
    counts = data_frame['TX_FRAUD_SCENARIO'].value_counts().reset_index()
    counts.columns = ['Fraud Scenario', 'Number of Transactions']

    fig = px.bar(counts, x='Fraud Scenario', y='Number of Transactions',
                 color='Fraud Scenario')
    fig.show()

# Call the function to plot fraud scenario counts
plot_fraud_scenario_counts(df)

In [None]:
# Add Function

def create_binary_target(df, target_col):
    """
    Creates a binary target column for fraud classification.
    Legitimate transactions remain as 'Legitimate Transaction',
    all other types are grouped as 'Fraudulent Transaction'.

    :param df: DataFrame containing the original target column.
    :param target_col: Name of the original target column.
    :return: DataFrame with new binary target column.
    """
    df = df.copy()
    df[target_col] = df[target_col].apply(
        lambda x: 'Legitimate Transaction' if x == 'Legitimate Transaction' else 'Fraudulent Transaction'
    )
    return df

df = create_binary_target(df, 'TX_FRAUD_SCENARIO')

In [None]:
# Think about merging your frauds together

def plot_data_imbalance(data_frame, target_column):
    """
    Display a pie chart to illustrate class imbalance in the target column.

    :param data_frame: The DataFrame containing the data.
    :param target_column: The name of the target column for which the imbalance should be illustrated.
    """
    grouped = data_frame[target_column]

    counts = grouped.value_counts().reset_index()
    counts.columns = ['Class', 'Count']

    fig = px.pie(
        counts,
        names='Class',
        values='Count',
        color_discrete_sequence=px.colors.qualitative.Set2,
    )

    fig.update_traces(textinfo='percent+label')
    fig.show()

# Call the function to display class imbalance in the 'TX_FRAUD_SCENARIO' column
plot_data_imbalance(df, 'TX_FRAUD_SCENARIO')

In [None]:
def visualize_monthly_transactions(df):
    """
    Visualizes the distribution of transactions per month and their fraud scenario using a grouped bar plot.

    :param df: DataFrame containing transaction data.
    """
    df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])
    df['MONTH_NUM'] = df['TX_DATETIME'].dt.month

    grouped = df.groupby(['MONTH_NUM', 'TX_FRAUD_SCENARIO']).size().reset_index(name='Count')

    fig = px.bar(
        grouped,
        x='MONTH_NUM',
        y='Count',
        color='TX_FRAUD_SCENARIO',
        barmode='stack',
        title='Monthly Distribution of Transactions and Fraud Scenario',
        labels={
            'MONTH_NUM': 'Month',
            'Count': 'Number of Transactions',
            'TX_FRAUD_SCENARIO': 'Transaction Type'
            },
        color_discrete_sequence=px.colors.qualitative.Set2,
        category_orders={"TX_FRAUD_SCENARIO": ["Legitimate Transaction", "Fraudulent Transaction"]}
    )
    fig.show()

# Call the function to visualize monthly transactions and fraud scenarios
visualize_monthly_transactions(df)

In [None]:
# Add new function

def add_time_features(df, datetime_col):
    """
    Extracts temporal features from a datetime column and adds them to the DataFrame.

    :param df: DataFrame containing the datetime column.
    :param datetime_col: The name of the datetime column.
    :return: DataFrame with new temporal features.
    """
    df = df.copy()
    df[datetime_col] = pd.to_datetime(df[datetime_col])

    df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])
    df['TX_MONTH'] = df[datetime_col].dt.month
    df['TX_DAYOFWEEK'] = df[datetime_col].dt.dayofweek
    df['TX_HOUR'] = df[datetime_col].dt.hour

    return df

In [None]:
# Add new function

def encode_binary_target(df, target_col):
    """
    Encode target column as binary values: 0 for Legitimate, 1 for Fraudulent.

    :param df: DataFrame with the original binary target column (strings)
    :param target_col: Name of the column to encode
    :return: DataFrame with encoded target column
    """
    df[target_col] = df[target_col].map({
        'Legitimate Transaction': 0,
        'Fraudulent Transaction': 1
        })

    return df

**Smote**

SMOTE (Synthetic Minority Over-sampling Technique) é uma técnica para balancear datasets desbalanceados — especialmente útil em problemas de classificação binária com classes desproporcionais, como detecção de fraudes, diagnósticos médicos, etc.

Em vez de simplesmente duplicar registros da classe minoritária, o SMOTE cria novos exemplos sintéticos, baseados em exemplos reais da minoria.

**O SMOTE faz o seguinte:**

*    Para cada exemplo da minoria (x), ele:

Encontra os k vizinhos mais próximos da mesma classe (k=5 por padrão).

Escolhe um ou mais vizinhos aleatoriamente.

*    Para cada exemplo da minoria (x), ele:

Gera novos pontos "entre" os exemplos reais e os vizinhos:

Cria um vetor entre o ponto original e o vizinho

Multiplica esse vetor por um número aleatório entre 0 e 1

Adiciona esse valor ao ponto original → isso gera um novo exemplo sintético

**Aplicando o SMOTE:**

    smote = SMOTE(random_state=20)
    X_resampled, y_resampled = smote.fit_resample(X_numeric, y)

fit_resample() aprende os padrões da classe minoritária e gera novas amostras sintéticas

Resultado: X_resampled e y_resampled agora têm o mesmo número de exemplos em cada classe (0 e 1)

**Vantagem:**

*Duplicar* = overfitting (modelo aprende "de cor" os poucos exemplos da minoria)

*SMOTE* = mais variedade e distribuição nos dados sintéticos → generaliza melhor

**Limitações:**

*    Não lida bem com outliers da minoria → pode gerar pontos sintéticos em áreas irrelevantes.
*    Pode criar exemplos no espaço da classe majoritária, se as classes estiverem muito misturadas (overlap).
*    Funciona melhor com dados numéricos — por isso é comum aplicar um StandardScaler antes.

---

**Outros tipos de variações**

*    Borderline-SMOTE: foca nos pontos próximos das fronteiras de decisão.
*    SMOTE-NC: para dados com colunas categóricas.
*    ADASYN: foca mais nas regiões onde a minoria está mais escassa.

---
Seu problema tem um forte componente temporal:

As fraudes podem mudar de padrão com o tempo.

O modelo precisa ser avaliado como se estivesse operando no "futuro", ou seja:

Treina no passado

Isso é chamado de validade temporal, e evita o chamado data leakage (vazamento de informação do futuro para o passado durante o treino).

In [None]:
# from imblearn.over_sampling import SMOTE

def balance_data_with_smote(df, target_column):
    """
    Balances the dataset using SMOTE to oversample the minority class.

    :param df: The DataFrame containing the data.
    :param target_column: The name of the target column for which the balancing is performed.
    :return: A balanced DataFrame.
    """
    df = df.copy()
    df = add_time_features(df, "TX_DATETIME")
    df = encode_binary_target(df, target_column)

    cols_to_drop = ['Unnamed: 0.1', 'Unnamed: 0', 'TRANSACTION_ID', 'TX_DATETIME',
                    'CUSTOMER_ID', 'TERMINAL_ID', 'MONTH_NUM']

    cols_to_drop = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop)

    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_numeric = X.select_dtypes(include=[np.number])

    smote = SMOTE(random_state=20)
    X_resampled, y_resampled = smote.fit_resample(X_numeric, y)

    balanced_df = pd.DataFrame(X_resampled, columns=X_numeric.columns)
    balanced_df[target_column] = y_resampled

    return balanced_df

In [None]:
def plot_data_imbalance(data_frame, target_column):
    """
    Display a pie chart to illustrate class imbalance in the target column.

    :param data_frame: The DataFrame containing the data.
    :param target_column: The name of the target column for which the imbalance should be illustrated.
    """
    temp_labels = data_frame[target_column].map({
        0: 'Legitimate Transaction',
        1: 'Fraudulent Transaction'
    })

    counts = temp_labels.value_counts().reset_index()
    counts.columns = ['Class', 'Count']

    fig = px.pie(
        counts,
        names='Class',
        values='Count',
        color_discrete_sequence=px.colors.qualitative.Set2,
    )

    fig.update_traces(textinfo='percent+label')
    fig.show()

# Call the function to display class imbalance in the 'TX_FRAUD_SCENARIO' column
balanced_df = balance_data_with_smote(df, 'TX_FRAUD_SCENARIO')
plot_data_imbalance(balanced_df, 'TX_FRAUD_SCENARIO')


In [None]:
# It's up to you to continue
# Remember to look in the module subject for more information on what to do next and on the benefits of this module, the following exercises are no longer guided.

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

X = balanced_df.drop(columns=['TX_FRAUD_SCENARIO'])
y = balanced_df['TX_FRAUD_SCENARIO']  # Já está como 0 e 1 após o balanceamento

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20, stratify=y
)

# =========================
# XGBoost
# =========================
xgb_model = XGBClassifier(
    eval_metric='logloss',
    random_state=20,
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8
)

xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Results")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

# -----------------------------------
# from sklearn.neighbors import KNeighborsClassifier
# =========================
# K-Nearest Neighbors
# =========================

# Escalar features para KNN
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# knn_model = KNeighborsClassifier(n_neighbors=5)  # você pode testar n_neighbors=3,7,9...
# knn_model.fit(X_train_scaled, y_train)
# y_pred_knn = knn_model.predict(X_test_scaled)

# print("KNN Results")
# print("Accuracy:", accuracy_score(y_test, y_pred_knn))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
# print("Classification Report:\n", classification_report(y_test, y_pred_knn, target_names=['Legitimate', 'Fraudulent']))

# -----------------------------------
# from sklearn.model_selection import train_test_split

# X = balanced_df.drop(columns=['TX_FRAUD_SCENARIO'])
# y = balanced_df['TX_FRAUD_SCENARIO']

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y,
#     test_size=0.2,
#     random_state=20)

# # Importar modelos e métricas
# from xgboost import XGBClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# # Função para treinar, prever e avaliar um modelo
# def train_evaluate_model(model, X_train, X_test, y_train, y_test):
#     """
#     Treina o modelo, faz previsões no teste e retorna métricas básicas.

#     :param model: modelo sklearn
#     :param X_train, X_test, y_train, y_test: datasets
#     """
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     # Mapear classes para números

#     acc = accuracy_score(y_test, y_pred)
#     cm = confusion_matrix(y_test, y_pred)
#     report = classification_report(y_test, y_pred, target_names=['Legitimate Transaction', 'Fraudulent Transaction'])

#     print(f"Model: {model.__class__.__name__}")
#     print(f"Accuracy: {acc:.4f}")
#     print("Confusion Matrix:")
#     print(cm)
#     print("Classification Report:")
#     print(report)
#     print("-" * 50)

#     return y_pred

# y_train_num = y_train.map({'Legitimate Transaction': 0, 'Fraudulent Transaction': 1})
# y_test_num = y_test.map({'Legitimate Transaction': 0, 'Fraudulent Transaction': 1})

# Inicializar modelos
# log_reg = LogisticRegression(max_iter=1000, random_state=42)
# decision_tree = DecisionTreeClassifier(random_state=20)
# random_forest = RandomForestClassifier(n_estimators=100, random_state=20)

# Treinar e avaliar cada modelo
# y_pred_lr = train_evaluate_model(log_reg, X_train, X_test, y_train, y_test)
# y_pred_dt = train_evaluate_model(decision_tree, X_train, X_test, y_train, y_test)
# y_pred_rf = train_evaluate_model(random_forest, X_train, X_test, y_train, y_test)
# y_pred_xgb = train_evaluate_model(xgb_model, X_train, X_test, y_train, y_test)

# Model: LogisticRegression
# Accuracy: 0.9194
# --------------------------------------------------
# Model: DecisionTreeClassifier
# Accuracy: 0.9663
# --------------------------------------------------
# Model: XGBClassifier
# Accuracy: 0.9801 - seed 20, cod do meio
# Accuracy: 0.9795336980959741 - seed 20, 3/7
# Accuracy: 0.9794919929191882 - seed 20, 2/8
# Accuracy: 0.9847943682845498 - seed 20
# --------------------------------------------------
# KNN Results
# Accuracy: 0.9490543822815034


XGBoost Results
Accuracy: 0.9833716191181919
Confusion Matrix:
 [[303154    483]
 [  9615 294023]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98    303637
           1       1.00      0.97      0.98    303638

    accuracy                           0.98    607275
   macro avg       0.98      0.98      0.98    607275
weighted avg       0.98      0.98      0.98    607275

