# EDA
___

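This notebook performs exploratory data analysis (EDA) on the transactions dataset: it inspects the raw data, checks for missing values and class balance, plots feature distributions, estimates feature importance with logistic regression, and examines correlations with the target column `class`.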

## Import packages

In [None]:
import sys
sys.path.append('../')

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import KFold

from utils.read_data import read_data

In [None]:
# Random seed passed as `random_state` to the train/validation/test splits and
# to the logistic regression models below, so repeated runs give the same
# results.
SEED: int = 8

## Reading data

In [None]:
# Relative location of the dataset, handed to `read_data` in the next cell.
DATASET_PATH = "../dataset/"

In [None]:
# Load the dataset with the project helper and preview the first rows.
# NOTE(review): `read_data` lives in utils/read_data.py (not shown here); the
# cells below treat its result as a pandas DataFrame — confirm.
df = read_data(DATASET_PATH)
df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint. memory_usage='deep'
# makes pandas measure the real size of object columns instead of estimating.
df.info(memory_usage='deep')

In [None]:
# Summary statistics, transposed so that each column becomes one row.
df.describe().T

In [None]:
# Normalise every column name to lowercase so that later cells can refer to
# them uniformly (e.g. 'class', 'v1' ... 'v28'), then display the result.
df.columns = df.columns.map(str.lower)
df.columns

## Exploratory Data Analysis

### Missing values

In [None]:
# Per-column flag: True when the column contains at least one missing value.
df.isnull().any()

### Relationship between fraudulent and non-fraudulent transactions

In [None]:
# Share of each value of `class`, expressed as a percentage of all rows.
df["class"].value_counts(normalize=True)*100

### Distribution by column

In [None]:
# Static overview: one 50-bin histogram per numeric column of `df`, drawn on
# a single large canvas.
df.hist(figsize=(20, 18), bins=50)
plt.tight_layout()
plt.show()

In [None]:
# Names of the feature columns v1 ... v28.
features = [f'v{idx}' for idx in range(1, 29)]

# Overlay one histogram trace per feature. Only the first trace is drawn
# initially; the others start hidden and can be toggled from the legend.
fig = go.Figure()
for position, column in enumerate(features):
    trace = go.Histogram(
        x=df[column],
        name=column,
        nbinsx=50,
        visible='legendonly' if position else True,
    )
    fig.add_trace(trace)

fig.update_layout(
    title="Distribution of Features (v1-v28)",
    xaxis_title="Value",
    yaxis_title="Count",
    barmode='overlay',
    margin={'l': 20, 'r': 20, 't': 50, 'b': 20},
)
fig.show()

# Keep an interactive copy of the figure on disk.
fig.write_html('images/eda_distribution_features.html')

In [None]:
# Columns that are not one of the v1..v28 features; `features` is the list
# built in the previous cell.
non_feature_cols = [col for col in df.columns if col not in features]

# One subplot per remaining column. The grid width follows the number of
# columns found instead of being fixed at 2, so an additional column gets its
# own panel rather than making add_trace fail with an out-of-range subplot
# reference. max(..., 1) keeps make_subplots valid if nothing is left over.
n_panels = max(len(non_feature_cols), 1)
fig = make_subplots(rows=1, cols=n_panels, subplot_titles=non_feature_cols)

# Plotly subplot columns are 1-based, hence start=1.
for panel, col in enumerate(non_feature_cols, start=1):
    fig.add_trace(
        go.Histogram(
            x=df[col],
            name=col,
            opacity=0.75,
            nbinsx=50,
            showlegend=False
        ),
        row=1, col=panel
    )

fig.update_layout(
    title_text="Distribution of Amount and Class Features (non-v1-v28)",
    margin=dict(l=20, r=20, t=50, b=20),
)
fig.show()

# Keep an interactive copy of the figure on disk.
fig.write_html('images/eda_distribution_features_non_v1_v28_subplots.html')

### Feature importance

In [None]:
# Hold out 20% of the rows as the test set, then carve 15% of the remaining
# 80% off as validation (overall roughly 68% train / 12% validation / 20% test).
# Both calls use SEED so the partition is reproducible.
# NOTE(review): the splits are not stratified on `class`; if the classes are
# heavily imbalanced, consider passing `stratify=` — confirm intent.
df_full_train, df_test = train_test_split(df, test_size=.2, random_state=SEED)
df_train, df_val = train_test_split(df_full_train, test_size=.15, random_state=SEED)

In [None]:
# Persist the combined train+validation frame and the test frame as CSV files
# in the working directory, without writing the index column.
for frame, filename in (
    (df_full_train, 'df_full_train.csv'),
    (df_test, 'df_test.csv'),
):
    frame.to_csv(filename, index=False)

In [None]:
# Separate the target from the features for each split: `pop` returns the
# `class` column and removes it from the frame in one step, leaving only the
# feature columns behind.
y_train = df_train.pop("class").values
y_val = df_val.pop("class").values
y_test = df_test.pop("class").values

In [None]:
# Sanity check: report the shape of every target array and feature frame,
# targets first, then frames.
for label, data in (
    ('y_train', y_train),
    ('y_val', y_val),
    ('y_test', y_test),
    ('df_train', df_train),
    ('df_val', df_val),
    ('df_test', df_test),
):
    print(f'Shape {label}: {data.shape}')

In [None]:
# Baseline L2-regularised logistic regression fitted on all feature columns
# of the training split; its coefficients are inspected in the next cell.
# n_jobs is deliberately not passed: scikit-learn does not parallelise the
# 'liblinear' solver and only emits a warning when n_jobs is set with it.
tmp_model = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='liblinear',
    random_state=SEED,
    max_iter=1000,
)
tmp_model.fit(df_train, y_train)

In [None]:
# Pair every training column with its fitted coefficient.
coefficients = pd.DataFrame({
    'feature': df_train.columns,
    'importance': tmp_model.coef_[0],
})

# Rank by absolute coefficient (largest first) and renumber the rows.
# NOTE(review): raw coefficients are only comparable across features that
# share a scale — confirm whether the inputs are standardised.
feature_importance = (
    coefficients
    .sort_values(by='importance', key=abs, ascending=False)
    .reset_index(drop=True)
)
feature_importance.T

In [None]:
# Univariate screening: for every feature, fit a one-column logistic
# regression on the training split and draw its ROC curve on the validation
# split, with the AUC shown in the legend.
plt.figure(figsize=(16, 8))

for feature in df_val.columns:
    # Fit a univariate logistic regression model
    model = LogisticRegression(solver='liblinear', random_state=SEED)
    model.fit(df_train[[feature]], y_train)
    y_val_pred_proba = model.predict_proba(df_val[[feature]])[:, 1]
    fpr, tpr, _ = roc_curve(y_val, y_val_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label=f'{feature} (AUC={roc_auc:.2f})')

# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], color='grey', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Each Feature')
plt.legend(fontsize='small', loc='lower right', ncol=2)
plt.grid(True)
plt.tight_layout()

# Save BEFORE show(): after show() the current figure is released, so a later
# savefig() would write an empty image.
plt.savefig('images/eda_roc_curve_features.png', dpi=300)
plt.show()

In [None]:
# Probability of the second class (column 1 of predict_proba) from the full
# model, for each split in turn.
y_train_pred_proba, y_val_pred_proba, y_test_pred_proba = (
    tmp_model.predict_proba(split)[:, 1]
    for split in (df_train, df_val, df_test)
)

# ROC curve points per split; the thresholds are not needed.
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred_proba)
fpr_val, tpr_val, _ = roc_curve(y_val, y_val_pred_proba)
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred_proba)

# Collect the curves in one DataFrame. Each array is wrapped in a Series so
# that curves of different lengths can share the frame; shorter ones are
# padded with NaN.
roc_columns = {
    'fpr_train': fpr_train,
    'tpr_train': tpr_train,
    'fpr_val': fpr_val,
    'tpr_val': tpr_val,
    'fpr_test': fpr_test,
    'tpr_test': tpr_test,
}
roc_df = pd.DataFrame(
    {name: pd.Series(values) for name, values in roc_columns.items()}
)

In [None]:
# (legend name, false-positive rates, true-positive rates, initial visibility)
# Only the training curve is drawn at first; the other two can be switched on
# from the legend.
curves = [
    ('Train', fpr_train, tpr_train, True),
    ('Validation', fpr_val, tpr_val, 'legendonly'),
    ('Test', fpr_test, tpr_test, 'legendonly'),
]

fig = go.Figure()
for label, fpr_values, tpr_values, initial_visibility in curves:
    fig.add_trace(go.Scatter(
        x=fpr_values,
        y=tpr_values,
        mode='lines',
        name=label,
        visible=initial_visibility,
    ))

fig.update_layout(
    title='ROC Curve (Train, Validation, Test)',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    legend_title='Dataset',
    width=800,
    height=500,
)

fig.show()

# Keep an interactive copy of the figure on disk.
fig.write_html('images/roc_curve_train_val_test.html')

### Correlation between features

In [None]:
# Pairwise correlation matrix of all columns of `df` (pandas defaults to
# Pearson), then the absolute correlation of every column with `class`,
# strongest first.
df_corr = df.corr()
df_corr['class'].abs().sort_values(ascending=False)

In [None]:
# Single-row heatmap of each column's Pearson correlation with `class`,
# ordered from most positive to most negative. `df_corr` is the correlation
# matrix computed in the previous cell.
f, ax = plt.subplots(figsize=(20, 4))

my_plot = sns.heatmap(
    df_corr.filter(items=['class']).sort_values(by='class', ascending=False).T,
    cmap=sns.color_palette("coolwarm"),
    linewidths=0.5,
    annot=True,
    vmax=0.8,
    fmt=".2f",
)

# Font sizes and labels for the individual elements.
plt.tick_params(axis='both', which='major', labelsize=12)
plt.xlabel('Variables', fontsize=14)
plt.ylabel('Correlation', fontsize=14)
plt.title('Pearson Correlation with Class', fontsize=16)
# Rotate the column names once, right-aligned so they end under their cell.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save BEFORE show(): after show() the current figure is released, so a later
# savefig() would write an empty image.
plt.savefig('./images/eda_pearson_correlation_heatmap.png', dpi=300)
plt.show()

In [None]:
# Annotated heatmap of the complete correlation matrix, with square cells and
# the colour scale capped at 0.8.
plt.figure(figsize=(18, 14))
sns.heatmap(
    df_corr,
    square=True,
    linewidths=.5,
    annot=True,
    fmt=".2f",
    vmax=0.8,
)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()