## prepro v1.5

In [21]:
import numpy as np # linear algebra
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedShuffleSplit


In [22]:
# Load the raw transactions table from disk (path is relative to the notebook).
# Later cells rely on it containing at least 'Time', 'Amount' and 'Class' columns.
data1 = pd.read_csv(filepath_or_buffer='../raw_data/creditcard.csv')


In [23]:
# Work on a derived frame so the raw `data1` stays untouched, and add an 'Hour'
# column: whole hours elapsed (Time // 3600) wrapped to a 0-23 range.
# NOTE(review): this presumes 'Time' is expressed in seconds - confirm against the data source.
df = data1.assign(Hour=(data1['Time'] // 3600) % 24)

In [24]:
# The label is the 'Class' column; every remaining column is a candidate feature.
y = df['Class']
X = df.drop(columns=['Class'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the minority class on the training partition only, so the test set
# keeps its original distribution. sampling_strategy=0.2 asks for a
# minority/majority ratio of 0.2 after resampling (adjust if needed).
# NOTE(review): SMOTE runs before 'Time'/'Amount' are scaled in the next cell, so
# its nearest-neighbour search is computed on unscaled columns - consider scaling first.
smote = SMOTE(random_state=42, sampling_strategy=0.2)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [25]:
# Only these two columns are rescaled; all other columns are left as they are.
scaled_columns = ['Time', 'Amount']

# Learn the scaling statistics from the (resampled) training data alone, then
# apply the very same fitted scaler to both partitions so that no information
# from the test set influences the transformation.
scaler = RobustScaler().fit(X_train_smote[scaled_columns])
for frame in (X_train_smote, X_test):
    frame[scaled_columns] = scaler.transform(frame[scaled_columns])

In [26]:
# Replace 'Amount' with log(1 + Amount) in both partitions to reduce skewness.
# NOTE(review): 'Amount' has already been robust-scaled (median-centred) by the
# previous cell, so it can be negative here; log1p is undefined at or below -1.
# Consider taking the log of the raw amount before scaling.
for frame in (X_train_smote, X_test):
    frame['Log_Amount'] = np.log1p(frame['Amount'])
    # The original column is no longer needed once its log version exists.
    frame.drop(columns=['Amount'], inplace=True)


In [27]:

# Encode 'Hour' as a point on the unit circle so that hour 23 and hour 0 end up
# adjacent: the angle is the fraction of a 24-hour day expressed in radians.
# NOTE(review): in the training frame 'Hour' has passed through SMOTE, so
# synthetic rows may carry interpolated, non-integer hours - confirm this is acceptable.
for frame in (X_train_smote, X_test):
    hour_angle = 2 * np.pi * frame["Hour"] / 24
    frame["Hour_sin"] = np.sin(hour_angle)
    frame["Hour_cos"] = np.cos(hour_angle)

In [28]:
# The raw 'Hour' column is superseded by its sin/cos encoding; remove it from
# both partitions so they keep identical columns.
for frame in (X_train_smote, X_test):
    frame.drop(columns=["Hour"], inplace=True)


In [29]:
X_train_smote['Class'] = y_train_smote
# Compute correlation matrix
correlation_matrix = X_train_smote.corr()

In [30]:
corr = X_train_smote.corr()['Class'].sort_values(ascending=False)


In [31]:
# Hand-picked columns to discard from both partitions.
# NOTE(review): presumably chosen by inspecting `corr` for weak correlation with
# the target - the list is hard-coded, so re-check it if the data or seed changes.
low_corr_features = ['V26', 'V22', 'V25', 'V23', 'V13', 'Time']
for frame in (X_train_smote, X_test):
    frame.drop(columns=low_corr_features, inplace=True)

In [32]:
# Absolute correlation of every column with the target; used to decide which
# member of a redundant feature pair is the less informative one.
target_corr = correlation_matrix['Class'].abs()

# Restrict the pairwise scan to feature-vs-feature correlations. The target must
# not take part: otherwise a feature that is highly correlated with 'Class'
# would be treated as "redundant" and dropped, although it is the most useful one.
# Absolute values are used so that strongly negative correlations count as
# redundancy as well.
feature_corr = correlation_matrix.drop(index='Class', columns='Class').abs()

# Keep only the strict upper triangle so each unordered pair is visited once
# and the diagonal (self-correlation of 1.0) is ignored.
upper = feature_corr.where(np.triu(np.ones(feature_corr.shape), k=1).astype(bool))

# Collect pairs of features whose absolute correlation exceeds 0.85.
# Masked (NaN) cells compare as False and are skipped automatically.
high_corr_pairs = [
    (column, other)
    for column in upper.columns
    for other in upper.index
    if upper.loc[other, column] > 0.85
]

# From each pair, mark the feature with the weaker correlation to the target
# (ties drop the second feature). Each column is recorded only once.
columns_to_drop = []
for feature1, feature2 in high_corr_pairs:
    weaker = feature1 if target_corr[feature1] < target_corr[feature2] else feature2
    if weaker not in columns_to_drop:
        columns_to_drop.append(weaker)

# Drop the selected columns from BOTH partitions so that train and test keep the
# same feature set. errors='ignore' tolerates columns that were already removed
# earlier (the correlation matrix was computed before the low-correlation drop)
# and makes the cell safe to re-run.
X_train_smote.drop(columns=columns_to_drop, inplace=True, errors='ignore')
X_test.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [35]:
# Model LogisticRegression PreproV1.5

In [36]:
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression(
    # 'balanced' reweights samples inversely to class frequency, compensating
    # for whatever class imbalance remains after SMOTE.
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# Never fit on the label itself: an earlier cell may have written 'Class' into
# the training frame for correlation analysis. Dropping it here (a no-op when it
# is absent) prevents target leakage and keeps the training columns comparable
# with X_test, which has no 'Class' column.
train_features = X_train_smote.drop(columns=['Class'], errors='ignore')
model.fit(train_features, y_train_smote)