In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from scipy.stats import zscore

# Load data from CSV file (path is relative to the notebook's working directory)
df = pd.read_csv('Final Transactions.csv')

# Display basic info about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line after the summary.
df.info()
print(df.describe())
print(df.head())

# ---------------------------------------------------------------------------
# Define features (X) and target variable (y)
# ---------------------------------------------------------------------------
# Columns excluded from the feature matrix:
#   * TX_FRAUD           - the target itself; leaving it in X leaks the label.
#   * TX_FRAUD_SCENARIO  - presumably derived from the label (NOTE(review): confirm).
#   * 'Unnamed: 0'       - unnamed CSV column, presumably a saved row index
#                          (its summary statistics match TRANSACTION_ID).
#   * TRANSACTION_ID, CUSTOMER_ID, TERMINAL_ID - identifier columns.
#   * TX_DATETIME        - string timestamp; TX_TIME_SECONDS / TX_TIME_DAYS are
#                          presumably its numeric encoding (NOTE(review): confirm).
# TX_AMOUNT, TX_TIME_SECONDS and TX_TIME_DAYS are kept as features: with the label
# removed from X there would otherwise be no usable feature left.
TARGET_COLUMN = 'TX_FRAUD'
NON_FEATURE_COLUMNS = [
    'Unnamed: 0', 'TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
    'TX_FRAUD_SCENARIO', TARGET_COLUMN,
]
Z_THRESHOLD = 3  # |z| at or above this marks a training row as an outlier

# errors='ignore' tolerates a re-exported CSV that no longer has 'Unnamed: 0'
X = df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y = df[TARGET_COLUMN]
assert TARGET_COLUMN not in X.columns, "target leaked into the feature matrix"
assert X.shape[1] > 0, "no feature columns left after dropping non-features"

# Identify numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# ---------------------------------------------------------------------------
# Split FIRST, so every fitted step below only ever sees training data
# ---------------------------------------------------------------------------
# stratify=y keeps the fraud rate the same in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing for numerical and categorical data.
# sparse_threshold=0 forces a dense ndarray even when one-hot columns are present,
# which the slicing / masking below relies on.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    sparse_threshold=0)

# Fit on the training partition only; the test partition is merely transformed
X_train_prep = preprocessor.fit_transform(X_train_raw)
X_test_prep = preprocessor.transform(X_test_raw)

# ---------------------------------------------------------------------------
# Handle outliers using the Z-score method (training rows only)
# ---------------------------------------------------------------------------
# The 'num' transformer comes first, so the leading len(numerical_features) columns
# are the standardized numerical features. StandardScaler already centres and
# scales them with training statistics, i.e. they are z-scores, so no second
# zscore() pass is needed (and constant columns cannot turn into NaN).
# One-hot columns are deliberately not z-scored.
n_numeric = len(numerical_features)
inlier_mask = (np.abs(X_train_prep[:, :n_numeric]) < Z_THRESHOLD).all(axis=1)
y_train_inliers = y_train_raw[inlier_mask]
if y_train_inliers.nunique() == y_train_raw.nunique():
    X_train_prep = X_train_prep[inlier_mask]
else:
    # Filtering would wipe out an entire class (SMOTE and the classifier need
    # both), so keep every training row instead.
    print("Skipping outlier removal: it would remove every sample of a class")
    y_train_inliers = y_train_raw

# ---------------------------------------------------------------------------
# Feature selection: select top k features (fit on training data)
# ---------------------------------------------------------------------------
k = min(10, X_train_prep.shape[1])  # k must not exceed the number of features
selector = SelectKBest(f_classif, k=k)
X_selected = selector.fit_transform(X_train_prep, y_train_inliers)
X_test_selected = selector.transform(X_test_prep)

# Apply PCA only if there is more than one feature
if X_selected.shape[1] > 1:
    pca_components = min(5, X_selected.shape[1])  # n_components <= number of features
    pca = PCA(n_components=pca_components)
    X_pca = pca.fit_transform(X_selected)
    X_test_pca = pca.transform(X_test_selected)
else:
    # PCA is not applicable; use the selected features as they are
    X_pca = X_selected
    X_test_pca = X_test_selected

# ---------------------------------------------------------------------------
# Oversample the minority class with SMOTE - training data only
# ---------------------------------------------------------------------------
# Resampling before the split would place synthetic samples in the test set and
# make the test score neither independent nor representative of the real class
# balance.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_pca, y_train_inliers)

# Final model inputs: balanced training set, untouched (only transformed) test set
X_train, y_train = X_resampled, y_resampled
X_test = X_test_pca

# Initialize and train the Decision Tree model (or any other model)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Evaluate the model
# ---------------------------------------------------------------------------
train_score = model.score(X_train, y_train)  # measured on the SMOTE-balanced training set
y_pred = model.predict(X_test)
test_score = accuracy_score(y_test, y_pred)  # measured on the original class balance
accuracy = test_score  # same value; name retained from the original cell

print("Training accuracy:", train_score)
print("Testing accuracy:", test_score)

# Accuracy alone says little on an imbalanced target, so also show the
# per-class breakdown of predictions.
print(pd.crosstab(y_test.to_numpy(), y_pred, rownames=['actual'], colnames=['predicted']))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754155 entries, 0 to 1754154
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0         int64  
 1   TRANSACTION_ID     int64  
 2   TX_DATETIME        object 
 3   CUSTOMER_ID        int64  
 4   TERMINAL_ID        int64  
 5   TX_AMOUNT          float64
 6   TX_TIME_SECONDS    int64  
 7   TX_TIME_DAYS       int64  
 8   TX_FRAUD           int64  
 9   TX_FRAUD_SCENARIO  int64  
dtypes: float64(1), int64(8), object(1)
memory usage: 133.8+ MB
None
         Unnamed: 0  TRANSACTION_ID   CUSTOMER_ID   TERMINAL_ID     TX_AMOUNT  \
count  1.754155e+06    1.754155e+06  1.754155e+06  1.754155e+06  1.754155e+06   
mean   8.770770e+05    8.770770e+05  2.504011e+03  4.996733e+03  5.396820e+02   
std    5.063811e+05    5.063811e+05  1.445987e+03  2.886101e+03  1.179711e+03   
min    0.000000e+00    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    4.385385e+05    4.3853