In [1]:
# Install necessary packages
!pip install xgboost

# Standard library imports
import hashlib

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer, StandardScaler
from xgboost import XGBClassifier

# Local module imports
from clean_data_helper import *
from feature_summary import export_feature_summary
from plot_histograms import *
from preprocessing_helper import *
from round_to_nearest import round_to_nearest_multiple
from split_dataset_by_missing_and_type import *



In [2]:
data = pd.read_csv('final_proj_data.csv')
print(data.shape)

(10000, 231)


### Rough Cleaning

In [3]:
cleaned_data = remove_columns_with_missing_values(data, 0.25)
cleaned_data = remove_rows_with_missing_values(cleaned_data)

New dataset shape after column removal: (10000, 67)
New dataset shape after removing rows with more than 30% missing columns: (9080, 67)


In [4]:
numeric_df, categorical_df = split_dataset(cleaned_data)
# numeric_df.to_csv('numeric_df.csv', index=False)
# categorical_df.to_csv('categorical_df.csv', index=False)

### Imputation

#### Numericals

In [5]:
df_with_missing, df_without_missing = split_dataframe_by_missing_values(numeric_df)
export_feature_summary(df_with_missing, 'num_sum_missing.csv', unique_threshold = 200)
df_with_missing.to_csv('df_with_missing.csv', index=False)

Feature summary exported to num_sum_missing.csv


In [6]:
result = get_imputation_feature_lists_from_dataset(df_with_missing, missing_threshold=0.2)
print("result", result)

result {'low_unique_features': ['Var7'], 'moderate_unique_features': ['Var65', 'Var144'], 'high_unique_features': ['Var6', 'Var13', 'Var21', 'Var24', 'Var74', 'Var81', 'Var109', 'Var119', 'Var125', 'Var140', 'Var149'], 'high_missing_features': []}


In [7]:
# Assuming df_with_missing is a subset of another DataFrame `df`
# Ensure you create a copy to avoid SettingWithCopyWarning
df_with_missing = df_with_missing.copy()

# Define feature groups
low_unique_features = ['Var7']
moderate_unique_features = ['Var65', 'Var144']
high_unique_features = ['Var6', 'Var13', 'Var21', 'Var24', 'Var74', 
                        'Var81', 'Var109', 'Var119', 'Var125', 
                        'Var140', 'Var149']

# Numerical variables with a common divisor of 7
numerical_divisor_7_group = ['Var7']

# Numerical variables with a common divisor of 9
numerical_divisor_9_group = ['Var144', 'Var65']

# Imputation for low unique features (Mode Imputation)
mode_imputer = SimpleImputer(strategy='most_frequent')
df_with_missing.loc[:, low_unique_features] = mode_imputer.fit_transform(
    df_with_missing[low_unique_features]
)

# Imputation for moderate unique features (Median Imputation)
median_imputer = SimpleImputer(strategy='median')
df_with_missing.loc[:, moderate_unique_features] = median_imputer.fit_transform(
    df_with_missing[moderate_unique_features]
)

# Imputation for high unique features (Mean Imputation)
mean_imputer = SimpleImputer(strategy='mean')
df_with_missing.loc[:, high_unique_features] = mean_imputer.fit_transform(
    df_with_missing[high_unique_features]
)

# Apply rounding to nearest multiple of 7 for Var7
df_with_missing.loc[:, numerical_divisor_7_group] = df_with_missing[
    numerical_divisor_7_group
].apply(lambda x: round_to_nearest_multiple(x, 7))

# Apply rounding to nearest multiple of 9 for Var144 and Var65
for var in numerical_divisor_9_group:
    df_with_missing.loc[:, var] = df_with_missing[var].apply(
        lambda x: round_to_nearest_multiple(x, 9)
    )
print(df_with_missing.head())

     Var6  Var7   Var13  Var21  Var24  Var65  Var74      Var81  Var109  \
0   812.0  14.0  1252.0  156.0    0.0   27.0   14.0  227693.10    32.0   
1  2688.0   7.0  8820.0  364.0    4.0    9.0  210.0   17662.35   112.0   
2  1015.0  14.0  1784.0  136.0    2.0   18.0   98.0  190181.10    32.0   
3   168.0   0.0     0.0   24.0    0.0    9.0    0.0  348843.00    16.0   
4    14.0   0.0     0.0   36.0    0.0    9.0    0.0  235971.00     8.0   

   Var119   Var125  Var140  Var144    Var149  
0   525.0   4743.0   410.0    27.0       0.0  
1  1065.0  44541.0    60.0     9.0  398034.0  
2   625.0  14751.0  5720.0    27.0  554421.0  
3   275.0      0.0     0.0     9.0       0.0  
4    45.0      0.0     0.0     0.0       0.0  


In [8]:
# Concatenate df_with_missing and df_without_missing by columns (axis=1)
imputed_numeric_df = pd.concat([df_with_missing, df_without_missing], axis=1)
imputed_numeric_df.to_csv('imputed_numeric_df.csv', index=False)

#### Categorical

In [9]:
df_with_missing_cat, df_without_missing_cat = split_dataframe_by_missing_values(categorical_df)
export_feature_summary(df_with_missing_cat, 'df_with_missing_cat.csv', unique_threshold = 15)
df_with_missing_cat.to_csv('df_with_missing_cat.csv', index=False)

Feature summary exported to df_with_missing_cat.csv


In [10]:
df_with_missing_cat = df_with_missing_cat.copy()

mode_imputing_columns = ['Var208', 'Var218', 'Var205', 'Var203', 'Var197']
mode_imputer = SimpleImputer(strategy='most_frequent')
df_with_missing_cat[mode_imputing_columns] = mode_imputer.fit_transform(df_with_missing_cat[mode_imputing_columns])

columns_to_impute_with_unknown = ['Var223', 'Var219', 'Var206']
df_with_missing_cat[columns_to_impute_with_unknown] = df_with_missing_cat[columns_to_impute_with_unknown].fillna('Unknown')

columns_to_impute_frequency = ['Var197', 'Var192', 'Var217']
df_with_missing_cat[columns_to_impute_frequency] = mode_imputer.fit_transform(df_with_missing_cat[columns_to_impute_frequency])

In [11]:
imputed_categroical_df = pd.concat([df_without_missing_cat, df_with_missing_cat], axis=1)
full_imputed_data = pd.concat([imputed_numeric_df, imputed_categroical_df], axis=1)
full_imputed_data.shape

(9080, 67)

### Scaling And Transforming

#### Outlier Detection and Removal

In [12]:
y = imputed_numeric_df['y']
X_numeric = imputed_numeric_df.drop(columns=['y'])

# Apply the outlier removal function to the features only
outliners_cleaned_numeric_df = remove_outlier_features(X_numeric, outlier_percentage_threshold=7, iqr_threshold=3)

# Concatenate the cleaned features with the target column 'y'
outliners_cleaned_numeric_df = pd.concat([outliners_cleaned_numeric_df, y], axis=1)

DataFrame before removing features:
(9080, 38)

Features with more than 7% outliers and their outlier percentages:
        Outlier Count  Outlier Percentage
Var35             777            8.557269
Var78             717            7.896476
Var113            840            9.251101
Var132           1632           17.973568

DataFrame after removing features with more than 7% outliers:
(9080, 34)


In [13]:
power_transform_cols = [
    'Var6', 'Var13', 'Var21', 'Var24', 'Var65', 'Var74', 'Var109', 'Var119', 
    'Var125', 'Var140', 'Var44', 'Var83', 'Var85', 'Var112', 'Var123', 
    'Var143', 'Var73', 'Var76', 'Var134', 'Var133', 'Var160', 'Var163', 
    'Var173', 'Var181'
]
min_max_scaler_col = 'Var57'
standart_scaler_cols = ['Var7', 'Var153']

pt = PowerTransformer(method='yeo-johnson')
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

outliners_cleaned_numeric_df[power_transform_cols] = pt.fit_transform(outliners_cleaned_numeric_df[power_transform_cols])
outliners_cleaned_numeric_df[min_max_scaler_col] = min_max_scaler.fit_transform(outliners_cleaned_numeric_df[[min_max_scaler_col]])
outliners_cleaned_numeric_df[standart_scaler_cols] = standard_scaler.fit_transform(outliners_cleaned_numeric_df[standart_scaler_cols])

### Encoding

In [14]:
imputed_categroical_df.to_csv('imputed_categroical_df.csv', index=False)

In [15]:
binary_columns = ['Var218', 'Var208', 'Var211']
one_hot_columns = ['Var196', 'Var205', 'Var203', 'Var223', 'Var210', 'Var227', 'Var221']
frequency_encode_columns = ['Var207', 'Var195', 'Var219', 'Var206', 'Var226']
high_cardinality_features = [
    'Var192', 'Var216', 'Var199', 'Var222', 
    'Var220', 'Var198', 'Var202', 'Var217',
    'Var228', 'Var193', 'Var212', 'Var204',
    'Var197'
]

label_encoder = LabelEncoder()
target_encoder = TargetEncoder(cols=high_cardinality_features)

for col in binary_columns:
    imputed_categroical_df[col] = label_encoder.fit_transform(imputed_categroical_df[col])
imputed_categroical_df = pd.get_dummies(imputed_categroical_df, columns=one_hot_columns)

for col in frequency_encode_columns:
    freq = imputed_categroical_df[col].value_counts()
    imputed_categroical_df[col] = imputed_categroical_df[col].map(freq)


imputed_categroical_df[high_cardinality_features] = target_encoder.fit_transform(imputed_categroical_df[high_cardinality_features],
                                                                                 full_imputed_data['y'])
encoded_cat_df = imputed_categroical_df

### Model try

In [16]:
modeling_df = pd.concat([outliners_cleaned_numeric_df, encoded_cat_df], axis=1)

# Make sure 'y' is dropped from training features
X = modeling_df.drop(columns=['y'])
y = modeling_df['y']

In [17]:
# Set up StratifiedKFold and SMOTE
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define model
model = XGBClassifier(eval_metric='logloss', random_state=42)

# Define F1 score as our metric
f1_scorer = make_scorer(f1_score, average='macro')

# List to store metrics for each fold
f1_scores = []
accuracies = []

# Stratified K-Fold Cross-Validation
for train_index, val_index in strat_kfold.split(X, y):
    # Split the data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Apply SMOTE to the training data
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Fit the model on the balanced training data
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on the validation set
    y_val_pred = model.predict(X_val)
    
    # Calculate and store metrics
    accuracy = accuracy_score(y_val, y_val_pred)
    accuracies.append(accuracy)
    
    f1 = f1_score(y_val, y_val_pred, average='macro')
    f1_scores.append(f1)
    
    # Evaluate the model
    print("Fold Results:")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_val, y_val_pred))

# Calculate and print the average scores across folds
print("\nF1 Scores for each fold:", f1_scores)
print("Average F1 Score:", np.mean(f1_scores))
print("Accuracies for each fold:", accuracies)
print("Average Accuracy:", np.mean(accuracies))


Fold Results:
Accuracy: 0.9757709251101322
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1575
           1       0.91      0.91      0.91       241

    accuracy                           0.98      1816
   macro avg       0.95      0.95      0.95      1816
weighted avg       0.98      0.98      0.98      1816

Fold Results:
Accuracy: 0.9724669603524229
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1575
           1       0.89      0.90      0.90       241

    accuracy                           0.97      1816
   macro avg       0.94      0.94      0.94      1816
weighted avg       0.97      0.97      0.97      1816

Fold Results:
Accuracy: 0.9757709251101322
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      1574
           1       0.89      0.93   

### Test Data Preprocessing

In [18]:
test_data = pd.read_csv('final_proj_test.csv')

# Define columns to retain even if they aren't in the intersection
columns_to_save = ['Var205', 'Var203', 'Var223', 'Var196', 'Var210', 'Var227', 'Var221']

# Find common columns between modeling_df and test_data
common_columns = modeling_df.columns.intersection(test_data.columns)

# Combine common columns with columns_to_save, ensuring no duplicates
final_columns = common_columns.union(columns_to_save)

# Align test_data with these final columns, filling missing columns as needed
test_df_aligned = test_data.reindex(columns=final_columns)

In [19]:
test_df_aligned.loc[:, low_unique_features] = mode_imputer.fit_transform(
    test_df_aligned[low_unique_features]
)

test_df_aligned.loc[:, moderate_unique_features] = median_imputer.fit_transform(
    test_df_aligned[moderate_unique_features]
)

test_df_aligned.loc[:, high_unique_features] = mean_imputer.fit_transform(
    test_df_aligned[high_unique_features]
)

test_df_aligned.loc[:, numerical_divisor_7_group] = test_df_aligned[
    numerical_divisor_7_group
].apply(lambda x: round_to_nearest_multiple(x, 7))

for var in numerical_divisor_9_group:
    test_df_aligned.loc[:, var] = test_df_aligned[var].apply(
        lambda x: round_to_nearest_multiple(x, 9)
    )
print(test_df_aligned.head())

   Var109  Var112  Var119  Var123   Var125   Var13     Var133    Var134  \
0    32.0    48.0   595.0    78.0   5688.0     0.0  4101810.0  807760.0   
1   152.0    80.0  1320.0    36.0  16470.0  3864.0   689795.0  327752.0   
2    48.0    40.0   605.0    54.0      0.0     0.0  4018990.0  796264.0   
3    40.0    40.0   785.0   114.0  48267.0  5192.0  3744305.0   13754.0   
4    32.0    40.0   395.0     6.0    387.0    16.0  1536535.0  104290.0   

   Var140  Var143  ...     Var57    Var6  Var65  Var7  Var73  Var74  \
0    20.0     0.0  ...  1.667165   819.0    9.0   7.0    168    0.0   
1  1835.0     0.0  ...  5.056398  3192.0   45.0  28.0     46  518.0   
2     0.0     0.0  ...  3.020936   756.0    9.0   0.0     28    0.0   
3  2905.0     0.0  ...  4.014740  3892.0   27.0  21.0     88  371.0   
4   430.0     0.0  ...  1.277505   672.0    9.0   7.0     56    0.0   

       Var76      Var81 Var83 Var85  
0  3801816.0  186732.90  15.0   0.0  
1  1217520.0   13126.62   5.0  38.0  
2  21971

In [20]:
test_df_aligned[mode_imputing_columns] = mode_imputer.fit_transform(test_df_aligned[mode_imputing_columns])

test_df_aligned[columns_to_impute_with_unknown] = test_df_aligned[columns_to_impute_with_unknown].fillna('Unknown')

test_df_aligned[columns_to_impute_frequency] = mode_imputer.fit_transform(test_df_aligned[columns_to_impute_frequency])

test_df_aligned.shape

imputed_test_df = test_df_aligned

In [21]:
common_columns = X_numeric.columns.intersection(imputed_test_df.columns)

X_numeric_test = imputed_test_df[common_columns]

outliners_cleaned_test_numeric_df = remove_outlier_features(X_numeric_test, outlier_percentage_threshold=7, iqr_threshold=3)

imputed_test_df[common_columns] = outliners_cleaned_test_numeric_df

DataFrame before removing features:
(2500, 34)

Features with more than 7% outliers and their outlier percentages:
Empty DataFrame
Columns: [Outlier Count, Outlier Percentage]
Index: []

DataFrame after removing features with more than 7% outliers:
(2500, 34)


In [22]:
imputed_test_df[power_transform_cols] = pt.fit_transform(imputed_test_df[power_transform_cols])
imputed_test_df[min_max_scaler_col] = min_max_scaler.fit_transform(imputed_test_df[[min_max_scaler_col]])
imputed_test_df[standart_scaler_cols] = standard_scaler.fit_transform(imputed_test_df[standart_scaler_cols])

In [23]:
# Step 1: Binary encoding for binary columns
for col in binary_columns:
    imputed_test_df[col] = label_encoder.fit_transform(imputed_test_df[col])


imputed_test_df = pd.get_dummies(imputed_test_df, columns=one_hot_columns)

expected_columns = modeling_df.columns  # Columns from the training data
imputed_test_df = imputed_test_df.reindex(columns=expected_columns, fill_value=0)

# Step 3: Frequency encoding for moderate cardinality columns
for col in frequency_encode_columns:
    freq = imputed_test_df[col].value_counts()
    imputed_test_df[col] = imputed_test_df[col].map(freq)

# Step 4: Target encoding for high cardinality features (already fitted on training data)
imputed_test_df[high_cardinality_features] = target_encoder.transform(imputed_test_df[high_cardinality_features])


In [24]:
modelling_test_df = imputed_test_df
print(modelling_test_df.shape)
print(modeling_df.shape)

(2500, 91)
(9080, 91)


In [25]:
# For test data, ensure it aligns with the training feature set
X_test = modelling_test_df.reindex(columns=X.columns, fill_value=0)

# Then, proceed with prediction
y_test_pred = model.predict(X_test)