In [1]:
# Install necessary packages
!pip install xgboost

# Standard library imports
import hashlib

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer, StandardScaler
from xgboost import XGBClassifier

# Local module imports
from clean_data_helper import *
from feature_summary import export_feature_summary
from plot_histograms import *
from preprocessing_helper import *
from round_to_nearest import round_to_nearest_multiple
from split_dataset_by_missing_and_type import *
from numeric_imputer_helper import *
from categorical_imputer_helper import *
from transformer_helper import *
from encoder_helper import *



In [2]:
data = pd.read_csv('final_proj_data.csv')
test_data = pd.read_csv('final_proj_test.csv')

print(data.shape)
print(test_data.shape)

(10000, 231)
(2500, 230)


### Rough Cleaning

In [3]:
cleaned_data = remove_columns_with_missing_values(data, 0.18)
cleaned_test_data = remove_columns_with_missing_values(test_data, 0.18)

y = cleaned_data['y']
cleaned_data = cleaned_data.drop(columns=['y'])

print(cleaned_data.shape)
print(cleaned_test_data.shape)

New dataset shape after column removal: (10000, 67)
New dataset shape after column removal: (2500, 66)
(10000, 66)
(2500, 66)


In [4]:
numeric_df, categorical_df = split_dataset(cleaned_data)
print('numeric_df.shape: ', numeric_df.shape)
print('categorical_df.shape: ', categorical_df.shape)

numeric_test_df, categorical_test_df = split_dataset(cleaned_test_data)
print('numeric_test_df.shape: ', numeric_test_df.shape)
print('categorical_test_df.shape: ', categorical_test_df.shape)

numeric_df.shape:  (10000, 38)
categorical_df.shape:  (10000, 28)
numeric_test_df.shape:  (2500, 38)
categorical_test_df.shape:  (2500, 28)


### Numerical Features Preprocessing

In [5]:
feature_categories = categorize_features(numeric_df)
imputed_numeric_df = impute_data(numeric_df, feature_categories)

imputed_test_numeric_df = impute_data(numeric_test_df, feature_categories)

print(imputed_numeric_df.shape)
print(imputed_test_numeric_df.shape)

(10000, 38)
(2500, 38)


In [6]:
outliners_cleaned_df = remove_outlier_features(imputed_numeric_df, outlier_percentage_threshold=10, iqr_threshold=3)
outliners_test_cleaned_df = remove_outlier_features(imputed_test_numeric_df, outlier_percentage_threshold=10, iqr_threshold=3)

DataFrame before removing features:
(10000, 38)

Features with more than 10% outliers and their outlier percentages:
        Outlier Count  Outlier Percentage
Var113           1013               10.13
Var132           1632               16.32

DataFrame after removing features with more than 10% outliers:
(10000, 36)
DataFrame before removing features:
(2500, 38)

Features with more than 10% outliers and their outlier percentages:
        Outlier Count  Outlier Percentage
Var113            252               10.08
Var132            388               15.52

DataFrame after removing features with more than 10% outliers:
(2500, 36)


In [7]:
categories = categorize_numeric_features_by_skewness(outliners_cleaned_df, skew_threshold=0.5, unique_value_threshold=10)
transformed_numeric_df, transformers = transform_numeric_features(outliners_cleaned_df, categories)
transformed_test_numeric_df = transform_numeric_features_test(outliners_test_cleaned_df, transformers, categories)

### Categorical Features Preprocessing

In [8]:
categories = categorize_categorical_features(categorical_df, low_threshold=15, medium_threshold=100)
imputed_categorical_df, imputers = impute_categorical_data(categorical_df, categories)
imputed_test_categorical_df = impute_categorical_data_test(categorical_test_df, categories, imputers)
print(imputed_categorical_df.head(2))
print(imputed_test_categorical_df.head(2))

       Var192      Var193 Var195 Var196 Var197   Var198         Var199 Var202  \
0  KXMrEyXXnK  g62hiBSaKg   taul   1K8T   IvdZ  fhk21Ss        Pkku4gO   4MV4   
1  8Knvyx875g     2Knk1KF   taul   1K8T   lK27  eQgnKxT  Hz673939hbJdw   sbhw   

  Var203 Var204  ... Var217 Var218   Var219   Var220 Var221   Var222  \
0   9_Y1   SkZj  ...   G8WR   cJvF  AU8_WTd  4UxGlow   zCkv  catzS2D   
1   9_Y1   MBhA  ...   5smi   UYBR  AU8pNoi  GpvRJ5l   oslk  i06ocsg   

       Var223 Var226 Var227         Var228  
0  jySVZNlOJy   Aoh3   ZI9m  ib5G6X1eUxUn6  
1  LM8l689qOp   WqMG   RAYp        55YFVY9  

[2 rows x 28 columns]
       Var192               Var193 Var195 Var196 Var197   Var198      Var199  \
0  75lTmBtFkL  rEUOq2QD1qfkRr6qpua   taul   1K8T   F9g8  fhk21Ss  GVWDufEPyV   
1  1YVgUHXZeb                 RO12   taul   1K8T   0Xwj  Ml6kiFp  S85OeCBjSz   

  Var202 Var203 Var204  ... Var217 Var218 Var219   Var220 Var221   Var222  \
0   GHne   9_Y1   c2JP  ...   6ITF   cJvF   FzaX  4UxGlow   zCk

In [9]:
encoding_categories = categorize_categorical_features_for_encoding(imputed_categorical_df, low_cardinality_threshold=10, high_cardinality_threshold=100)
encoded_categorical_df, encoders = encode_categorical_features(imputed_categorical_df, encoding_categories, target=y)
encoded_test_categorical_df = encode_categorical_features_test(imputed_test_categorical_df, encoders, encoding_categories)
encoded_test_categorical_df = encoded_test_categorical_df.reindex(columns=encoded_categorical_df.columns, fill_value=0)

### Model try

In [10]:
X = pd.concat([transformed_numeric_df, encoded_categorical_df], axis=1)
X_test = pd.concat([transformed_test_numeric_df, encoded_test_categorical_df], axis=1)

In [11]:
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
model = XGBClassifier(eval_metric='logloss', random_state=42)
f1_scorer = make_scorer(f1_score, average='macro')

f1_scores = []
accuracies = []

for train_index, val_index in strat_kfold.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    model.fit(X_train_resampled, y_train_resampled)
    
    y_val_pred = model.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_val_pred)
    accuracies.append(accuracy)
    
    f1 = f1_score(y_val, y_val_pred, average='macro')
    f1_scores.append(f1)
    
    print("Fold Results:")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_val, y_val_pred))

print("\nF1 Scores for each fold:", f1_scores)
print("Average F1 Score:", np.mean(f1_scores))
print("Accuracies for each fold:", accuracies)
print("Average Accuracy:", np.mean(accuracies))

Fold Results:
Accuracy: 0.976
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       870
           1       0.88      0.94      0.91       130

    accuracy                           0.98      1000
   macro avg       0.94      0.96      0.95      1000
weighted avg       0.98      0.98      0.98      1000

Fold Results:
Accuracy: 0.971
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       870
           1       0.89      0.88      0.89       130

    accuracy                           0.97      1000
   macro avg       0.94      0.93      0.94      1000
weighted avg       0.97      0.97      0.97      1000

Fold Results:
Accuracy: 0.975
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       870
           1       0.96      0.85      0.90       130

    accuracy        

### Test Data Preprocessing

In [12]:
y_test_pred = model.predict(X_test)

In [13]:
# Assuming y_test_pred is your predictions and X_test has an 'ID' column
submission_df = pd.DataFrame({
    'index': X_test.index,  # or replace 'ID' with the column name if using specific column
    'y': y_test_pred
})

# Save to CSV
submission_df.to_csv('submission.csv', index=False)