In [1]:
# Install necessary packages
!pip install xgboost

# Standard library imports
import hashlib

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer, StandardScaler
from xgboost import XGBClassifier

# Local module imports
from clean_data_helper import *
from feature_summary import *
from plot_histograms import *
from preprocessing_helper import *
from numeric_imputer_helper import *
from categorical_imputer_helper import *
from transformer_helper import *
from encoder_helper import *
from rounding_helper import *



In [2]:
data = pd.read_csv('final_proj_data.csv')
test_data = pd.read_csv('final_proj_test.csv')

print(data.shape)
print(test_data.shape)

(10000, 231)
(2500, 230)


### Rough Cleaning

In [3]:
cleaned_data = remove_columns_with_missing_values(data, 0.18)
cleaned_test_data = remove_columns_with_missing_values(test_data, 0.18)

y = cleaned_data['y']
cleaned_data = cleaned_data.drop(columns=['y'])

print(cleaned_data.shape)
print(cleaned_test_data.shape)

New dataset shape after column removal: (10000, 67)
New dataset shape after column removal: (2500, 66)
(10000, 66)
(2500, 66)


In [4]:
numeric_df, categorical_df = split_dataset(cleaned_data)
print('numeric_df.shape: ', numeric_df.shape)
print('categorical_df.shape: ', categorical_df.shape)

numeric_test_df, categorical_test_df = split_dataset(cleaned_test_data)
print('numeric_test_df.shape: ', numeric_test_df.shape)
print('categorical_test_df.shape: ', categorical_test_df.shape)

numeric_df.shape:  (10000, 38)
categorical_df.shape:  (10000, 28)
numeric_test_df.shape:  (2500, 38)
categorical_test_df.shape:  (2500, 28)


### Numerical Features Preprocessing

In [5]:
feature_categories = categorize_features(numeric_df)
imputed_numeric_df = impute_data(numeric_df, feature_categories)

imputed_test_numeric_df = impute_data(numeric_test_df, feature_categories)

print(imputed_numeric_df.shape)
print(imputed_test_numeric_df.shape)

(10000, 38)
(2500, 38)


In [6]:
outliners_cleaned_df = remove_outlier_features(imputed_numeric_df, outlier_percentage_threshold=10, iqr_threshold=3)
outliners_test_cleaned_df = remove_outlier_features(imputed_test_numeric_df, outlier_percentage_threshold=10, iqr_threshold=3)

DataFrame before removing features:
(10000, 38)

Features with more than 10% outliers and their outlier percentages:
        Outlier Count  Outlier Percentage
Var113           1013               10.13
Var132           1632               16.32

DataFrame after removing features with more than 10% outliers:
(10000, 36)
DataFrame before removing features:
(2500, 38)

Features with more than 10% outliers and their outlier percentages:
        Outlier Count  Outlier Percentage
Var113            252               10.08
Var132            388               15.52

DataFrame after removing features with more than 10% outliers:
(2500, 36)


In [7]:
rounding_features = find_rounding_features(outliners_cleaned_df)
rounding_test_features = find_rounding_features(outliners_test_cleaned_df)

numeric_rounded_df = round_detected_features(outliners_cleaned_df, rounding_features)
numericTest_rounded_df = round_detected_features(outliners_test_cleaned_df, rounding_test_features)

In [8]:
categories = categorize_numeric_features_by_skewness_and_variance(numeric_rounded_df,
                                                                  skew_threshold=7,
                                                                  unique_value_threshold=7,
                                                                  variance_threshold=0.01)
transformed_numeric_df, transformers = transform_numeric_features(numeric_rounded_df, categories)
transformed_test_numeric_df = transform_numeric_features_test(numericTest_rounded_df, transformers, categories)

### Categorical Features Preprocessing

In [9]:
categories = categorize_categorical_features(categorical_df, low_threshold=10, medium_threshold=100)
imputed_categorical_df, imputers = impute_categorical_data(categorical_df, categories)
imputed_test_categorical_df = impute_categorical_data_test(categorical_test_df, categories, imputers)

In [10]:
encoding_categories = categorize_categorical_features_for_encoding(imputed_categorical_df, low_cardinality_threshold=10,
                                                                   high_cardinality_threshold=100)
encoded_categorical_df, encoders = encode_categorical_features(imputed_categorical_df, encoding_categories, target=y)
encoded_test_categorical_df = encode_categorical_features_test(imputed_test_categorical_df, encoders, encoding_categories)
encoded_test_categorical_df = encoded_test_categorical_df.reindex(columns=encoded_categorical_df.columns, fill_value=0)

### Model try

In [11]:
X = pd.concat([transformed_numeric_df, encoded_categorical_df], axis=1)
X_test = pd.concat([transformed_test_numeric_df, encoded_test_categorical_df], axis=1)
X_combined = pd.concat([X, X_test], axis=0).reset_index(drop=True)

In [12]:
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
model = XGBClassifier(eval_metric='logloss', random_state=42)
f1_scorer = make_scorer(f1_score, average='macro')

f1_scores = []
accuracies = []

for train_index, val_index in strat_kfold.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    model.fit(X_train_resampled, y_train_resampled)
    
    y_val_pred = model.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_val_pred)
    accuracies.append(accuracy)
    
    f1 = f1_score(y_val, y_val_pred, average='macro')
    f1_scores.append(f1)

print("\nF1 Scores for each fold:", f1_scores)
print("Average F1 Score:", np.mean(f1_scores))
print("Accuracies for each fold:", accuracies)
print("Average Accuracy:", np.mean(accuracies))


F1 Scores for each fold: [0.950605511995344, 0.941764621559633, 0.9397428409859925, 0.9432690964949029, 0.931917211328976, 0.940897180795524, 0.9377199933854338, 0.9450228789403794, 0.9321412075800459, 0.9465997383387179]
Average F1 Score: 0.9409680281404948
Accuracies for each fold: [0.977, 0.974, 0.974, 0.974, 0.968, 0.973, 0.971, 0.974, 0.969, 0.976]
Average Accuracy: 0.9730000000000001


In [13]:
y_test_pred = model.predict(X_test)

In [14]:
y_combined = np.concatenate([y, y_test_pred])

print("X_combined shape:", X_combined.shape)
print("y_combined shape:", y_combined.shape)

X_combined shape: (12500, 91)
y_combined shape: (12500,)


In [15]:
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
model2 = XGBClassifier(eval_metric='logloss', random_state=42)
f1_scorer = make_scorer(f1_score, average='macro')

f1_scores = []
accuracies = []

for train_index, val_index in strat_kfold.split(X_combined, y_combined):
    X_train, X_val = X_combined.iloc[train_index], X_combined.iloc[val_index]
    y_train, y_val = y_combined[train_index], y_combined[val_index]
    
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    model2.fit(X_train_resampled, y_train_resampled)
    
    y_val_pred = model.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_val_pred)
    accuracies.append(accuracy)
    
    f1 = f1_score(y_val, y_val_pred, average='macro')
    f1_scores.append(f1)

print("\nF1 Scores for each fold:", f1_scores)
print("Average F1 Score:", np.mean(f1_scores))
print("Accuracies for each fold:", accuracies)
print("Average Accuracy:", np.mean(accuracies))


F1 Scores for each fold: [0.9943017691867029, 0.9943017691867029, 0.9905574717592865, 0.9943667557470357, 0.9925103058191928, 0.9981222519156785, 0.9981114943518573, 1.0, 0.9981222519156785, 0.9943667557470357]
Average F1 Score: 0.9954760825629171
Accuracies for each fold: [0.9976, 0.9976, 0.996, 0.9976, 0.9968, 0.9992, 0.9992, 1.0, 0.9992, 0.9976]
Average Accuracy: 0.9980800000000001


In [16]:
y_common_pred = model2.predict(X_test)

In [17]:
# Assuming y_test_pred is your predictions and X_test has an 'ID' column
submission_df = pd.DataFrame({
    'index': X_test.index,  # or replace 'ID' with the column name if using specific column
    'y': y_common_pred
})


# Save to CSV
submission_df.to_csv("submission.csv", index=False)