In [1]:
!pip install xgboost

import hashlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer, StandardScaler
from xgboost import XGBClassifier
from sklearn.decomposition import PCA

from clean_data_helper import *
from feature_summary import *
from plot_histograms import *
from preprocessing_helper import *
from numeric_imputer_helper import *
from categorical_imputer_helper import *
from transformer_helper import *
from encoder_helper import *
from rounding_helper import *
from corellation_based_reducer import *



In [2]:
# Read the training table (it carries the target column `y`) and the test table
# from CSV files located in the working directory.
data = pd.read_csv('final_proj_data.csv')
test_data = pd.read_csv('final_proj_test.csv')

# Show (rows, columns) of each table, one shape per line.
print(data.shape, test_data.shape, sep='\n')

(10000, 231)
(2500, 230)


### Rough Cleaning

In [3]:
# Drop sparsely populated columns from each table. 0.18 is the threshold handed
# to the helper (presumably the tolerated share of missing values — confirm in
# clean_data_helper). Judging by the recorded output, the helper also prints the
# resulting shape.
cleaned_data = remove_columns_with_missing_values(data, 0.18)
cleaned_test_data = remove_columns_with_missing_values(test_data, 0.18)

# Separate the target from the training features.
y = cleaned_data['y']
cleaned_data = cleaned_data.drop(columns=['y'])

# The two tables were filtered independently, so the surviving columns can
# differ between them. The training selection is authoritative: if the test
# table disagrees, rebuild it from the raw test data using exactly the training
# columns. A training column that is absent from the raw test data raises
# KeyError here instead of surfacing later as a feature mismatch.
if list(cleaned_test_data.columns) != list(cleaned_data.columns):
    cleaned_test_data = test_data[list(cleaned_data.columns)].copy()

print(cleaned_data.shape)
print(cleaned_test_data.shape)

New dataset shape after column removal: (10000, 67)
New dataset shape after column removal: (2500, 66)
(10000, 66)
(2500, 66)


In [4]:
# Split the cleaned training features into two frames that are preprocessed
# separately below (presumably by dtype — see split_dataset in the helpers).
numeric_df, categorical_df = split_dataset(cleaned_data)
print('numeric_df.shape: ', numeric_df.shape)
print('categorical_df.shape: ', categorical_df.shape)

# Same split for the test features; the printed shapes allow a quick check that
# both splits produced the same number of columns as their training counterparts.
numeric_test_df, categorical_test_df = split_dataset(cleaned_test_data)
print('numeric_test_df.shape: ', numeric_test_df.shape)
print('categorical_test_df.shape: ', categorical_test_df.shape)

numeric_df.shape:  (10000, 38)
categorical_df.shape:  (10000, 28)
numeric_test_df.shape:  (2500, 38)
categorical_test_df.shape:  (2500, 28)


### Numerical Features Preprocessing

In [5]:
# The feature grouping is derived from the training frame only and then reused,
# unchanged, to impute both the training and the test frame.
# NOTE(review): whether impute_data fits its statistics on the frame it receives
# (i.e. on the test rows for the second call) is not visible here — confirm.
feature_categories = categorize_features(numeric_df)
imputed_numeric_df = impute_data(numeric_df, feature_categories)
imputed_test_numeric_df = impute_data(numeric_test_df, feature_categories)

# Shapes of the imputed frames, one per line.
print(imputed_numeric_df.shape, imputed_test_numeric_df.shape, sep='\n')

(10000, 38)
(2500, 38)


In [6]:
# Stack train on top of test so the outlier screen is applied to one table and
# both splits end up with the same surviving columns.
# NOTE(review): this lets test rows influence which features are dropped —
# confirm that is intended.
combined_df_outliners = pd.concat([imputed_numeric_df, imputed_test_numeric_df])
outliners_cleaned_combined_df = remove_outlier_features(
    combined_df_outliners,
    outlier_percentage_threshold=6,
    iqr_threshold=3,
)

# Undo the stacking by position: the first len(train) rows are the training
# rows, everything after them is test. Both halves get a fresh 0..n-1 index.
outliners_cleaned_df = (
    outliners_cleaned_combined_df
    .iloc[:len(imputed_numeric_df)]
    .reset_index(drop=True)
)
outliners_test_cleaned_df = (
    outliners_cleaned_combined_df
    .iloc[len(imputed_numeric_df):]
    .reset_index(drop=True)
)

DataFrame before removing features:
(12500, 38)

Features with more than 6% outliers and their outlier percentages:
        Outlier Count  Outlier Percentage
Var35             948               7.584
Var78             910               7.280
Var113           1259              10.072
Var132           2020              16.160

DataFrame after removing features with more than 6% outliers:
(12500, 34)


In [7]:
# Detect the features to round on the training frame only, then apply that same
# selection to both splits. Detecting them separately on the test frame could
# round a different set of columns there, so train and test would no longer be
# transformed identically.
rounding_features = find_rounding_features(outliners_cleaned_df)

numeric_rounded_df = round_detected_features(outliners_cleaned_df, rounding_features)
numericTest_rounded_df = round_detected_features(outliners_test_cleaned_df, rounding_features)

In [8]:
# Stack train on top of test and drop highly correlated features (threshold 0.9)
# from the pooled table, so both splits keep an identical column set.
# NOTE(review): correlations are computed with test rows included — confirm
# that is intended.
combined_df = pd.concat([numeric_rounded_df, numericTest_rounded_df])
reduced_combined_df, dropped_columns_combined = correlation_based_elimination(
    combined_df,
    threshold=0.9,
)

# Split back by position: training rows first, test rows after; reindex each.
reduced_train_df = (
    reduced_combined_df
    .iloc[:len(numeric_rounded_df)]
    .reset_index(drop=True)
)
reduced_test_df = (
    reduced_combined_df
    .iloc[len(numeric_rounded_df):]
    .reset_index(drop=True)
)

In [9]:
# Group the numeric features for transformation, using the training frame only.
# The grouping gets its own name because the generic name `categories` is
# assigned again in the categorical section with a different grouping; sharing
# the name made out-of-order re-execution pick up the wrong one.
numeric_categories = categorize_numeric_features_by_skewness_and_variance(
    reduced_train_df,
    skew_threshold=7,
    unique_value_threshold=7,
    variance_threshold=0.01,
)

# Fit the transformers on train, then reuse the fitted transformers and the same
# grouping on test.
transformed_numeric_df, transformers = transform_numeric_features(reduced_train_df, numeric_categories)
transformed_test_numeric_df = transform_numeric_features_test(reduced_test_df, transformers, numeric_categories)

### Categorical Features Preprocessing

In [10]:
# Group the categorical features using the training frame (thresholds 10 / 100
# are passed as low / medium bounds).
categories = categorize_categorical_features(
    categorical_df,
    low_threshold=10,
    medium_threshold=100,
)

# Impute the training frame and keep the returned imputers so the test frame is
# imputed with the same objects.
imputed_categorical_df, imputers = impute_categorical_data(categorical_df, categories)
imputed_test_categorical_df = impute_categorical_data_test(
    categorical_test_df,
    categories,
    imputers,
)

In [11]:
# Decide, per categorical feature, which encoding group it belongs to
# (cardinality bounds 10 and 100), based on the imputed training frame.
encoding_categories = categorize_categorical_features_for_encoding(
    imputed_categorical_df,
    low_cardinality_threshold=10,
    high_cardinality_threshold=100,
)

# Fit the encoders on the training frame; the full training target `y` is handed
# to the helper.
# NOTE(review): if any encoder is target-based, fitting it on all of `y` before
# cross-validation can make the fold scores below optimistic — confirm.
encoded_categorical_df, encoders = encode_categorical_features(
    imputed_categorical_df,
    encoding_categories,
    target=y,
)

# Encode the test frame with the fitted encoders, then force it onto the exact
# training column layout: columns missing in test are created and filled with 0,
# columns unknown to train are dropped.
encoded_test_categorical_df = encode_categorical_features_test(
    imputed_test_categorical_df,
    encoders,
    encoding_categories,
).reindex(columns=encoded_categorical_df.columns, fill_value=0)

### Model Training and Evaluation

In [12]:
# Build the design matrices by placing the processed numeric columns next to the
# encoded categorical columns. A column-wise concat aligns rows on the index.
X = pd.concat([transformed_numeric_df, encoded_categorical_df], axis=1)
X_test = pd.concat([transformed_test_numeric_df, encoded_test_categorical_df], axis=1)

# If the two halves carried different indexes, the concat above would silently
# add NaN-padded rows; fail fast instead of training on a corrupted matrix.
assert len(X) == len(y), f"X has {len(X)} rows but y has {len(y)}"
assert len(X_test) == len(test_data), f"X_test has {len(X_test)} rows but test_data has {len(test_data)}"

# The stacked train+test frame is not built here: it is created later from the
# selected features only, and an earlier copy would be overwritten before use.

In [13]:
# Fixed XGBoost hyper-parameters used for every model in this notebook
# (presumably the result of an earlier search — the search itself is not shown).
best_params = {
    'subsample': 1.0,
    'reg_lambda': 1,
    'reg_alpha': 0.01,
    'n_estimators': 200,
    'max_depth': 7,
    'learning_rate': 0.1,
    'gamma': 0,
    'colsample_bytree': 0.6
}

model = XGBClassifier(eval_metric='logloss', random_state=42, **best_params)

# 10-fold stratified CV with a fixed seed so the fold assignment is reproducible.
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
f1_scores = []
accuracies = []
# One list of per-fold importances for every feature column of X.
feature_importance_dict = {feature: [] for feature in X.columns}

for train_index, val_index in strat_kfold.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Oversample the training part of the fold only; the validation part keeps
    # its original class balance so the scores are measured on real rows.
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    model.fit(X_train_resampled, y_train_resampled)

    # feature_importances_ follows the column order of the fitted data, which is
    # the column order of X.
    for feature, importance in zip(X.columns, model.feature_importances_):
        feature_importance_dict[feature].append(importance)

    y_val_pred = model.predict(X_val)
    accuracies.append(accuracy_score(y_val, y_val_pred))
    f1_scores.append(f1_score(y_val, y_val_pred, average='macro'))

print("\nF1 Scores for each fold:", f1_scores)
print("Average F1 Score:", np.mean(f1_scores))
print("Accuracies for each fold:", accuracies)
print("Average Accuracy:", np.mean(accuracies))

# Keep the features whose importance, averaged over the folds, reaches the
# threshold. Note that the scores printed above were obtained with ALL features,
# not with this reduced set.
avg_importances = {
    feature: np.mean(fold_importances)
    for feature, fold_importances in feature_importance_dict.items()
}
importance_threshold = 0.008
selected_features = [
    feature
    for feature, avg_importance in avg_importances.items()
    if avg_importance >= importance_threshold
]

print("\nSelected Features Based on Importance Threshold:", selected_features)


F1 Scores for each fold: [0.9523029025851826, 0.939322433844598, 0.9443780070639931, 0.9452751674032629, 0.9439867636413775, 0.940897180795524, 0.9496531540110019, 0.9492518882526579, 0.9412756239464956, 0.9441903243881585]
Average F1 Score: 0.9450533445932251
Accuracies for each fold: [0.978, 0.973, 0.976, 0.975, 0.974, 0.973, 0.977, 0.976, 0.973, 0.975]
Average Accuracy: 0.975

Selected Features Based on Importance Threshold: ['Var7', 'Var24', 'Var44', 'Var65', 'Var73', 'Var74', 'Var143', 'Var144', 'Var173', 'Var193', 'Var198', 'Var199', 'Var202', 'Var207', 'Var211', 'Var212', 'Var216', 'Var217', 'Var218', 'Var220', 'Var222', 'Var205_09_Q', 'Var205_sJzTlal', 'Var221_Al6ZaUT', 'Var221_oslk', 'Var221_zCkv', 'Var227_6fzt']


In [14]:
# Narrow both design matrices down to the features kept by the importance screen.
X_train_selected = X.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

# Refit the classifier on every training row (no oversampling at this step) and
# label the test rows; these labels serve as pseudo-labels further down.
y_test_pred = model.fit(X_train_selected, y).predict(X_test_selected)

In [15]:
# Stack the selected-feature training rows on top of the selected-feature test
# rows. The original row labels are kept, so the index contains duplicates and
# later code must address rows by position.
X_combined = pd.concat([X_train_selected, X_test_selected])

# Matching label vector: true training labels followed by the predicted
# (pseudo) labels of the test rows.
y_combined = np.concatenate((y.to_numpy(), y_test_pred))

In [16]:
# Cross-validate a fresh classifier on the stacked train + pseudo-labelled test
# rows, using the same fold scheme and hyper-parameters as before.
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
model2 = XGBClassifier(eval_metric='logloss', random_state=42, **best_params)
f1_scores = []
accuracies = []

for train_index, val_index in strat_kfold.split(X_combined, y_combined):
    # X_combined has duplicate index labels, hence positional (.iloc) access;
    # y_combined is a NumPy array and is indexed positionally as well.
    X_train, X_val = X_combined.iloc[train_index], X_combined.iloc[val_index]
    y_train, y_val = y_combined[train_index], y_combined[val_index]

    # Oversample the training part of the fold only.
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    model2.fit(X_train_resampled, y_train_resampled)

    y_val_pred = model2.predict(X_val)

    accuracy = accuracy_score(y_val, y_val_pred)
    accuracies.append(accuracy)

    f1 = f1_score(y_val, y_val_pred, average='macro')
    f1_scores.append(f1)

print("\nF1 Scores for each fold:", f1_scores)
print("Average F1 Score:", np.mean(f1_scores))
print("Accuracies for each fold:", accuracies)
print("Average Accuracy:", np.mean(accuracies))

# Final predictions must come from the model trained on the combined data.
# Previously `model` (fitted on the training rows only) was used here, which
# merely reproduced y_test_pred and discarded the pseudo-labelling stage, while
# `model2` was left fitted on the last CV fold. Refit model2 on ALL combined
# rows with the same oversampling that was validated above, then predict.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_combined_resampled, y_combined_resampled = smote.fit_resample(X_combined, y_combined)
model2.fit(X_combined_resampled, y_combined_resampled)

y_common_pred = model2.predict(X_test[selected_features])


F1 Scores for each fold: [0.9309160162814145, 0.9216709384662176, 0.9466622777340916, 0.911129016968433, 0.9530562978919649, 0.9230123285428224, 0.9329722773339053, 0.9300338783325969, 0.9478673268152598, 0.941405334458351]
Average F1 Score: 0.9338725692825056
Accuracies for each fold: [0.9704, 0.9672, 0.9776, 0.9616, 0.98, 0.9672, 0.9712, 0.9696, 0.9776, 0.9744]
Average Accuracy: 0.9716799999999999


In [17]:
# Two-column submission table: the row label of each test row (exposed as the
# `index` column) and its predicted class `y`.
submission_df = (
    pd.DataFrame({'y': y_common_pred}, index=X_test.index)
    .rename_axis('index')
    .reset_index()
)

# Write without the DataFrame's own positional index; `index` is already a column.
submission_df.to_csv("submission.csv", index=False)