In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily flatten the directory walk into a stream of full file paths,
# then print each path on its own line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml-fundamentals-and-applications-2024-10-01/final_proj_data.csv
/kaggle/input/ml-fundamentals-and-applications-2024-10-01/final_proj_test.csv
/kaggle/input/ml-fundamentals-and-applications-2024-10-01/final_proj_sample_submission.csv


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Importing necessary libraries for data processing and model building
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
import xgboost as xgb
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Paths to the Kaggle competition input files
train_data_path = '/kaggle/input/ml-fundamentals-and-applications-2024-10-01/final_proj_data.csv'
test_data_path = '/kaggle/input/ml-fundamentals-and-applications-2024-10-01/final_proj_test.csv'


def _load_csv(path, label):
    """Read a CSV file into a DataFrame, failing loudly when it is missing.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    label : str
        Human-readable dataset name used in the status and error messages
        (e.g. "Training", "Test").

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist. The error is re-raised with the dataset
        label and path instead of being printed and ignored.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError as exc:
        # Do not print-and-continue: in a notebook the kernel namespace may
        # still hold a frame from an earlier run, so later
        # `'train_data' in locals()` checks would silently use stale data.
        raise FileNotFoundError(f"{label} data not found at {path!r}.") from exc
    print(f"{label} data loaded successfully.")
    return frame


# Loading the data (both frames are required by the steps that follow)
train_data = _load_csv(train_data_path, "Training")
test_data = _load_csv(test_data_path, "Test")

# Drop columns in which every value is missing, separate the target column
# 'y' from the features, and create a stratified train/validation split.
if 'train_data' in locals():
    train_data = train_data.dropna(axis='columns', how='all')
    print(f"Training data shape after removing columns with all missing values: {train_data.shape}")

    # errors='ignore' keeps the feature frame intact when 'y' is absent
    X = train_data.drop(columns=['y'], errors='ignore')
    y = None
    if 'y' in train_data.columns:
        y = train_data['y']

    if y is not None:
        # Hold out 20% of the rows for validation; stratify on the target
        X_train, X_val, y_train, y_val = train_test_split(
            X,
            y,
            test_size=0.2,
            stratify=y,
            random_state=42,
        )
    else:
        print("Target variable 'y' not found in the data.")

# Preprocess the features, train the voting ensemble, evaluate it on the
# validation split, cross-validate it, and write the submission file.
if 'y' in locals() and y is not None:
    # ---- Fail fast on the submission prerequisites ---------------------------
    # These checks run before any fitting so that a missing or wrongly sized
    # test set is reported immediately, not after the ensemble fit and the
    # 5-fold cross-validation have already completed.
    EXPECTED_TEST_ROWS = 2500
    if 'test_data' not in locals():
        raise RuntimeError("Test data is not loaded; cannot create the submission file.")
    # Ensuring that the test dataset has exactly 2500 rows
    assert len(test_data) == EXPECTED_TEST_ROWS, "The number of rows in the test dataset must be 2500."

    # ---- Preprocessing --------------------------------------------------------
    # Column groups are selected purely by dtype. Columns whose dtype is in
    # neither list are not passed to any transformer.
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Numeric columns: mean imputation followed by standardisation
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: mode imputation followed by one-hot encoding;
    # categories unseen during fit are ignored at transform time
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # ---- Models ---------------------------------------------------------------
    # Defining individual models for ensemble with improved parameters
    rf_model = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=42, class_weight='balanced', min_samples_split=5)
    gb_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
    # NOTE(review): `use_label_encoder` is reported as deprecated by recent
    # xgboost releases - confirm it is still needed for the installed version.
    xgb_model = xgb.XGBClassifier(n_estimators=500, max_depth=10, learning_rate=0.03, random_state=42, scale_pos_weight=10, use_label_encoder=False, eval_metric='logloss')
    lgb_model = lgb.LGBMClassifier(n_estimators=500, max_depth=10, learning_rate=0.03, random_state=42, class_weight='balanced', num_leaves=50)

    # Full pipeline: preprocessing -> SMOTE oversampling -> soft-voting ensemble.
    # SMOTE sits inside the imblearn pipeline so it is fitted together with
    # the other steps on whatever data the pipeline is trained on.
    ensemble_model = ImbPipeline(steps=[('preprocessor', preprocessor),
                                        ('smote', SMOTE(random_state=42, sampling_strategy=0.75)),
                                        ('classifier', VotingClassifier(estimators=[
                                            ('rf', rf_model),
                                            ('gb', gb_model),
                                            ('xgb', xgb_model),
                                            ('lgb', lgb_model)],
                                            voting='soft'))])

    # Training the ensemble model
    ensemble_model.fit(X_train, y_train)

    # ---- Validation metrics ---------------------------------------------------
    y_pred = ensemble_model.predict(X_val)
    # Second probability column is used as the score for ROC AUC
    y_pred_proba = ensemble_model.predict_proba(X_val)[:, 1]
    print("Classification Report:")
    print(classification_report(y_val, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))
    print("Accuracy Score:", accuracy_score(y_val, y_pred))
    print("Balanced Accuracy Score:", balanced_accuracy_score(y_val, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_val, y_pred_proba))

    # ---- Cross-validation -----------------------------------------------------
    # Stratified 5-fold CV of the whole pipeline on the training split
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(ensemble_model, X_train, y_train, cv=skf, scoring='balanced_accuracy')
    print("Cross-Validation Balanced Accuracy Scores:", cv_scores)
    print("Mean CV Balanced Accuracy Score:", np.mean(cv_scores))

    # ---- Test-set predictions and submission ----------------------------------
    # Take the preprocessor from the fitted pipeline itself rather than from
    # the free `preprocessor` variable, so the transform is guaranteed to use
    # the instance that was fitted together with the classifier.
    X_test_preprocessed = ensemble_model.named_steps['preprocessor'].transform(test_data)

    # Making predictions using the fitted ensemble classifier
    predictions = ensemble_model.named_steps['classifier'].predict(X_test_preprocessed)

    # Creating the submission DataFrame in the correct format
    submission = pd.DataFrame({
        'index': test_data.index,
        'y': predictions
    })

    # Ensuring that the submission file has exactly 2500 rows
    assert len(submission) == EXPECTED_TEST_ROWS, "The submission file must contain exactly 2500 rows."

    # Saving the submission file
    submission.to_csv('/kaggle/working/submission.csv', index=False)  # Correct path for Kaggle submission


Training data loaded successfully.
Test data loaded successfully.
Training data shape after removing columns with all missing values: (10000, 213)
[LightGBM] [Info] Number of positive: 5217, number of negative: 6956
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47385
[LightGBM] [Info] Number of data points in the train set: 12173, number of used features: 2005
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1739
           1       0.70      0.78      0.74       261

    accuracy                           0.93      2000
   macro avg       0.83      0.86      0.85      2000