<a href="https://www.kaggle.com/code/harsimransinghdalal/byte-me?scriptVersionId=185381292" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import optuna
import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Read the three competition CSV files
train_df = pd.read_csv('/kaggle/input/thapar-summer-school-2024/train.csv')
test_df = pd.read_csv('/kaggle/input/thapar-summer-school-2024/test.csv')
sample_submission_df = pd.read_csv('/kaggle/input/thapar-summer-school-2024/sample_submission.csv')

# Print a heading followed by the top rows of each frame to inspect its layout
for heading, frame in (
    ("Train DataFrame Head:", train_df),
    ("\nTest DataFrame Head:", test_df),
    ("\nSample Submission DataFrame Head:", sample_submission_df),
):
    print(heading)
    print(frame.head())


In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from scipy.stats import randint

# Custom transformer for outlier treatment
class OutlierTreatment(BaseEstimator, TransformerMixin):
    """Clip every column to IQR-based fences learned in ``fit``.

    ``fit`` stores, per column, the first quartile (``Q1``), the third
    quartile (``Q3``) and their difference (``IQR``). ``transform`` then
    limits each value to ``[Q1 - factor * IQR, Q3 + factor * IQR]``.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the IQR when computing the lower and upper
        fences.
    """

    def __init__(self, factor=1.5):
        self.factor = factor

    def fit(self, X, y=None):
        """Compute the per-column quartiles of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Data to learn the quartiles from; columns are features.
        y : ignored
            Present only for compatibility with the estimator API.

        Returns
        -------
        self
        """
        if isinstance(X, pd.DataFrame):
            self.Q1 = X.quantile(0.25)
            self.Q3 = X.quantile(0.75)
        else:
            # NaN-aware quantiles keep the array path consistent with the
            # DataFrame path; plain np.quantile yields NaN fences for any
            # column that contains a NaN, which would blank that column
            # when it is clipped.
            self.Q1 = np.nanquantile(X, 0.25, axis=0)
            self.Q3 = np.nanquantile(X, 0.75, axis=0)
        self.IQR = self.Q3 - self.Q1
        return self

    def transform(self, X):
        """Return a clipped copy of ``X``; the input is not modified.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Data with the same columns (in the same order) as seen in ``fit``.

        Returns
        -------
        pandas.DataFrame or numpy.ndarray
            A DataFrame when ``X`` is a DataFrame, otherwise an array.
        """
        lower_bound = self.Q1 - self.factor * self.IQR
        upper_bound = self.Q3 + self.factor * self.IQR
        if isinstance(X, pd.DataFrame):
            # axis=1 lines the per-column fences up with the frame's columns.
            return X.clip(lower=lower_bound, upper=upper_bound, axis=1)
        # The fences may be pandas Series if fit saw a DataFrame; convert
        # them so the result on array input is always a plain array.
        return np.clip(X, np.asarray(lower_bound), np.asarray(upper_bound))

# Read the competition files
train_df = pd.read_csv('/kaggle/input/thapar-summer-school-2024/train.csv')
test_df = pd.read_csv('/kaggle/input/thapar-summer-school-2024/test.csv')
sample_submission_df = pd.read_csv('/kaggle/input/thapar-summer-school-2024/sample_submission.csv')

# Target column, and the predictors (every column except 'id' and the target)
target = train_df['Status']
features = train_df.drop(columns=['id', 'Status'])

# Hold out 20% of the rows for validation
X_train, X_valid, y_train, y_valid = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Group the predictor columns by dtype
categorical_features = features.select_dtypes(include=['object']).columns
numerical_features = features.select_dtypes(include=['int64', 'float64']).columns

# Numeric branch: median imputation -> IQR clipping -> standardisation
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('outlier', OutlierTreatment(factor=1.5)),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation -> one-hot encoding
# (categories unseen during fit are ignored at transform time)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full pipeline: preprocessing followed by a random forest classifier
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Search space for the randomized search. Keys follow the
# ``<step>__<parameter>`` convention to reach into the nested pipeline.
param_dist = {
    'classifier__n_estimators': randint(100, 1000),
    'classifier__max_depth': [10, 20, 30, 40, 50, None],
    'classifier__min_samples_split': randint(2, 20),
    'classifier__min_samples_leaf': randint(1, 20),
    'preprocessor__num__outlier__factor': [1.0, 1.5, 2.0]
}

# Perform randomized search
n_iter_search = 10  # Reduced number of iterations
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=5,
    scoring='neg_log_loss',
    n_jobs=-1,
    random_state=42,  # seed the sampling so the same candidates are drawn on every run
)

random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# With the default refit=True the search has already retrained the winning
# pipeline on all of X_train, so it is used as-is without a second fit.
best_model = random_search.best_estimator_

# Class probabilities for the held-out rows
y_valid_pred = best_model.predict_proba(X_valid)

# Calculate log loss on the validation set. Passing the model's class labels
# keeps the probability columns matched to the right classes even if a class
# happens to be absent from the validation split.
logloss = log_loss(y_valid, y_valid_pred, labels=best_model.classes_)
print(f"Validation Log Loss with Best Model: {logloss}")

# Predict class probabilities on the test data (the 'id' column is not a feature)
test_pred = best_model.predict_proba(test_df.drop(columns=['id']))

# Name each probability column after the class it belongs to, in the order the
# fitted model reports its classes, rather than assuming fixed positions.
# NOTE(review): this yields Status_C / Status_CL / Status_D provided the
# training labels are 'C', 'CL' and 'D' -- confirm against train.csv.
class_columns = [f'Status_{label}' for label in best_model.classes_]

# Prepare the submission DataFrame with 'id' as the first column
submission_df = pd.DataFrame(test_pred, columns=class_columns, index=test_df.index)
submission_df.insert(0, 'id', test_df['id'])

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)
submission_df.head()