In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.feature_selection import SelectFromModel

# Load the training and test data
train_data = pd.read_csv('Train-set.csv')
test_data = pd.read_csv('Test-set.csv')

# Separate the 'Target' column (labels) from the training data
y_train = train_data['Target']
train_data.drop('Target', axis=1, inplace=True)

# Combine train and test data for preprocessing
all_data = pd.concat([train_data, test_data], axis=0)

# Feature Engineering: Extract day of the week and create a weekend indicator
try:
    # Convert 'day' column to datetime
    all_data['day'] = pd.to_datetime(all_data['day'])
    # Extract day of the week (0-6) and create 'day_of_week' feature
    all_data['day_of_week'] = all_data['day'].dt.dayofweek
    # Create binary indicator for weekend (Saturday and Sunday)
    all_data['is_weekend'] = all_data['day_of_week'].isin([5, 6]).astype(int)
    # Drop the original 'day' column
    all_data.drop('day', axis=1, inplace=True)
except (ValueError, OverflowError, pd._libs.tslibs.np_datetime.OutOfBoundsDatetime):
    # Handle errors due to invalid date formats
    all_data['day'] = pd.to_datetime(all_data['day'], errors='coerce')
    all_data['day_of_week'] = all_data['day'].dt.dayofweek
    all_data['is_weekend'] = all_data['day_of_week'].isin([5, 6]).astype(int)
    all_data.drop('day', axis=1, inplace=True)

# Separate numeric and categorical columns
numeric_cols = all_data.select_dtypes(include=[np.number]).columns
categorical_cols = all_data.select_dtypes(include=[object]).columns

# Create transformers for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocess the data using the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

X_all_preprocessed = preprocessor.fit_transform(all_data)

# Get the names of the columns after one-hot encoding
encoded_columns = preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_cols)
all_columns = list(numeric_cols) + list(encoded_columns)

# Convert the preprocessed data array to a DataFrame with proper column names
cleaned_data = pd.DataFrame(X_all_preprocessed, columns=all_columns)

# Save the preprocessed and feature-engineered data to a CSV file
cleaned_data.to_csv('cleaned_data.csv', index=False)

# Handle class imbalance using BorderlineSMOTE
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_all_preprocessed[:train_data.shape[0]], y_train)

# Create and train optimized models
optimized_rf_model = RandomForestClassifier(n_estimators=150, max_depth=9, random_state=42)
optimized_rf_model.fit(X_train_resampled, y_train_resampled)


  all_data['day'] = pd.to_datetime(all_data['day'])
  all_data['day'] = pd.to_datetime(all_data['day'], errors='coerce')


ValueError: Shape of passed values is (78161, 1), indices imply (78161, 71)