In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import re # Import regular expressions module

# --- Problem A: Optimizing Group Fitness Class Utilization (Classification) ---
# --- USES data from ./data/problem2.csv BASED ON PROVIDED COLUMN NAMES ---

print("--- Starting Problem A: Class Attendance Prediction (using ./data/problem2.csv) ---")

# 1. Load Data for Problem A (Classification)
data_path_A = './data/problem2.csv' # Updated path
try:
    df_problem_A = pd.read_csv(data_path_A)
    print(f"Problem A data ({data_path_A}) loaded successfully.")
except FileNotFoundError:
    print(f"Error: {data_path_A} not found. Please ensure the 'data' folder exists and contains the file.")
    exit()

# ****** START OF CLEANING STEP ******
# Clean the 'days_before' column: Extract number from string like 'X days'
def extract_days(value):
    if isinstance(value, str):
        match = re.search(r'\d+', value) # Find one or more digits
        if match:
            return int(match.group(0))
    elif pd.isna(value):
         return np.nan # Keep NaN as NaN for imputer
    elif isinstance(value, (int, float)):
         return int(value) # Keep existing numbers as numbers
    return np.nan # Return NaN if format is unexpected

print("Cleaning 'days_before' column...")
try:
    df_problem_A['days_before'] = df_problem_A['days_before'].apply(extract_days)
    df_problem_A['days_before'] = pd.to_numeric(df_problem_A['days_before'], errors='coerce')
    print("'days_before' column cleaned and converted to numeric.")
except Exception as e:
    print(f"Error cleaning 'days_before': {e}. Please check the column format.")
    exit()
# ****** END OF CLEANING STEP ******

# 2. Define Features (X) and Target (y) for Problem A
y_class = df_problem_A['attended']
X_class = df_problem_A.drop(['attended', 'booking_id'], axis=1)

# 3. Identify Feature Types for Problem A
numerical_features_class = ['months_as_member', 'weight', 'days_before']
categorical_features_class = ['day_of_week', 'time', 'category']

# 4. Preprocessing Steps for Problem A
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor_class = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features_class),
        ('cat', categorical_transformer, categorical_features_class)
    ],
    remainder='passthrough'
)

# 5. Split Data into Training and Testing Sets for Problem A
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)
print(f"Data split complete for Problem A: Train set size={len(X_train_class)}, Test set size={len(X_test_class)}")

# 6. Define and Train Models for Problem A (using Pipelines)
pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor_class),
                              ('classifier', LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000))]) # Increased max_iter

pipeline_rf_class = Pipeline(steps=[('preprocessor', preprocessor_class),
                                   ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))])

print("\nTraining Logistic Regression (Problem A)...")
pipeline_lr.fit(X_train_class, y_train_class)
print("Training Random Forest Classifier (Problem A)...")
pipeline_rf_class.fit(X_train_class, y_train_class)
print("Model training complete for Problem A.")

# 7. Evaluate Models for Problem A
print("\n--- Evaluating Classification Models (Problem A) ---")

# Logistic Regression Evaluation
y_pred_lr = pipeline_lr.predict(X_test_class)
y_prob_lr = pipeline_lr.predict_proba(X_test_class)[:, 1]
print("\nLogistic Regression Performance (Problem A):")
print(f"Accuracy: {accuracy_score(y_test_class, y_pred_lr):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test_class, y_prob_lr):.4f}")
print("Classification Report:\n", classification_report(y_test_class, y_pred_lr))

# Random Forest Classifier Evaluation
y_pred_rf_class = pipeline_rf_class.predict(X_test_class)
y_prob_rf_class = pipeline_rf_class.predict_proba(X_test_class)[:, 1]
print("\nRandom Forest Classifier Performance (Problem A):")
print(f"Accuracy: {accuracy_score(y_test_class, y_pred_rf_class):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test_class, y_prob_rf_class):.4f}")
print("Classification Report:\n", classification_report(y_test_class, y_pred_rf_class))


# --- Problem B: Optimizing Gym Equipment Utilization (Regression) ---
# --- USES data from ./data/problem1.csv BASED ON PROVIDED COLUMN NAMES ---

print("\n\n--- Starting Problem B: Gym Crowdedness Prediction (using ./data/problem1.csv) ---")

# 1. Load Data for Problem B (Regression)
data_path_B = './data/problem1.csv' # Updated path
try:
    df_problem_B = pd.read_csv(data_path_B)
    print(f"Problem B data ({data_path_B}) loaded successfully.")
    # Convert 'date' column to datetime objects
    df_problem_B['date'] = pd.to_datetime(df_problem_B['date'])
except FileNotFoundError:
    print(f"Error: {data_path_B} not found. Please ensure the 'data' folder exists and contains the file.")
    exit()

# 2. Define Features (X) and Target (y) for Problem B
y_reg = df_problem_B['number_people']
X_reg = df_problem_B.drop(['number_people', 'date'], axis=1)

# 3. Identify Feature Types for Problem B
numerical_features_reg = ['timestamp', 'temperature', 'month', 'hour']
categorical_features_reg = ['day_of_week', 'is_weekend', 'is_holiday', 'is_start_of_semester', 'is_during_semester']

# 4. Preprocessing Steps for Problem B
numerical_transformer_reg = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer_reg = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor_reg = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_reg, numerical_features_reg),
        ('cat', categorical_transformer_reg, categorical_features_reg)
    ],
    remainder='passthrough'
)

# 5. Split Data for Problem B
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
print(f"Data split complete for Problem B: Train set size={len(X_train_reg)}, Test set size={len(X_test_reg)}")

# 6. Define and Train Models for Problem B

# Model 1: Linear Regression (Baseline)
pipeline_linr = Pipeline(steps=[('preprocessor', preprocessor_reg),
                                ('regressor', LinearRegression())])

# Model 2: Random Forest Regressor
pipeline_rf_reg = Pipeline(steps=[('preprocessor', preprocessor_reg),
                                  ('regressor', RandomForestRegressor(random_state=42, n_estimators=100))])

print("\nTraining Linear Regression (Problem B)...")
pipeline_linr.fit(X_train_reg, y_train_reg)
print("Training Random Forest Regressor (Problem B)...")
pipeline_rf_reg.fit(X_train_reg, y_train_reg)
print("Model training complete for Problem B.")

# 7. Evaluate Models for Problem B
print("\n--- Evaluating Regression Models (Problem B) ---")

# Linear Regression Evaluation
y_pred_linr = pipeline_linr.predict(X_test_reg)
rmse_linr = np.sqrt(mean_squared_error(y_test_reg, y_pred_linr))
r2_linr = r2_score(y_test_reg, y_pred_linr)
print("\nLinear Regression Performance (Problem B):")
print(f"RMSE (Root Mean Squared Error): {rmse_linr:.4f}")
print(f"R^2 Score: {r2_linr:.4f}")

# Random Forest Regressor Evaluation
y_pred_rf_reg = pipeline_rf_reg.predict(X_test_reg)
rmse_rf_reg = np.sqrt(mean_squared_error(y_test_reg, y_pred_rf_reg))
r2_rf_reg = r2_score(y_test_reg, y_pred_rf_reg)
print("\nRandom Forest Regressor Performance (Problem B):")
print(f"RMSE (Root Mean Squared Error): {rmse_rf_reg:.4f}")
print(f"R^2 Score: {r2_rf_reg:.4f}")

# 8. Feature Importance (Example for Random Forest Regressor - Problem B)
print("\n--- Feature Importance Analysis (Problem B - Random Forest Regressor) ---")
preprocessor_fitted_reg = pipeline_rf_reg.named_steps['preprocessor']
rf_model_reg = pipeline_rf_reg.named_steps['regressor']

try:
    ohe_feature_names = preprocessor_fitted_reg.named_steps['cat']\
        .named_steps['onehot'].get_feature_names_out(categorical_features_reg)
    all_feature_names = numerical_features_reg + ohe_feature_names.tolist()
    importances = rf_model_reg.feature_importances_

    if len(all_feature_names) == len(importances):
        feature_importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': importances})
        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
        print("Top 10 Features influencing gym crowdedness prediction (Problem B):")
        print(feature_importance_df.head(10))
    else:
        print(f"Mismatch in feature names ({len(all_feature_names)}) and importances ({len(importances)}). Cannot display feature importance mapping.")
        print("Raw Importances:", importances)
except Exception as e:
    print(f"Could not extract feature importances automatically: {e}")
    print("Feature importances are available in rf_model_reg.feature_importances_ but matching names requires care.")

print("\n\n--- Analysis Complete ---")
print("Next steps would involve deeper EDA, feature engineering, hyperparameter tuning, model selection, and crucial ethical considerations review.")

--- Starting Problem A: Class Attendance Prediction (using ./data/problem2.csv) ---
Problem A data (./data/problem2.csv) loaded successfully.
Cleaning 'days_before' column...
'days_before' column cleaned and converted to numeric.
Data split complete for Problem A: Train set size=1200, Test set size=300

Training Logistic Regression (Problem A)...
Training Random Forest Classifier (Problem A)...
Model training complete for Problem A.

--- Evaluating Classification Models (Problem A) ---

Logistic Regression Performance (Problem A):
Accuracy: 0.7733
ROC AUC: 0.8288
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.79      0.83       209
           1       0.61      0.73      0.66        91

    accuracy                           0.77       300
   macro avg       0.74      0.76      0.75       300
weighted avg       0.79      0.77      0.78       300


Random Forest Classifier Performance (Problem A):
Accuracy: 0.7533
ROC AUC: 0.

  df_problem_B['date'] = pd.to_datetime(df_problem_B['date'])


Data split complete for Problem B: Train set size=49747, Test set size=12437

Training Linear Regression (Problem B)...
Training Random Forest Regressor (Problem B)...
Model training complete for Problem B.

--- Evaluating Regression Models (Problem B) ---

Linear Regression Performance (Problem B):
RMSE (Root Mean Squared Error): 15.8244
R^2 Score: 0.5155

Random Forest Regressor Performance (Problem B):
RMSE (Root Mean Squared Error): 6.4871
R^2 Score: 0.9186

--- Feature Importance Analysis (Problem B - Random Forest Regressor) ---
Could not extract feature importances automatically: 'ColumnTransformer' object has no attribute 'named_steps'
Feature importances are available in rf_model_reg.feature_importances_ but matching names requires care.


--- Analysis Complete ---
Next steps would involve deeper EDA, feature engineering, hyperparameter tuning, model selection, and crucial ethical considerations review.
