# Tabular-Only Approach

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

In [12]:
# Load the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Drop rows with missing performance_score (target)
train_df = train_df.dropna(subset=['performance_score'])

# Define features and target
X = train_df.drop(columns=['performance_score'])
y = train_df['performance_score']

In [13]:
# Identify column types
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

# Model training function
def train_model(model, X, y):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    print(f'Model: {model.__class__.__name__}, RMSE: {np.sqrt(-scores.mean())}')
    return pipeline.fit(X, y)

# Train models
rf_model = train_model(RandomForestRegressor(n_estimators=100, random_state=42), X, y)
# xgb_model = train_model(xgb.XGBRegressor(n_estimators=100, random_state=42), X, y)
# lgb_model = train_model(lgb.LGBMRegressor(n_estimators=100, random_state=42), X, y)

# Predict on test data
def predict_and_save(model, test_df, filename):
    predictions = model.predict(test_df)
    submission = pd.DataFrame({'id': test_df['id'], 'performance_score': predictions})
    submission.to_csv(filename, index=False)

Model: RandomForestRegressor, RMSE: 3.4807728142083576


In [None]:
predict_and_save(rf_model, test_df, 'submission_rf.csv')
# predict_and_save(xgb_model, test_df, 'submission_xgb.csv')
# predict_and_save(lgb_model, test_df, 'submission_lgb.csv')