In [1]:
import pandas as pd
import numpy as np

In [2]:
# Exploratory Data Analysis (EDA)
# Load the training data from "train.csv" (a path relative to the notebook's
# working directory) into the DataFrame `df` that the cells below operate on.

df = pd.read_csv("train.csv")

In [3]:
# Preview the first rows of the frame (DataFrame.head() defaults to 5 rows).
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Dimensions of the frame as a (n_rows, n_columns) tuple.
df.shape

(4209, 378)

In [5]:
# Sort by the target `y` in descending order and show the top rows, to inspect
# the largest target values (the upper tail of `y`).
df.sort_values(by="y", ascending=False).head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
883,1770,265.32,y,r,ai,f,d,ag,l,t,...,0,0,0,0,0,0,0,0,0,0
342,681,169.91,aa,l,ak,f,d,i,c,d,...,0,0,0,0,0,0,0,0,0,0
1459,2903,167.45,ai,b,ae,a,d,ac,g,m,...,0,0,1,0,0,0,0,0,0,0
3133,6273,165.52,aj,v,r,c,d,q,g,a,...,0,0,1,0,0,0,0,0,0,0
1203,2396,160.87,j,o,as,f,d,ab,g,p,...,1,0,0,0,0,0,0,0,0,0


In [23]:
# Model 1- Random Forest 

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV

In [7]:
# Preprocess the data

# Separate the target column `y` from the remaining feature columns.
y = df["y"]
X = df.drop('y', axis=1)

# Hold out 20% of the rows for testing; the other 80% are used for training.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Partition the feature columns by dtype:
#   - object dtype   -> categorical columns
#   - everything else -> quantitative (numerical) columns
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

# Numerical columns: fill missing values with the column mean, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' makes categories that were not seen
# while fitting encode as all zeros instead of raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its matching transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Apply the preprocessing steps to the full feature matrix.
# NOTE(review): this fits the preprocessor on ALL rows, including the held-out
# test rows. The modelling pipelines refit it on X_train, so the models are not
# affected, but confirm that downstream users of `preprocessed_data` really
# want statistics computed on the full data set.
preprocessed_data = preprocessor.fit_transform(X)


In [21]:
# Baseline Random Forest with default hyperparameters.

# Initialize RF model (fixed random_state for reproducible trees)
rf_model = RandomForestRegressor(random_state=499)

# Create a pipeline with the preprocessor and RandomForestRegressor model, so
# the preprocessing is fitted on exactly the data the regressor is trained on.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', rf_model)
])

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict with the model fitted in THIS cell. Without these two lines the
# metrics below would either fail with a NameError on a fresh kernel or
# silently reuse predictions left over from a different model.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluate the RF model's performance with the root mean squared error (RMSE),
# which is expressed in the same units as the target `y` (lower is better).
RMSE_train = np.sqrt(MSE(y_train, y_train_pred))
RMSE_test = np.sqrt(MSE(y_test, y_test_pred))

# The values are RMSE, not R^2, so label them accordingly.
print(f"Training RMSE: {RMSE_train}")
print(f"Testing RMSE: {RMSE_test}")

# The training error is far lower than the testing error, i.e. the model is
# currently overfit; the hyperparameters will need to be tuned.

Training R^2 score: 3.3864122229035187
Testing R^2 score: 8.707194823432323


In [33]:
# Use cross-validated grid search to tune the hyperparameters of the RF model.

# Keys follow the "<pipeline step>__<parameter>" convention so each value is
# routed to the 'regressor' step. 5 values for each of 4 parameters gives
# 5**4 = 625 candidate combinations, each evaluated with 5-fold CV.
param_grid = {
    'regressor__n_estimators': [1, 10, 50, 75, 100],
    'regressor__max_depth': [None, 2, 5, 7, 10],
    'regressor__min_samples_split': [2, 5, 10, 12, 16],
    'regressor__min_samples_leaf': [1, 2, 3, 5, 8]
}

# Create a pipeline with the preprocessor and Random Forest model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', rf_model)
])

# Instantiate GridSearchCV with the pipeline and parameter grid.
# Searching over the whole pipeline means the preprocessing is refitted inside
# every CV fold, so validation folds never influence the fitted transformers.
grid_search = GridSearchCV(model,  # Use the entire pipeline here
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

# Fit the grid search on the training split only; the test split stays unseen.
grid_search.fit(X_train, y_train)

# Print the best parameters and best score found.
print("Best Parameters:", grid_search.best_params_)
# best_score_ is the mean NEGATED MSE over the CV folds (scikit-learn maximizes
# scores), so negate it and take the square root to report it in RMSE units.
print("Best CV RMSE:", np.sqrt(-grid_search.best_score_))

Best Parameters: {'regressor__max_depth': 5, 'regressor__min_samples_leaf': 3, 'regressor__min_samples_split': 16, 'regressor__n_estimators': 75}


In [34]:
# Pipeline refitted on the whole training split with the best parameters found
# by the grid search.
best_model_RF = grid_search.best_estimator_

# Tuned RF model's error on train and test
y_train_pred = best_model_RF.predict(X_train)
y_test_pred = best_model_RF.predict(X_test)

# Root mean squared error (RMSE), in the same units as the target `y`
# (lower is better).
RMSE_train = np.sqrt(MSE(y_train, y_train_pred))
RMSE_test = np.sqrt(MSE(y_test, y_test_pred))

# The values are RMSE, not R^2, so label them accordingly.
print(f"Training RMSE: {RMSE_train}")
print(f"Testing RMSE: {RMSE_test}")


Training R^2 score: 8.037151664831006
Testing R^2 score: 8.105639541486896


In [None]:
# Principal Component Analysis (PCA) to reduce dimensionality

