# TASK 2 - TRAIN MODEL
### Decision Tree Regression Model
### Load The Dataset

In [11]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

# Read the review dataset from the CSV file stored next to this notebook.
csv_path = './PJ2.csv'
df = pd.read_csv(csv_path)

### Select The Features & Target Variable

In [12]:
# Predictor columns: the two vote counts, the verified-purchase flag and the
# product category (the category is one-hot encoded in a later cell).
feature_columns = ['helpful_votes', 'total_votes', 'verified_purchase', 'product_category']
X = df[feature_columns]

# Regression target: the star rating of each review.
Y = df['star_rating']

### Encode the categorical feature

In [13]:
# One-hot encode the product category. The double brackets keep the selection
# two-dimensional, which is the input shape the encoder expects.
category_frame = X[['product_category']]
enc = OneHotEncoder()
X_encoded = enc.fit_transform(category_frame)


### Concatenate Encoded Feature With The Rest

In [14]:
# Wrap the one-hot matrix in a DataFrame that reuses X's index.
# pd.concat(axis=1) aligns rows by index label, so giving the encoded frame a
# fresh 0..n-1 index would only be correct while X happens to carry the default
# index; any earlier filtering/shuffling of df would silently produce NaN rows.
# The columns are named after the fitted categories so the features stay
# readable and every column label is a string.
category_columns = [f"product_category_{category}" for category in enc.categories_[0]]
category_dummies = pd.DataFrame(
    X_encoded.toarray(),
    index=X.index,
    columns=category_columns,
)

# Final design matrix: the three non-encoded features followed by the
# one-hot category columns (same column order as before).
X_final = pd.concat(
    [X[['helpful_votes', 'total_votes', 'verified_purchase']], category_dummies],
    axis=1,
)

### Split Dataset Into Training & Testing

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
split_parts = train_test_split(X_final, Y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Create & Train the Decision Tree Regression Model

In [16]:
# Work on explicit copies so the column assignments below write to frames this
# cell owns, rather than to the objects returned by train_test_split (which can
# trigger pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Convert the categorical verified_purchase column to integer codes.
# The encoder is fitted on the training split only and reused for the test split.
label_encoder = LabelEncoder()
X_train['verified_purchase'] = label_encoder.fit_transform(X_train['verified_purchase'])
X_test['verified_purchase'] = label_encoder.transform(X_test['verified_purchase'])

# Fill missing values in y_train with the mean of the training targets.
# pd.Series(...) accepts both the original Series and the ndarray this cell
# leaves behind, so re-running the cell no longer fails on `y_train.values`.
# NOTE(review): mean-imputing the *target* fabricates labels, and y_test is not
# imputed here; consider dropping rows with a missing star_rating before the split.
imputer_y = SimpleImputer(strategy='mean')
y_train_column = pd.Series(y_train).to_numpy().reshape(-1, 1)
y_train = imputer_y.fit_transform(y_train_column).ravel()

# scikit-learn requires all column labels to share one type, so make sure
# every label is a string.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Baseline model: a single decision tree with default hyperparameters and a
# fixed seed for reproducibility.
DTR = DecisionTreeRegressor(random_state=42)
DTR.fit(X_train, y_train)


### Making Predictions

In [17]:
# Predict star ratings for the held-out test rows with the baseline tree.
y_pred = DTR.predict(X=X_test)

### Evaluate The Model Before Optimization

In [18]:
# Score the baseline decision tree on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    "Before Optimization:\nMean Squared Error: ",
    mse,
    "\nR2 Score: ",
    r2,
)

Before Optimization:
Mean Squared Error:  0.9583448027093491 
R2 Score:  0.20536853664056198


### Optimize with Grid Search

- What I used to optimize
    - During the optimization phase, the algorithm I used is a Random Forest.
    - It combines multiple decision trees to create a stronger and more accurate model.
    
- How the algorithm works
    - Random Forest builds multiple decision trees during the training phase.
    - Each tree is trained on a random subset of the data and a random subset of the features.
    - This randomness reduces overfitting: the individual trees make different errors, which tend to cancel out when their predictions are combined.

- Random feature selection
    - At each node of the decision tree a random subset of features is used.
    - This helps to reduce the correlation between the trees and makes the model more robust.

- Voting
    - During prediction each tree in the Random Forest independently predicts the target variable.
    - For regression, the final prediction is typically the average of all the tree predictions.

- Hyperparameters
    - The performance of the model depends on the following hyperparameters:
        - Number of trees (n_estimators)
        - Max depth of the trees (max_depth)
        - Min # of samples required to split a node (min_samples_split)
        - Min # of samples required at each leaf (min_samples_leaf)


`TL;DR: Random Forest builds multiple trees where each tree gets a random subset of the training data and a random subset of features. It then averages the trees' predictions to make a final prediction. Random Forest helps to reduce overfitting and improve performance.` 

In [19]:
# Hyperparameter grid for GridSearchCV: 3 * 3 * 3 * 3 = 81 combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Random forest to be tuned. The seed is fixed (same value as the train/test
# split and the baseline tree) so that the selected parameters and the reported
# metrics are reproducible on Restart & Run All; without it every run bootstraps
# different samples and can pick a different "best" model.
model = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search over the grid, using all CPU cores.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method.
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the best model found by the search on the held-out test split.
# Note: y_pred, mse and r2 are reassigned here and replace the baseline
# decision-tree values computed in the earlier cells.
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("After Optimization:\nMean Squared Error:", mse, "\nR2 Score:", r2, "\nBest Parameters:", grid_search.best_params_)

After Optimization:
Mean Squared Error: 0.9460725387218426 
R2 Score: 0.2155443387772833 
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


# TASK 3 - Test And Evaluate


### Mathematical equations
##### Mean Squared Error for Decision Tree Regressor & Random Forest Regressor:
`MSE = (1/n) * Σ(y_true - y_pred)^2`
- n = number of samples
- y_true = true target values
- y_pred = the predicted values



##### R-squared for Decision Tree Regressor & Random Forest Regressor:
`R2 = 1 - (Σ(y_true - y_pred)^2) / Σ(y_true - ȳ)^2`
- where ȳ is the mean of the true target values.

In [20]:
# Test-set metrics for the baseline decision tree.
y_dtr_pred = DTR.predict(X_test)
mse_dtr, r2_dtr = (
    mean_squared_error(y_test, y_dtr_pred),
    r2_score(y_test, y_dtr_pred),
)

# Test-set metrics for the grid-searched random forest.
y_pred_rf = grid_search.predict(X_test)
mse_rf, r2_rf = (
    mean_squared_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)

# Print both result blocks in the same layout; the leading "\n" on the second
# heading produces the blank line between the two reports.
for heading, mse_value, r2_value in (
    ("Decision Tree Regressor:", mse_dtr, r2_dtr),
    ("\nRandomForest Regressor:", mse_rf, r2_rf),
):
    print(heading)
    print("Mean Squared Error:", mse_value)
    print("R2 Score:", r2_value)

Decision Tree Regressor:
Mean Squared Error: 0.9583448027093491
R2 Score: 0.20536853664056198

RandomForest Regressor:
Mean Squared Error: 0.9460725387218426
R2 Score: 0.2155443387772833
