# In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    result of that call is passed through ``np.sqrt``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Location of the prepared training data
path = 'ready_to_train_dataset (2).csv'

# Read the CSV and discard every row that has a missing value in any column
data = pd.read_csv(path).dropna()

# Columns used as model inputs
feature_columns = [
    'rate',
    'review_length',
    'readability_score',
    'review_subjectivity',
    'review_verbs_count',
    'review_sentiment',
    'Topic1',
    'Topic6',
    'review_adj_count',
    'review_adv_count',
    'sum_of_tfidf',
    'information giving',
    'other',
]

# Feature matrix and target vector
X = data[feature_columns]
y = data['response']

# Random forest whose size is tuned below. The seed is fixed so that each
# candidate is scored reproducibly and differences between candidates are not
# just run-to-run noise from the forest's own randomness.
rf_regressor = RandomForestRegressor(random_state=42)

# 5-fold cross-validation with shuffled, seeded splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Score with RMSE. make_scorer's default treats the metric as "greater is
# better", so the scores come back as plain positive RMSE values; the minimum
# is therefore selected explicitly after the loop.
scorer = make_scorer(root_mean_squared_error)

# Candidate forest sizes: 1, 6, 11, ..., 96
estimator_grid = np.arange(1, 101, 5)

# Mean cross-validated RMSE for each candidate, in the same order as the grid
rmse_values = []

# Evaluate every candidate forest size with the same folds
for n_estimators in estimator_grid:
    rf_regressor.set_params(n_estimators=int(n_estimators))
    scores = cross_val_score(rf_regressor, X, y, cv=kf, scoring=scorer)
    rmse = np.mean(scores)
    rmse_values.append(rmse)
    print(f"Number of Estimators: {n_estimators}, Average RMSE: {rmse:.3f}")

# Select the candidate with the lowest mean RMSE. The grid is indexed directly
# instead of re-deriving the value from the step size, so changing the grid
# above cannot desynchronise the lookup.
best_index = int(np.argmin(rmse_values))
optimal_estimators = int(estimator_grid[best_index])
print(f"Optimal number of estimators: {optimal_estimators}, Minimum RMSE: {rmse_values[best_index]:.3f}")

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Train the final model on all samples with the selected number of estimators
rf_regressor.set_params(n_estimators=optimal_estimators)
rf_regressor.fit(X, y)

# Out-of-fold predictions for evaluation: each sample is predicted by a model
# that was trained without it. cross_val_predict works on clones of the
# estimator, so the model fitted above is left untouched.
y_pred = cross_val_predict(rf_regressor, X, y, cv=kf)

# Map the continuous predictions to integer labels by rounding to the nearest
# integer. Plain int conversion truncates toward zero (0.9 -> 0), which
# systematically shifts predictions to the lower class.
# NOTE(review): assumes 'response' holds integer class labels - confirm.
y_pred = np.rint(y_pred).astype(np.int64)

print(classification_report(y, y_pred))

# Report the fitted forest's feature importances, largest first
feature_importance = rf_regressor.feature_importances_
feature_importance_df = (
    pd.DataFrame({"Feature": X.columns, "Importance": feature_importance})
    .sort_values(by="Importance", ascending=False)
)
print("Feature Importance:")
# Importances are rounded to three decimals for display only
print(feature_importance_df.round(3))
