In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
#from sklearn.metrics import root_mean_squared_error
import numpy as np

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` against ``y_true``.

    Defined locally; the import of scikit-learn's function of the same name
    is commented out in the import cell.

    Parameters
    ----------
    y_true : ground-truth target values, forwarded to ``mean_squared_error``.
    y_pred : predicted target values, forwarded to ``mean_squared_error``.

    Returns
    -------
    The RMSE, i.e. ``np.sqrt`` applied to the mean squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# CSV file holding the prepared training data (relative path).
path = 'ready_to_train_dataset (2).csv'

# Load the prepared dataset into a DataFrame. The columns selected from it
# further down are review features (rate, review_length, readability_score, ...)
# with 'response' as the target.
data = pd.read_csv(path)


In [None]:
from sklearn.model_selection import train_test_split

# Drop samples with a missing value in any column. Rebinding (rather than
# inplace=True) avoids mutating the loaded frame in place; the cell stays
# idempotent when re-run.
data = data.dropna()

# Separate features and target variable
feature_columns = [
    'rate',
    'review_length',
    'readability_score',
    'review_subjectivity',
    'review_verbs_count',
    'review_sentiment',
    'Topic1',
    'Topic6',
    'review_adj_count',
    'review_adv_count',
    'sum_of_tfidf',
    'information giving',
    'other',
]
X = data[feature_columns]
y = data['response']

# Hold out 20% of the rows for the final evaluation. A fixed random_state makes
# the split (and every number computed from it) reproducible across runs; 42 is
# the same seed the KFold splitter in this notebook uses.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)



In [None]:
# Initialize RandomForestRegressor.
# A fixed random_state makes the forest (bootstrap samples / feature draws)
# reproducible, so the RMSE curve below does not change between runs.
rf_regressor = RandomForestRegressor(random_state=42)

# Define 5-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define RMSE as the scoring metric.
# make_scorer defaults to greater_is_better=True, which leaves the sign of the
# metric untouched: cross_val_score therefore returns plain RMSE values, where
# LOWER is better. That is fine here because the scores are only averaged, but
# this scorer must not be handed to a utility that maximises the score
# (e.g. GridSearchCV) without setting greater_is_better=False.
scorer = make_scorer(root_mean_squared_error)

# Store the mean cross-validated RMSE for each number of estimators;
# rmse_values[i] belongs to the i-th value of the grid iterated below.
rmse_values = []

# Perform Cross Validation with different number of estimators (1, 6, ..., 96)
for n_estimators in np.arange(1, 101, 5):
    # Plain Python int keeps NumPy scalar types out of the estimator's params.
    rf_regressor.set_params(n_estimators=int(n_estimators))
    scores = cross_val_score(rf_regressor, X_train, y_train, cv=kf, scoring=scorer)
    rmse = np.mean(scores)
    rmse_values.append(rmse)
    print(f"Number of Estimators: {n_estimators}, Average RMSE: {rmse:.3f}")


Number of Estimators: 1, Average RMSE: 0.549
Number of Estimators: 6, Average RMSE: 0.426
Number of Estimators: 11, Average RMSE: 0.412
Number of Estimators: 16, Average RMSE: 0.407
Number of Estimators: 21, Average RMSE: 0.405
Number of Estimators: 26, Average RMSE: 0.403
Number of Estimators: 31, Average RMSE: 0.402
Number of Estimators: 36, Average RMSE: 0.401
Number of Estimators: 41, Average RMSE: 0.400
Number of Estimators: 46, Average RMSE: 0.400
Number of Estimators: 51, Average RMSE: 0.399
Number of Estimators: 56, Average RMSE: 0.399
Number of Estimators: 61, Average RMSE: 0.399
Number of Estimators: 66, Average RMSE: 0.399
Number of Estimators: 71, Average RMSE: 0.398
Number of Estimators: 76, Average RMSE: 0.398
Number of Estimators: 81, Average RMSE: 0.398
Number of Estimators: 86, Average RMSE: 0.398
Number of Estimators: 91, Average RMSE: 0.398
Number of Estimators: 96, Average RMSE: 0.398


In [None]:
# Find the optimal number of estimators with the minimum RMSE.
# rmse_values[i] was measured for the i-th value of the search grid
# (1, 6, 11, ..., 96), so the argmin index has to be mapped back through the
# grid. Using `index + 1` as the tree count would, for example, turn the
# result for 71 trees (index 14) into a 15-tree model.
n_estimators_grid = np.arange(1, 101, 5)  # same grid the cross-validation loop iterates over
assert len(rmse_values) == len(n_estimators_grid), (
    "rmse_values does not line up with the n_estimators grid"
)

best_index = int(np.argmin(rmse_values))
optimal_estimators = int(n_estimators_grid[best_index])
print(f"Optimal number of estimators: {optimal_estimators}, Minimum RMSE: {rmse_values[best_index]:.3f}")

# Train the final model with the optimal number of estimators
rf_regressor.set_params(n_estimators=optimal_estimators)
rf_regressor.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

# The regressor returns continuous scores, not class labels.
y_score = rf_regressor.predict(X_test)

# Turn the scores into 0/1 labels with a 0.5 cut-off. A plain integer cast
# (np.int64(score)) truncates toward zero, so every score below 1.0 -- even
# 0.99 -- would be labelled 0 and class 1 would almost never be predicted.
# assumes `response` is binary 0/1 (the recorded report lists only classes
# 0 and 1) -- TODO confirm
y_pred = (y_score >= 0.5).astype(np.int64)

print(classification_report(y_test, y_pred))

# Display feature importance, most important feature first
feature_importance = rf_regressor.feature_importances_
feature_importance_df = pd.DataFrame(
    {"Feature": X.columns, "Importance": feature_importance}
).sort_values(by="Importance", ascending=False)
print("Feature Importance:")
print(feature_importance_df.round(3))


              precision    recall  f1-score   support

           0       0.62      1.00      0.77      9466
           1       0.89      0.02      0.04      5815

    accuracy                           0.63     15281
   macro avg       0.76      0.51      0.40     15281
weighted avg       0.73      0.63      0.49     15281

Feature Importance:
                Feature  Importance
0                  rate       0.330
10         sum_of_tfidf       0.159
1         review_length       0.099
3   review_subjectivity       0.091
2     readability_score       0.084
6                Topic1       0.052
7                Topic6       0.039
5      review_sentiment       0.037
8      review_adj_count       0.034
4    review_verbs_count       0.032
9      review_adv_count       0.029
12                other       0.007
11   information giving       0.006
