In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
#from sklearn.metrics import root_mean_squared_error
import numpy as np

def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Local stand-in for ``sklearn.metrics.root_mean_squared_error`` (its import
    is commented out above): the square root of ``mean_squared_error``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# CSV file holding the prepared training table (path relative to the notebook)
path = 'Ready_to_train.csv'

# Load the training table into a DataFrame. The cells below use its
# `difference_in_days` column as the target and the remaining columns
# (review-derived features such as `review_length`) as predictors.
data = pd.read_csv(path)

In [None]:
# NOTE: this cell rewrites `data` and relabels `difference_in_days`. Re-run the
# loading cell before executing it a second time; otherwise the rows already
# relabelled to 0 would be mapped to 1 by the `0 -> 1` rule below.
from sklearn.model_selection import train_test_split

# Drop samples with missing values
data = data.dropna()

# Remove the rows whose `difference_in_days` equals -1
data = data.loc[data['difference_in_days'] != -1].copy()

# Binarise the target, deciding every row from a snapshot of the original values:
#   0, 2, 3  -> 1   (rows that are already 1 stay 1)
#   > 3      -> 0
# Any other value is left unchanged.
raw_days = data['difference_in_days'].copy()
data.loc[raw_days.isin([0, 2, 3]), 'difference_in_days'] = 1
data.loc[raw_days > 3, 'difference_in_days'] = 0

# Separate features and target variable
X = data.drop('difference_in_days', axis=1)
y = data['difference_in_days']

# Hold out 20% for testing. The seed is fixed (same value as the KFold used for
# cross-validation) so the split, and every metric derived from it, is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)



In [None]:
# Sanity check on the target column: count its distinct non-null labels
xxx = len(data['difference_in_days'].dropna().unique())
print(f"Unique values: {xxx}")

# Print the frame itself for a quick visual inspection
print(data)


In [None]:

# Candidate forest sizes to evaluate: 1, 6, 11, ..., 96
n_estimators_grid = range(1, 101, 5)

# Initialize RandomForestRegressor.
# Seeded so that repeated runs produce the same forests and therefore the same
# cross-validation scores (the KFold below is seeded as well).
rf_regressor = RandomForestRegressor(random_state=42)

# Define 5-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define RMSE as the scoring metric.
# make_scorer is used with its default sign convention, so the scores returned
# by cross_val_score are plain (positive) RMSE values: lower is better.
scorer = make_scorer(root_mean_squared_error)

# Mean cross-validated RMSE per grid entry, in the same order as n_estimators_grid
rmse_values = []

# Perform Cross Validation with different number of estimators
for n_estimators in n_estimators_grid:
    rf_regressor.set_params(n_estimators=n_estimators)
    scores = cross_val_score(rf_regressor, X_train, y_train, cv=kf, scoring=scorer)
    rmse = np.mean(scores)
    rmse_values.append(rmse)
    print(f"Number of Estimators: {n_estimators}, Average RMSE: {rmse:.3f}")


Number of Estimators: 1, Average RMSE: 0.373
Number of Estimators: 6, Average RMSE: 0.299
Number of Estimators: 11, Average RMSE: 0.291
Number of Estimators: 16, Average RMSE: 0.288
Number of Estimators: 21, Average RMSE: 0.286
Number of Estimators: 26, Average RMSE: 0.285
Number of Estimators: 31, Average RMSE: 0.285
Number of Estimators: 36, Average RMSE: 0.285
Number of Estimators: 41, Average RMSE: 0.284
Number of Estimators: 46, Average RMSE: 0.284
Number of Estimators: 51, Average RMSE: 0.284
Number of Estimators: 56, Average RMSE: 0.283
Number of Estimators: 61, Average RMSE: 0.283
Number of Estimators: 66, Average RMSE: 0.283
Number of Estimators: 71, Average RMSE: 0.283
Number of Estimators: 76, Average RMSE: 0.283
Number of Estimators: 81, Average RMSE: 0.283
Number of Estimators: 86, Average RMSE: 0.283
Number of Estimators: 91, Average RMSE: 0.283
Number of Estimators: 96, Average RMSE: 0.283


In [None]:
# Find the optimal number of estimators with the minimum RMSE.
# `rmse_values` holds one score per entry of the grid swept during
# cross-validation (np.arange(1, 101, 5)), so the argmin is a *position* in that
# grid and has to be mapped back to the actual n_estimators value.
# Keep this grid in sync with the cross-validation loop.
estimator_grid = np.arange(1, 101, 5)
assert len(estimator_grid) == len(rmse_values), (
    "rmse_values does not line up with the n_estimators grid"
)

best_index = int(np.argmin(rmse_values))
optimal_estimators = int(estimator_grid[best_index])
print(f"Optimal number of estimators: {optimal_estimators}, Minimum RMSE: {rmse_values[best_index]:.3f}")

# Train the final model with the optimal number of estimators
rf_regressor.set_params(n_estimators=optimal_estimators)
rf_regressor.fit(X_train, y_train)


Optimal number of estimators: 18, Minimum RMSE: 0.283


In [None]:
from sklearn.metrics import classification_report

# The regressor returns continuous scores. Round them to the nearest integer
# label; truncating with int() would send every score below 1.0 to class 0.
y_pred = np.rint(rf_regressor.predict(X_test)).astype(np.int64)

# Per-class precision / recall / F1 on the held-out test set
print(classification_report(y_test, y_pred))

# Display feature importance (impurity-based importances of the fitted forest),
# most important feature first
feature_importance = rf_regressor.feature_importances_
feature_importance_df = pd.DataFrame({"Feature": X.columns, "Importance": feature_importance})
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)
print("Feature Importance:")
print(feature_importance_df.round(3))


              precision    recall  f1-score   support

           0       0.09      0.85      0.16      4881
           1       0.93      0.20      0.33     51932

    accuracy                           0.26     56813
   macro avg       0.51      0.53      0.25     56813
weighted avg       0.86      0.26      0.32     56813

Feature Importance:
                                         Feature  Importance
1                                  review_length       0.124
5                                review_polarity       0.079
4                            review_subjectivity       0.075
2                              readability_score       0.065
6                             review_nouns_count       0.059
3                              review_complexity       0.052
17                                        Topic2       0.047
16                                        Topic1       0.044
23                                        Topic8       0.042
15                                        T