In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error, classification_report
import numpy as np

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error.

    Args:
        y_true: Ground-truth target values, forwarded unchanged to
            ``mean_squared_error``.
        y_pred: Predicted target values, forwarded unchanged to
            ``mean_squared_error``.

    Returns:
        ``np.sqrt`` applied to ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Location of the prepared training table.
path = 'Ready_to_train.csv'
data = pd.read_csv(path)

# Discard every row that contains a missing value.
data.dropna(inplace=True)

# Remove the rows whose 'difference_in_days' equals -1.
# NOTE(review): -1 looks like a sentinel value -- confirm against the data prep.
is_minus_one = data['difference_in_days'] == -1
data.drop(index=data.index[is_minus_one.to_numpy()], inplace=True)

# Relabel the column: 0, 2 and 3 become 1 (an existing 1 is left untouched),
# and anything above 3 becomes 0.  Both masks are computed before any write,
# so neither assignment can affect the other.
becomes_one = data['difference_in_days'].isin([0, 2, 3])
becomes_zero = data['difference_in_days'] > 3
data.loc[becomes_one, 'difference_in_days'] = 1
data.loc[becomes_zero, 'difference_in_days'] = 0

# The relabelled column is the target; every other column is a feature.
y = data['difference_in_days']
X = data.drop(columns='difference_in_days')

# Report how many distinct target values remain, then show the prepared table.
label_count = y.nunique()
print(f"Unique values: {label_count}")
print(data)

# Random forest reused for every cross-validation run below.  The seed is
# fixed so that the comparison between forest sizes is repeatable from run to
# run (the folds below are seeded as well).
rf_regressor = RandomForestRegressor(random_state=42)

# 5-fold cross-validation with shuffling; seeded so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Scorer that reports plain (positive) RMSE, so lower is better.
# NOTE: make_scorer defaults to greater_is_better=True.  That is harmless here
# because the scores are only averaged and compared with argmin, but this
# scorer must not be handed to a search utility that maximises its score.
scorer = make_scorer(root_mean_squared_error)

# Candidate forest sizes: 1, 6, 11, ..., 96.
estimator_grid = np.arange(1, 101, 5)

# Mean cross-validated RMSE for each candidate, in grid order.
rmse_values = []

for n_estimators in estimator_grid:
    rf_regressor.set_params(n_estimators=n_estimators)
    scores = cross_val_score(rf_regressor, X, y, cv=kf, scoring=scorer)
    rmse = np.mean(scores)
    rmse_values.append(rmse)
    print(f"Number of Estimators: {n_estimators}, Average RMSE: {rmse:.3f}")

# Pick the candidate with the lowest mean RMSE by indexing the grid directly,
# instead of re-deriving the value from the grid's start and step; the lookup
# stays correct if the grid above is ever changed.
best_index = int(np.argmin(rmse_values))
optimal_estimators = estimator_grid[best_index]
print(f"Optimal number of estimators: {optimal_estimators}, Minimum RMSE: {rmse_values[best_index]:.3f}")

# Configure the forest with the selected number of trees.
rf_regressor.set_params(n_estimators=optimal_estimators)

# Out-of-fold true labels and predicted labels, accumulated across all folds.
y_true_all = []
y_pred_all = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf_regressor.fit(X_train, y_train)
    y_pred = rf_regressor.predict(X_test)

    y_true_all.extend(y_test)
    # The regressor outputs continuous values, so snap each one to the nearest
    # integer label.  A bare integer conversion truncates toward zero and
    # would report e.g. 0.9 as class 0.  (np.rint sends exact .5 ties to the
    # even neighbour.)
    y_pred_all.extend(np.rint(y_pred).astype(np.int64))

# Per-class precision/recall/F1 over the pooled out-of-fold predictions.
print(classification_report(y_true_all, y_pred_all))

# Feature importances of the forest as last fitted, i.e. the model trained on
# the training part of the final fold only.
feature_importance = rf_regressor.feature_importances_
feature_importance_df = pd.DataFrame(
    {"Feature": X.columns, "Importance": feature_importance}
).sort_values(by="Importance", ascending=False)
print("Feature Importance:")
print(feature_importance_df.round(3))
