In [24]:
# Import necessary libraries
import pandas as pd
import joblib
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import sklearn  # To get scikit-learn version

In [25]:
# Step 1: Read the raw CSV into a DataFrame
file_path = "dataset.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [26]:
# Step 2: Define Features (X) and Target (y)
# Features are selected by column *position*, so this cell depends on the
# column order of the loaded CSV. The checks below fail fast if that layout
# has drifted instead of silently training on the wrong columns.
feature_indices = [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13]  # Positional indices of the feature columns
target_column = "Probation Chance (Out of 100%)"  # Name of the column to predict

if target_column not in data.columns:
    raise KeyError(f"Target column {target_column!r} not found in the dataset")

X = data.iloc[:, feature_indices]  # Input features (IndexError if an index is out of range)

# Guard against target leakage: the target must never be one of the inputs.
if target_column in X.columns:
    raise ValueError(
        f"Target column {target_column!r} is among the selected feature columns; "
        "check feature_indices against the dataset's column order"
    )

y = data[target_column]  # Target variable

In [27]:
# Step 3: Hold out a test set; the seed and ratio are kept in variables
# because later cells reuse them.
random_state = 42
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=random_state,
    test_size=test_size,
)

In [28]:
# Step 4: Standardize the features
# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [29]:
# Step 5: Build the Random Forest regressor and fit it on the scaled training data
forest_params = {"n_estimators": 100, "random_state": random_state}
rf_model = RandomForestRegressor(**forest_params)
rf_model.fit(X_train_scaled, y_train)

In [30]:
# Step 6: Score the fitted model on the held-out test split
y_pred = rf_model.predict(X_test_scaled)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics rounded to two decimals
print(f"Model Performance:\nMean Squared Error: {mse:.2f}\nR2 Score: {r2:.2f}")

Model Performance:
Mean Squared Error: 28.29
R2 Score: 0.97


In [31]:
# Step 7: Save the model, scaler, and metadata
joblib_file_model = "probation_rf_model.pkl"
joblib_file_scaler = "scaler.pkl"
metadata_file = "metadata.json"

# Save model and scaler (both are needed at inference time: inputs must be
# passed through the same fitted scaler before calling the model)
joblib.dump(rf_model, joblib_file_model)
joblib.dump(scaler, joblib_file_scaler)

# Store metadata
# Model details are read from the fitted estimator rather than re-typed, so
# the JSON cannot drift from the artifact that was actually saved.
metadata = {
    "scikit-learn_version": sklearn.__version__,  # Automatically fetch the version
    "feature_indices": feature_indices,  # Positional indices used to select the features
    "feature_names": [str(name) for name in X.columns],  # Column names behind those indices
    "train_test_split_ratio": test_size,  # Test size used in splitting
    "random_state": random_state,  # Random seed for reproducibility
    "model_type": type(rf_model).__name__,
    "n_estimators": rf_model.n_estimators,
    "mean_squared_error": float(mse),  # Plain float so the value is always JSON-serializable
    "r2_score": float(r2),
}

# Save metadata as JSON
with open(metadata_file, "w") as f:
    json.dump(metadata, f, indent=4)  # Pretty-print for readability

print(f"Model saved to {joblib_file_model}")
print(f"Scaler saved to {joblib_file_scaler}")
print(f"Metadata saved to {metadata_file}")

Model saved to probation_rf_model.pkl
Scaler saved to scaler.pkl
Metadata saved to metadata.json
