In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:

# Read the CSV at `dataset_path` into the working DataFrame `df`.
# The path is relative, so it resolves against the kernel's working directory.
dataset_path = 'dataset.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# Missing-value audit: print a section header, then the number of null
# entries in each column of `df` as it stands before any imputation.
print(
    "\nHandling Missing Values:",
    "Before handling missing values:",
    sep="\n",
)
print(df.isna().sum())

In [None]:
# Impute missing values with the mean of each numeric column.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError as soon as the frame contains a
# non-numeric (e.g. string) column. Non-numeric columns are left untouched.
# The result is reassigned instead of using inplace=True so the cell stays
# a plain, re-runnable expression.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Separate the regression target from the predictors:
# `y` is the 'performance_metric' column, `X` is every other column of `df`.
y = df['performance_metric']
X = df.drop(columns='performance_metric')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Model training: fit a 100-tree random forest regressor on the training
# split. The seed is fixed so repeated runs build the same forest.
forest_params = dict(n_estimators=100, random_state=42)
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)


In [None]:
# Model evaluation, step 1: predict the target for the held-out test rows.
y_pred = model.predict(X=X_test)

In [None]:
# Score the test-set predictions against the true targets:
# mean squared error and the R^2 coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f'Mean Squared Error: {mse}',
    f'R2 Score: {r2}',
    sep='\n',
)

In [None]:
# Feature importance analysis: take the fitted forest's per-feature
# importances and compute the index order that ranks them from most to
# least important (ascending argsort, then reversed).
feature_names = X.columns
feature_importances = model.feature_importances_
sorted_idx = np.flip(np.argsort(feature_importances))

In [None]:
# Bar chart of the feature importances, ordered by `sorted_idx`
# (most important first), with one tick per feature labelled by its name.
n_features = X.shape[1]
tick_positions = range(n_features)

plt.bar(tick_positions, feature_importances[sorted_idx], align="center")
plt.xticks(tick_positions, feature_names[sorted_idx], rotation=45)
plt.title('Random Forest Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Feature Importance')
plt.show()

In [None]:
# Predictions for New Data (replace 'new_data.csv' with your new data file).
# The path previously read 'ew_data.csv', a typo for the file named above.
new_data_path = 'new_data.csv'
new_data = pd.read_csv(new_data_path)

# Select exactly the columns the model was trained on, in training order.
# Extra columns in the new file (for example the target itself) are dropped,
# and a missing feature fails here with a KeyError naming the column.
new_features = new_data[X.columns]

# Apply the same preprocessing as training: fill gaps in numeric columns
# with the corresponding training-feature means.
new_features = new_features.fillna(X.mean(numeric_only=True))

new_predictions = model.predict(new_features)
print('Predictions for New Data:')
print(new_predictions)