In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [None]:
# Load the aggregated data set; the "date" column is parsed to datetime on read.
# NOTE(review): this is a machine-specific absolute path -- consider moving it to a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv(r"C:\Users\matti\OneDrive\Thesis\Data\AGGREGATED.csv", parse_dates=["date"])

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

# Drop the columns that are not wanted as Random Forest inputs: "Unnamed: 0"
# (presumably a leftover index column from an earlier CSV export -- confirm)
# and "date".
df = df.drop(columns=["Unnamed: 0", "date"])

In [None]:
# Show the schema of the frame as it enters this cell. DataFrame.info() prints
# its report itself and returns None, so it must not be wrapped in print().
df.info()

# Build the supervised-learning target: the "close" value of the following row
# (the next day's close, assuming one chronologically ordered row per day).
# The current row's "close" stays in the frame and is therefore also a feature.
#
# The guard makes the cell safe to re-run: without it, every re-execution would
# recompute the shift on the already-shortened frame and silently drop one more
# trailing row.
if "target" not in df.columns:
    df["target"] = df["close"].shift(-1)
    # shift(-1) leaves the last row without a target (NaN); remove such rows
    # and renumber the index so it is contiguous again.
    df = df.dropna(subset=["target"]).reset_index(drop=True)

# Separate features (every column except "target") from the target series.
X = df.drop(columns=["target"])
y = df["target"]

print(X.head())
print(y.head())

In [None]:
# Sanity check of the feature matrix and target: dimensions first, then the
# first and last rows of each, with a blank line between the three groups.
print(X.shape, y.shape, "", sep="\n")
print(X.head(), y.head(), "", sep="\n")
print(X.tail(), y.tail(), sep="\n")

In [None]:
# Persist the features and the target to the working directory so that other
# notebooks/scripts can load them directly instead of re-running the steps above.
# index=False keeps the row index out of the files.
X.to_csv(path_or_buf="X.csv", index=False)
y.to_csv(path_or_buf="y.csv", index=False)

In [None]:
# Fit a Random Forest regressor on all rows of X / y. random_state is fixed so
# repeated runs build the same forest.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=6,
    random_state=42,
    n_jobs=-1,  # -1 = use every available CPU core
).fit(X, y)

# Label the fitted model's importance scores with the feature (column) names,
# then order them from smallest to largest.
rf_importances = pd.Series(data=rf.feature_importances_, index=X.columns)
rf_importances_sorted = rf_importances.sort_values(ascending=True)


In [None]:
# Horizontal bar chart of the sorted Random Forest feature importances.
fig, ax = plt.subplots(figsize=(12, 10))
rf_importances_sorted.plot(kind="barh", ax=ax)
ax.set_title("Random Forest Feature Importances (All Features)")
ax.set_xlabel("Importance Score")
fig.tight_layout()

# Write the file BEFORE plt.show(): backends that close the figure on show
# (including the notebook inline backend) leave pyplot with no current figure
# afterwards, so a later plt.savefig() would write an empty image. Saving via
# the explicit figure handle also avoids depending on pyplot's "current figure".
fig.savefig("RF.png", dpi=300, bbox_inches="tight")  # change filename/format here if needed
plt.show()



In [None]:
# Print the complete importance ranking. pandas elides the middle of a Series
# longer than display.max_rows when printing; lift that limit for this print
# only so every feature is listed, however many there are.
with pd.option_context("display.max_rows", None):
    print(rf_importances_sorted)