In [1]:


import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

# Train a RandomForestRegressor that predicts 'close_volume' from the feature
# columns of transformed_data.csv and save the fitted model to disk.

# Load the transformed data from CSV
pivoted_df = pd.read_csv('transformed_data.csv')

# Drop rows where target ('close_volume') is NaN; a row without a target
# cannot be used for supervised training.
pivoted_df = pivoted_df.dropna(subset=['close_volume'])

# Every column except the identifiers and the target is treated as a feature.
X = pivoted_df.drop(columns=['symbol_id', 'date_id', 'close_volume'])  # Features
y = pivoted_df['close_volume']  # Target

# Split BEFORE fitting any preprocessing. Fitting the imputer/scaler on the
# full dataset would let hold-out rows influence the imputation means and the
# scaling statistics (data leakage). The row assignment depends only on the
# number of samples and random_state, so the same rows end up in each split
# as when the split was done on the preprocessed array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values in features using mean imputation, with the means
# learned from the training rows only.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_raw)

# Scale the features, with mean/variance learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)

# Apply the already-fitted preprocessing to the hold-out rows (transform
# only, never fit) so they are comparable to the training features.
X_test = scaler.transform(imputer.transform(X_test_raw))

# Initialize Random Forest model (fixed seed for reproducible trees)
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the Random Forest model on the training data
random_forest.fit(X_train, y_train)

# Save the trained model
# NOTE(review): only the regressor is persisted. The fitted imputer and scaler
# are not saved here, so any consumer of pretrained_model.pkl must apply the
# same preprocessing to its inputs - confirm how that is handled downstream.
joblib.dump(random_forest, 'pretrained_model.pkl')

# Print message
print("Model saved as pretrained_model.pkl.")
