In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the cleaned dataset from the Parquet file named below.
file_path = "alpha_vantage_IBM_2025-01-26.parquet"
df = pd.read_parquet(file_path)

# Echo the column names up front so a schema mismatch is visible before
# the column selections further down are attempted.
available_columns = df.columns.tolist()
print("Available columns in the dataframe:", available_columns)

# Coerce the raw price/volume/indicator columns to numeric dtypes. Values that
# cannot be parsed become NaN (errors='coerce') instead of raising.
numeric_columns = ['1. open', '2. high', '3. low', '4. close', '5. volume', 'SMA']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Fill each gap with the value from the preceding row. DataFrame.ffill() is
# the supported spelling; the `method=` keyword of fillna() is deprecated
# since pandas 2.1. Leading NaNs (no earlier row to copy from) stay NaN.
df.ffill(inplace=True)

# Feature engineering, all derived from the closing price:
#   price_change    - difference between a row's close and the previous row's
#   rolling_avg_50  - mean close over a 50-row window
#   rolling_avg_200 - mean close over a 200-row window
# NOTE(review): diff() and rolling() follow row order, so this assumes the
# rows are already in chronological order - confirm against the data source.
df['price_change'] = df['4. close'].diff()
df['rolling_avg_50'] = df['4. close'].rolling(window=50).mean()
df['rolling_avg_200'] = df['4. close'].rolling(window=200).mean()

# Drop every row that still has a NaN in any column. This removes at least
# the first 199 rows, where the 200-row window is not yet full.
df.dropna(inplace=True)

# Prepare features and target variable.
# NOTE(review): 'price_change' is computed from '4. close' (the target), and
# open/high/low come from the same row as the target, so the features carry
# direct information about y - confirm this is the intended setup.
feature_columns = [
    '1. open', '2. high', '3. low', '5. volume', 'SMA',
    'price_change', 'rolling_avg_50', 'rolling_avg_200',
]
X = df[feature_columns]
y = df['4. close']

# A split needs at least one training row and one test row; fail early with
# the script's own message instead of an error from inside scikit-learn.
if len(X) < 2:
    raise ValueError("Insufficient data for training. Please check the preprocessing step.")

# Hold out the final 20% of rows as one contiguous block. Shuffling would
# scatter test rows among training rows, and because the rolling/diff
# features overlap between neighbouring rows that inflates the test score.
# Keeping row order also lets y_test be plotted as a continuous series.
# NOTE(review): assumes df is in chronological order - sort upstream if not.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a linear regression on the scaled training data.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict the target for the held-out rows.
y_pred = model.predict(X_test_scaled)

# Score the held-out predictions against the true values: mean squared error
# and the R-squared score.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics, one per line, rounded to two decimals.
print(
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R²): {r2:.2f}",
    sep="\n",
)

# Overlay actual and predicted prices for the test rows on a single chart.
# Both series are plotted against their position in the test set (0..n-1),
# not against the dataframe index.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label="Actual Prices", color="blue")
ax.plot(y_pred, label="Predicted Prices", color="red")
ax.set_title("Stock Price Prediction")
ax.set_xlabel("Sample Index")
ax.set_ylabel("Price")
ax.legend()
ax.grid()
plt.show()
