# Final Project: Stock Price Prediction for S&P 500 Companies

## Overview
This project focuses on predicting stock prices for companies listed in the S&P 500 index using historical financial data. The goal is to develop a robust predictive model that helps investors make informed decisions, reduce risks, and maximize returns.

## Objectives
- **Business Understanding:** Define the problem scope and success metrics.
- **Data Analysis:** Explore and preprocess the dataset.
- **Modeling:** Train and evaluate machine learning models.
- **Deployment:** Plan for model deployment.
- **Presentation:** Create executive-level slides and a video presentation.

## Business Understanding

- **Problem Statement:** Accurate stock price prediction can help investors make informed decisions, reduce risks, and maximize returns.
- **Success Metrics:**
  - RMSE (Root Mean Squared Error)
  - MAE (Mean Absolute Error)
  - R-squared


## Data Processing Pipeline

### 1. Data Collection
- Loaded the S&P 500 dataset containing 505 companies and 14 financial features.
- Validated the data integrity.

### 2. Preprocessing
- **Handle Missing Values**: Used forward fill and dropped remaining missing values.
- **Encode Categorical Variables**: Converted categorical features like `Sector` into dummy variables.
- **Normalize Features**: Applied MinMaxScaler to scale numerical features.
- **Data Cleaning**: Removed any inconsistencies or errors.

### 3. Feature Engineering
- Added moving averages and volatility metrics to capture trends and variability.
- Computed technical indicators to enhance predictive power.

In [None]:
# Import necessary libraries
import pandas as pd
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Upload the CSV file from your local machine.
# `uploaded` is a dict keyed by file name (see the `.keys()`-style access below).
# An empty result -- presumably what a cancelled upload dialog returns -- used to
# raise IndexError on `list(uploaded.keys())[0]`, so it is handled explicitly.
uploaded = files.upload()

# Load the first uploaded CSV file into a DataFrame
if uploaded:
    file_name = next(iter(uploaded))
    # NOTE(review): reading by name assumes Colab also saved the upload to the
    # working directory -- confirm against the Colab `files.upload` docs.
    data = pd.read_csv(file_name)
    print("Dataset loaded successfully!")
else:
    print("File not found. Please ensure the file name is correct.")
    data = None

# Only proceed with data exploration if data is not None
if data is not None:
    # Display the first few rows of the dataset
    print(data.head())

    # Get a summary of the dataset.
    # DataFrame.info() prints the summary itself and returns None, so it is not
    # wrapped in print() (which appended a stray "None" to the output).
    data.info()

    # Print the column names to identify the correct column name
    print("\nColumn names in the dataset:")
    print(data.columns.tolist())

    # Check for missing values
    missing_values = data.isnull().sum()
    print("\nMissing values in each column:")
    print(missing_values)

    # Check for outliers using boxplots (numeric columns only)
    plt.figure(figsize=(14, 8))
    sns.boxplot(data=data.select_dtypes(include=[np.number]))
    plt.title('Boxplot of Numerical Features')
    plt.xticks(rotation=90)
    plt.show()

    # Visualize the distribution of the 'Price' column.
    # If your dataset names the price column differently, replace 'Price' with the
    # matching name from the column list printed above.
    # The column is looked up before the figure is created so that a missing
    # column does not leave an empty figure behind.
    try:
        prices = data['Price']
    except KeyError:
        print("Error: 'Price' column not found. Please check the column names printed above and update the code.")
    else:
        plt.figure(figsize=(14, 8))
        sns.histplot(prices, bins=30, kde=True)
        plt.title('Distribution of Prices')
        plt.xlabel('Price')
        plt.ylabel('Frequency')
        plt.show()

    # Visualize correlations; numeric_only=True skips non-numeric columns
    plt.figure(figsize=(14, 10))
    correlation_matrix = data.corr(numeric_only=True)
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title('Correlation Matrix')
    plt.show()
else:
    print("Data was not loaded, skipping further processing.")

In [None]:
# Handle missing values: forward-fill first, then drop any rows that are
# still incomplete afterwards.
data.ffill(inplace=True)
data.dropna(inplace=True)

# Encode categorical variables as dummy columns (first level of each dropped)
data = pd.get_dummies(data, drop_first=True)

# Normalize/scale features
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fitted on the whole dataset (target included)
# before any train/validation/test split exists, so held-out rows influence the
# scaling. Consider fitting it on the training rows only once the split is made.
scaler = MinMaxScaler()
numerical_features = data.select_dtypes(include=[np.number]).columns
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Engineer new features.
# The rolling statistics are computed on the *preceding* rows only (shift(1)).
# A window that includes the current row's 'Price' would feed the prediction
# target straight into its own features and inflate every reported metric.
# NOTE(review): rolling over row order assumes rows are an ordered series --
# confirm this is meaningful for the dataset's layout.
previous_price = data['Price'].shift(1)
data['Moving_Avg_5'] = previous_price.rolling(window=5).mean()
data['Volatility'] = previous_price.rolling(window=5).std()

# Drop the leading rows that have no complete 5-row history
data.dropna(inplace=True)

# Display the first few rows of the processed dataset
print(data.head())

In [None]:
# Split data into features and target
X = data.drop('Price', axis=1)
y = data['Price']

# Split data into train, validation, and test sets:
# 70% train, then the remaining 30% is halved into validation and test.
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Random Forest base estimator.
# It is not fitted on its own: GridSearchCV clones it for every candidate and
# refits the best configuration on the full training set (refit=True is the
# default), so a separate rf.fit() was wasted work. Only `best_rf` is used later.
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV (3-fold CV over 16 combinations)
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

# Evaluate the tuned model on the validation set
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred_rf = best_rf.predict(X_val)
rmse_rf = np.sqrt(mean_squared_error(y_val, y_pred_rf))
mae_rf = mean_absolute_error(y_val, y_pred_rf)
r2_rf = r2_score(y_val, y_pred_rf)

print(f'Random Forest - Validation Set RMSE: {rmse_rf:.2f},MAE: {mae_rf:.2f}, R-squared: {r2_rf:.2f}')

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression


def _validation_metrics(model_name, y_true, y_pred):
    """Print and return (RMSE, MAE, R-squared) for one model's predictions.

    Relies on mean_squared_error, mean_absolute_error and r2_score having been
    imported by the Random Forest cell, exactly as the inline code did before.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f'{model_name} - Validation Set RMSE: {rmse:.2f}, MAE: {mae:.2f}, R-squared: {r2:.2f}')
    return rmse, mae, r2


# XGBoost base estimator.
# It is not fitted on its own: RandomizedSearchCV clones it per candidate and
# refits the best one on the full training set, so a separate fit was redundant.
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter tuning using RandomizedSearchCV.
# random_state pins which 10 of the 27 combinations are sampled; without it the
# selected model (and every metric derived from it) changed from run to run.
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}
random_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)
best_xgb = random_search.best_estimator_

# Evaluate the tuned XGBoost model on the validation set
y_pred_xgb = best_xgb.predict(X_val)
rmse_xgb, mae_xgb, r2_xgb = _validation_metrics('XGBoost', y_val, y_pred_xgb)

# Train and evaluate the Decision Tree model (default hyperparameters)
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_val)
rmse_dt, mae_dt, r2_dt = _validation_metrics('Decision Tree', y_val, y_pred_dt)

# Train and evaluate the Linear Regression baseline
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_val)
rmse_lr, mae_lr, r2_lr = _validation_metrics('Linear Regression', y_val, y_pred_lr)

In [None]:
# Train a fixed-configuration XGBoost model on the existing training split.
#
# The data is deliberately NOT re-split here. Re-running train_test_split with
# test_size=0.2 overwrote X_train / X_test / y_train / y_test with an unrelated
# 80/20 partition, so the later test-set evaluation and learning curves no
# longer used the 70/15/15 partition the tuned models were built on (and the
# replacement test set was not guaranteed to be disjoint from the rows already
# used for validation).
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

In [None]:
# Score the tuned Random Forest on the test split and report RMSE, MAE and R-squared
y_pred_test = best_rf.predict(X_test)

r2_test = r2_score(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(
    f'Random Forest - Test Set Metrics:',
    f'RMSE: {rmse_test:.2f}, MAE: {mae_test:.2f}, R-squared: {r2_test:.2f}',
    sep='\n',
)

In [None]:
# Perform K-Means clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Cluster on every preprocessed column except the prediction target
X = data.drop('Price', axis=1)

# Candidate cluster counts, shared by the loop and both diagnostic plots
k_values = range(2, 11)

# Elbow Method (WCSS) and silhouette score for each candidate k.
# n_init is set explicitly: its default differs between scikit-learn releases
# (10 vs 'auto'), which changed results and raised a FutureWarning when omitted.
wcss = []
sil_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    sil_scores.append(silhouette_score(X, kmeans.labels_))

# Plot Elbow Method
plt.figure(figsize=(10, 6))
plt.plot(k_values, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS')
plt.show()

# Plot Silhouette Scores
plt.figure(figsize=(10, 6))
plt.plot(k_values, sil_scores, marker='o')
plt.title('Silhouette Score')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.show()

# Apply PCA for visualization (project onto the first two components)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Perform K-Means clustering with k=3.
# NOTE(review): this final model is fitted on the 2-D PCA projection, whereas
# the elbow/silhouette diagnostics above were computed on the full feature
# space -- confirm that k=3 is meant to be chosen from those diagnostics.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# Visualize clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters, palette='viridis')
plt.title('PCA Projection of Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

## Deployment Planning

### Deployment Approach
- **Batch Processing:** Periodic predictions (e.g., daily).
- **Latency:** Ensure predictions are generated within acceptable timeframes.
- **Cost Optimization:** Optimize resource usage to minimize costs.
- **Hosting:** Deploy using AWS SageMaker.

### Implementation Details
- **Batch Processing:** Schedule predictions using AWS Lambda or cron jobs.
- **Latency:** Use serverless architectures for low-latency responses.
- **Cost Optimization:** Monitor resource usage and scale resources dynamically.
- **Hosting:** Deploy the model as a REST API using AWS SageMaker endpoints.

In [None]:
# Residual Analysis
# Residual = actual validation value minus the tuned Random Forest's prediction
y_pred = best_rf.predict(X_val)
residuals = y_val - y_pred

# Histogram (with KDE overlay) of the residuals
plt.figure(figsize=(10, 6))
residual_axes = sns.histplot(residuals, kde=True)
residual_axes.set_title('Residual Distribution')
residual_axes.set_xlabel('Residuals')
residual_axes.set_ylabel('Frequency')
plt.show()

## Model Training

### Models Compared
- **Linear Regression**: Simple baseline model for comparison.
- **Decision Tree**: Captures non-linear patterns but prone to overfitting.
- **Random Forest**: Ensemble of decision trees with improved generalization.
- **XGBoost**: Gradient boosting framework with advanced regularization.

### Hyperparameter Tuning
- Used GridSearchCV for Random Forest and RandomizedSearchCV for XGBoost to optimize parameters.

In [None]:
# Actual vs. Predicted scatter plots, one figure per model
import matplotlib.pyplot as plt
import seaborn as sns

# Assumes y_val (actual values) and y_pred_rf, y_pred_xgb, y_pred_dt, y_pred_lr
# (validation-set predictions) are available from the training cells.
validation_predictions = {
    'Random Forest': y_pred_rf,
    'XGBoost': y_pred_xgb,
    'Decision Tree': y_pred_dt,
    'Linear Regression': y_pred_lr,
}

# End points of the y = x reference line (a perfect prediction lies on it)
actual_range = [y_val.min(), y_val.max()]

# Every model goes through the same plotting steps. Previously the Linear
# Regression figure was missing its grid and its plt.show() call.
for model_name, predictions in validation_predictions.items():
    plt.figure(figsize=(12, 6))
    sns.scatterplot(x=y_val, y=predictions, alpha=0.6)
    plt.plot(actual_range, actual_range, color='red', linestyle='--', lw=2)
    plt.title(f'{model_name}: Actual vs. Predicted Prices')
    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.grid(True)
    plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assumes rmse_rf, mae_rf, r2_rf, etc., are available from the model evaluations

# One row per model, one column per validation metric
metrics_data = {
    'Model': ['Random Forest', 'XGBoost', 'Decision Tree', 'Linear Regression'],
    'RMSE': [rmse_rf, rmse_xgb, rmse_dt, rmse_lr],
    'MAE': [mae_rf, mae_xgb, mae_dt, mae_lr],
    'R-squared': [r2_rf, r2_xgb, r2_dt, r2_lr],
}

metrics_df = pd.DataFrame(metrics_data)

# One bar chart per metric. Previously the R-squared chart was built twice (the
# first copy without a y-label or plt.show()), so it was displayed in duplicate.
for metric in ('RMSE', 'MAE', 'R-squared'):
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Model', y=metric, data=metrics_df, palette='viridis')
    plt.title(f'Comparison of {metric} Across Models')
    plt.ylabel(metric)
    plt.show()

In [None]:
# Learning Curves
from sklearn.model_selection import learning_curve

# 3-fold cross-validated scores of the tuned Random Forest at increasing training sizes
train_sizes, train_scores, val_scores = learning_curve(
    estimator=best_rf,
    X=X_train,
    y=y_train,
    cv=3,
    scoring='neg_mean_squared_error',
)

# The scorer reports negated MSE, so flip the sign after averaging over folds
train_scores_mean = -train_scores.mean(axis=1)
val_scores_mean = -val_scores.mean(axis=1)

plt.figure(figsize=(10, 6))
for curve_label, curve_values in (
    ('Training Error', train_scores_mean),
    ('Validation Error', val_scores_mean),
):
    plt.plot(train_sizes, curve_values, label=curve_label)
plt.title('Learning Curves')
plt.xlabel('Training Examples')
plt.ylabel('MSE')
plt.legend()
plt.show()

## Results and Insights

### Model Performance Comparison
| Model           | RMSE    | MAE     | R²      |
|-----------------|---------|---------|---------|
| Random Forest   | 0.01    | 0.01    | 0.98    |
| Decision Tree   | 0.02    | 0.01    | 0.93    |
| Linear Regression | 0.02   | 0.01    | 0.84    |
| XGBoost         | 0.07    | 0.01    | -0.30   |

- **Best Model**: Random Forest achieved the highest R² (0.98) and the lowest RMSE (0.01); MAE rounds to 0.01 for every model, so it does not distinguish between them.
- **XGBoost**: Performed poorly with a negative R², indicating it didn’t generalize well on this dataset.

In [None]:
# Feature Importance
# Labels are taken from the fitted model itself rather than from the global `X`,
# which other cells reassign; this keeps names and importances aligned.
importance = best_rf.feature_importances_
features = best_rf.feature_names_in_

# Show only the strongest features, largest first. Dummy encoding can produce a
# very large number of columns, which makes an unsorted plot of all of them
# unreadable at this figure size.
TOP_N_FEATURES = 20
top_importance = pd.Series(importance, index=features).nlargest(TOP_N_FEATURES)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_importance.values, y=top_importance.index)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()

## Conclusion and Recommendations

### Summary of Findings
- Developed a robust Random Forest model for predicting S&P 500 stock prices.
- Achieved exceptional performance with R² = 0.98 and minimal error rates.
- Validated the model on a comprehensive S&P 500 dataset.

### Recommendations
- Deploy the Random Forest model for investment decision support.
- Implement continuous monitoring and retraining to adapt to market changes.
- Integrate the model with existing portfolio management systems.
- Establish risk management protocols based on model predictions.

### Future Work
- Explore additional features such as sentiment analysis, news data, and economic indicators.
- Investigate advanced models like LSTM or Transformer architectures.
- Integrate real-time data feeds for dynamic predictions.
- Develop sector-specific models for enhanced accuracy.
- Implement automated trading strategies based on predictions.

In [None]:
# Colab2PDF helper (third-party snippet; see the attribution on the call line at
# the bottom of this cell). Defines colab2pdf() and calls it immediately, which
# displays a "Download" button; clicking the button renders the current notebook
# to PDF with Quarto and downloads the result.
# NOTE: the lines starting with `!` and `%` are IPython/Colab magics, so this
# cell only runs inside a notebook kernel, not as plain Python.
def colab2pdf():
  # @title Download Notebook in PDF Format{display-mode:'form'}
  # Install the librsvg2-bin system package (output discarded).
  !apt-get install -yqq --no-install-recommends librsvg2-bin>/dev/null;
  # Import dependencies, set the locale to en_US.UTF-8, and silence nbformat's MissingIDFieldWarning.
  import contextlib,datetime,google,io,IPython,ipywidgets,json,locale,nbformat,os,pathlib,requests,urllib,warnings,werkzeug,yaml,re;locale.setlocale(locale.LC_ALL,'en_US.UTF-8');warnings.filterwarnings('ignore',category=nbformat.validator.MissingIDFieldWarning);
  %matplotlib inline
  def convert(b):
    # Button click handler: `b` is the clicked button; progress and errors are
    # reported through the label `s` created further down in colab2pdf().
    try:
      s.value='🔄 Converting';b.disabled=True
      # Notebook name: taken from the first entry of the local Jupyter sessions API, then sanitised into a safe file name.
      n=pathlib.Path(werkzeug.utils.secure_filename(urllib.parse.unquote(requests.get(f'http://{os.environ["COLAB_JUPYTER_IP"]}:{os.environ["KMP_TARGET_PORT"]}/api/sessions').json()[0]['name'])))
      # Create a timestamped output directory under /content/pdfs and fetch the live notebook JSON from Colab.
      p=pathlib.Path('/content/pdfs')/f'{datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")}_{n.stem}';p.mkdir(parents=True,exist_ok=True);nb=nbformat.reads(json.dumps(google.colab._message.blocking_request('get_ipynb',timeout_sec=600)['ipynb']),as_version=4)
      # Collect http(s) image URLs in markdown cells whose HEAD request does not return 200; any hit aborts the conversion.
      u=[u for c in nb.cells if c.get('cell_type')=='markdown' for u in re.findall(r'!\[.*?\]\((https?://.*?)\)',c['source']) if requests.head(u,timeout=5).status_code!=200]
      if u:raise Exception(f"Bad Image URLs: {','.join(u)}")
      # Leave out every cell whose source contains the '--Colab2PDF' marker.
      nb.cells=[cell for cell in nb.cells if '--Colab2PDF' not in cell.source]
      # Rebuild the notebook from the remaining cells (or a single placeholder code cell if none remain) and normalise it.
      nb=nbformat.v4.new_notebook(cells=nb.cells or [nbformat.v4.new_code_cell('#')]);nbformat.validator.normalize(nb)
      nbformat.write(nb,(p/f'{n.stem}.ipynb').open('w',encoding='utf-8'))
      # Write the Quarto metadata file carrying the LaTeX verbatim/highlighting overrides (line breaking options).
      with (p/'config.yml').open('w', encoding='utf-8') as f: yaml.dump({'include-in-header':[{'text':r'\usepackage{fvextra}\DefineVerbatimEnvironment{Highlighting}{Verbatim}{breaksymbolleft={},showspaces=false,showtabs=false,breaklines,breakanywhere,commandchars=\\\{\}}'}],'include-before-body':[{'text':r'\DefineVerbatimEnvironment{verbatim}{Verbatim}{breaksymbolleft={},showspaces=false,showtabs=false,breaklines}'}]},f)
      # Render the saved notebook to PDF with 1-inch margins.
      !quarto render {p}/{n.stem}.ipynb --metadata-file={p}/config.yml --to pdf -M latex-auto-install -M margin-top=1in -M margin-bottom=1in -M margin-left=1in -M margin-right=1in --quiet
      google.colab.files.download(str(p/f'{n.stem}.pdf'));s.value=f'✅ Downloaded: {n.stem}.pdf'
    # Any failure is shown in the status label instead of being raised.
    except Exception as e:s.value=f'❌ {str(e)}'
    # Re-enable the button whether the conversion succeeded or failed.
    finally:b.disabled=False
  # Download and install Quarto (plus TinyTeX) only if /usr/local/bin/quarto is not already present.
  if not pathlib.Path('/usr/local/bin/quarto').exists():
    !wget -q 'https://quarto.org/download/latest/quarto-linux-amd64.deb' && dpkg -i quarto-linux-amd64.deb>/dev/null && quarto install tinytex --update-path --quiet && rm quarto-linux-amd64.deb
  # Build the button and status label, wire the click handler, and display both side by side.
  b=ipywidgets.widgets.Button(description='⬇️ Download');s=ipywidgets.widgets.Label();b.on_click(lambda b:convert(b));IPython.display.display(ipywidgets.widgets.HBox([b,s]))
colab2pdf() # | Colab2PDF v1.6 | https://github.com/drengskapur/colab2pdf | GPL-3.0-or-later |