<a href="https://colab.research.google.com/github/Itiel-z/BIA-Projects/blob/main/Project_3/Project_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import os

In [6]:
# Data collection
try:
  if not os.path.exists("merged_p2.csv"):
    raise FileNotFoundError("CSV file not found in current directory.")
  else:
    df = pd.read_csv("merged_p2.csv", encoding='utf-8')
  if df.empty:
    raise ValueError("The loaded file is empty.")
  print("Successfully loaded the dataset!")
except Exception as e:
  print(f"Error loading data: {str(e)}")
  raise

Successfully loaded the dataset!


In [7]:
# Display the first few rows
df.head()

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,decade,ma_7,ma_20,...,sector_CONSUMER DURABLES,sector_CONSUMER NON-DURABLES,sector_CONSUMER SERVICES,sector_ENERGY,sector_FINANCE,sector_HEALTH CARE,sector_MISCELLANEOUS,sector_PUBLIC UTILITIES,sector_TECHNOLOGY,sector_TRANSPORTATION
0,PFE,-1.004315,-1.003518,-0.944818,-1.001295,-1.006879,-0.13303,1970,-2.009363,-2.416307,...,False,False,False,False,False,True,False,False,False,False
1,PFE,-1.003554,-1.003366,-0.944818,-1.000369,-1.005979,-0.056129,1970,-2.009406,-2.41607,...,False,False,False,False,False,True,False,False,False,False
2,PFE,-1.003402,-1.003214,-0.944818,-1.000524,-1.006279,-0.145524,1970,-2.009449,-2.415798,...,False,False,False,False,False,True,False,False,False,False
3,PFE,-1.002946,-1.002302,-0.944817,-0.999598,-1.005079,-0.05699,1970,-2.00919,-2.415543,...,False,False,False,False,False,True,False,False,False,False
4,PFE,-1.001882,-1.001238,-0.944817,-0.998519,-1.004478,-0.01477,1970,-2.008454,-2.415161,...,False,False,False,False,False,True,False,False,False,False


In [8]:
# Define the Exponential Moving Average using the Exponential Weighted Moving Average
def EMA(series, span):
    return series.ewm(span=span, adjust=False).mean()

df['EMA12'] = EMA(df['close'], 12)
df['EMA26'] = EMA(df['close'], 26)
df['MACD'] = df['EMA12'] - df['EMA26']
df['Signal_Line'] = EMA(df['MACD'], 9)

# Create the signal columns based on the Moving Average Convergence Divergence (MACD)
df['MACD_Buy_Signal'] = (df['MACD'] > df['Signal_Line']) & (df['MACD'].shift(1) <= df['Signal_Line'].shift(1))
df['MACD_Sell_Signal'] = (df['MACD'] < df['Signal_Line']) & (df['MACD'].shift(1) >= df['Signal_Line'].shift(1))

In [9]:
# Calculate the Relative Strength Index (RSI)
delta = df['close'].diff()
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)
avg_gain = gain.ewm(span=14, adjust=False).mean()
avg_loss = loss.ewm(span=14, adjust=False).mean()
rs = avg_gain / avg_loss
df['RSI'] = 100 - (100 / (1 + rs))

# Create the signal columns based on RSI
df['RSI_Buy_Signal'] = df['RSI'] < 30
df['RSI_Sell_Signal'] = df['RSI'] > 70

In [10]:
# Combine Signals into Final Trading Recommendation
def generate_signal(row):
    if row['MACD_Buy_Signal'] or row['RSI_Buy_Signal']:
        return 'Buy'
    elif row['MACD_Sell_Signal'] or row['RSI_Sell_Signal']:
        return 'Sell'
    else:
        return 'Hold'

df['Signal'] = df.apply(generate_signal, axis=1)

In [11]:
# Define features and target
X = df[['MACD', 'Signal_Line', 'RSI']]
y = df['Signal']

# Split the df into train, test and validate
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [12]:
# Drop NaN resulting from feature engineering
X_train.dropna(inplace=True)
X_test.dropna(inplace=True)
X_val.dropna(inplace=True)

# Ensure that the data is aligned after dropping NAN
y_train = y_train[X_train.index]
y_test = y_test[X_test.index]
y_val = y_val[X_val.index]

In [13]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

In [14]:
# Train the Logistics Regression model
lr = LogisticRegression(max_iter=100)
lr.fit(X_train_scaled, y_train)

In [15]:
# Evaluate the Logistic Regression model
y_val_pred_lr = lr.predict(X_val_scaled)

print("Logistic Regression Performance on Validation Set:")
print(classification_report(y_val, y_val_pred_lr))

Logistic Regression Performance on Validation Set:
              precision    recall  f1-score   support

         Buy       0.66      0.46      0.54      8483
        Hold       0.76      0.93      0.84     41819
        Sell       0.41      0.08      0.13      8687

    accuracy                           0.74     58989
   macro avg       0.61      0.49      0.50     58989
weighted avg       0.69      0.74      0.69     58989



In [16]:
# Train the RandomForestClassifier model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)

In [17]:
# Evaluate the RandomForestClassifier model
y_val_pred_rf = rf.predict(X_val_scaled)

print("\nRandom Forest Classifier Performance on Validation Set:")
print(classification_report(y_val, y_val_pred_rf))


Random Forest Classifier Performance on Validation Set:
              precision    recall  f1-score   support

         Buy       0.78      0.81      0.80      8483
        Hold       0.91      0.92      0.92     41819
        Sell       0.81      0.76      0.79      8687

    accuracy                           0.88     58989
   macro avg       0.84      0.83      0.83     58989
weighted avg       0.88      0.88      0.88     58989



In [18]:
# Train the SVC model
svc = SVC(random_state=42)
svc.fit(X_train_scaled, y_train)

In [19]:
# Evaluate the SVC model
y_val_pred_svc = svc.predict(X_val_scaled)

print("SVC Performance on Validation Set:")
print(classification_report(y_val, y_val_pred_svc))

SVC Performance on Validation Set:
              precision    recall  f1-score   support

         Buy       0.74      0.79      0.77      8483
        Hold       0.91      0.89      0.90     41819
        Sell       0.73      0.76      0.74      8687

    accuracy                           0.85     58989
   macro avg       0.79      0.81      0.80     58989
weighted avg       0.86      0.85      0.86     58989



## Summary:

### Data Analysis Key Findings

* The Logistic Regression model achieved an accuracy of approximately 0.73, a weighted precision of approximately 0.69, a weighted recall of approximately 0.73, and a weighted F1-score of approximately 0.68 on the validation set.
* The RandomForestClassifier model achieved an accuracy of approximately 0.88, a weighted precision of approximately 0.88, a weighted recall of approximately 0.88, and a weighted F1-score of approximately 0.88 on the validation set.
* The SVC model achieved an accuracy of approximately 0.85, a weighted precision of approximately 0.85, a weighted recall of approximately 0.85, and a weighted F1-score of approximately 0.85 on the validation set.
* Based on the validation set performance, the RandomForestClassifier was selected as the best model.
* The selected RandomForestClassifier model achieved an accuracy of approximately 0.88 on the test set, with weighted average precision, recall, and F1-score all around 0.88.

### Insights or Next Steps

* The RandomForestClassifier appears to be the most suitable model among those tested for this classification task.
* Further hyperparameter tuning of the RandomForestClassifier could potentially improve performance.