In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.preprocessing import StandardScaler

In [50]:
aapl_finance_df = pd.read_csv('../AAPL Data/AAPL_finance_data.csv')
aapl_sentiment_df = pd.read_csv('../AAPL Data/AAPL_avg_sentiment_data.csv')
merged_aapl_df = pd.merge(aapl_finance_df, aapl_sentiment_df, on='Date', how='inner')
merged_aapl_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name_x,sentiment_negative,sentiment_neutral,sentiment_positive,sentiment_compound,Stock Name_y,sentiment_label
0,2021-09-30,143.660004,144.380005,141.279999,141.5,140.478485,89056700,AAPL,0.051286,0.851143,0.097571,0.0989,AAPL,Positive
1,2021-10-01,141.899994,142.919998,139.110001,142.649994,141.620163,94639600,AAPL,0.024455,0.872455,0.103182,0.248255,AAPL,Positive
2,2021-10-04,141.759995,142.210007,138.270004,139.139999,138.135513,98322000,AAPL,0.0309,0.91,0.0593,0.12283,AAPL,Positive
3,2021-10-05,139.490005,142.240005,139.360001,141.110001,140.091278,80861100,AAPL,0.0422,0.89,0.0678,0.331,AAPL,Positive
4,2021-10-06,139.470001,142.149994,138.369995,142.0,140.974869,83221100,AAPL,0.0122,0.9108,0.077,0.24352,AAPL,Positive


In [None]:
# List of sentiment columns
sentiment_cols = ['sentiment_neutral', 'sentiment_compound', 'sentiment_negative', 'sentiment_positive']

# Create lagged features for 1 and 2 days
for col in sentiment_cols:
    merged_aapl_df[f'{col}_lag1'] = merged_aapl_df[col].shift(1)
    merged_aapl_df[f'{col}_lag2'] = merged_aapl_df[col].shift(2)

# Drop rows with NaN values created by the lagging
merged_aapl_df = merged_aapl_df.dropna()

# Check the new columns
print(merged_aapl_df.head())

# Step 1: Map labels to numeric
label_mapping = {'Positive': 1, 'Negative': 0, 'Neutral': 0}  # treat Neutral as 0 (down)
merged_aapl_df['sentiment_label_numeric'] = merged_aapl_df['sentiment_label'].map(label_mapping)

# Step 2: Drop any rows where mapping failed
merged_aapl_df = merged_aapl_df.dropna(subset=['sentiment_label_numeric'])


         Date        Open        High         Low       Close   Adj Close  \
2  2021-10-04  141.759995  142.210007  138.270004  139.139999  138.135513   
3  2021-10-05  139.490005  142.240005  139.360001  141.110001  140.091278   
4  2021-10-06  139.470001  142.149994  138.369995  142.000000  140.974869   
5  2021-10-07  143.059998  144.220001  142.720001  143.289993  142.255554   
6  2021-10-08  144.029999  144.179993  142.559998  142.899994  141.868362   

     Volume Stock Name_x  sentiment_negative  sentiment_neutral  ...  \
2  98322000         AAPL            0.030900           0.910000  ...   
3  80861100         AAPL            0.042200           0.890000  ...   
4  83221100         AAPL            0.012200           0.910800  ...   
5  61732700         AAPL            0.019125           0.876375  ...   
6  58773200         AAPL            0.006571           0.888429  ...   

   Stock Name_y  sentiment_label sentiment_neutral_lag1  \
2          AAPL         Positive             

In [53]:
df = merged_aapl_df.copy()

df['price_change'] = df['Close'].shift(-1) - df['Close']
df['target'] = (df['price_change'] > 0).astype(int)
df = df.dropna(subset=['target'])

features = [
    'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
    'sentiment_negative', 'sentiment_neutral', 'sentiment_positive', 'sentiment_compound',
    'sentiment_negative_lag1', 'sentiment_negative_lag2',
    'sentiment_neutral_lag1', 'sentiment_neutral_lag2',
    'sentiment_positive_lag1', 'sentiment_positive_lag2',
    'sentiment_compound_lag1', 'sentiment_compound_lag2'
]

X = df[features]
y = df['target'].astype(float)  # SVR needs numeric target

# Scale features for SVR
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score

# Step 5: Train/test split
from sklearn.model_selection import train_test_split
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, shuffle=False
)

# Step 6: Train SVR
from sklearn.svm import SVR
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1, gamma='scale')
svr.fit(X_train_scaled, y_train)

# Step 7: Predict and convert to binary
y_pred_cont = svr.predict(X_test_scaled)
y_pred = (y_pred_cont > 0.5).astype(int)

# Step 8: Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"SVR Accuracy after mapping labels: {accuracy:.2%}")

SVR Accuracy after mapping labels: 60.00%
