In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

In [2]:
df = pd.read_csv('./tweet data/merged_df.csv')
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name,Tweet,Company Name,Tweet Length,Sentiment,Sentiment_score,Z-Score
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,#LottoFriday Watchlist: short &amp; sweet\n\n$...,"Tesla, Inc.",240,Positive,0.8478,1.487776
1,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,CORRECTION UPDATE\n\nUPDATE on Q3 Delivery Est...,"Tesla, Inc.",296,Neutral,-0.1531,-0.747275
2,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,FREE #OPTIONS Ideas 🤯\n\nScale out when above ...,"Tesla, Inc.",317,Positive,0.9083,1.622875
3,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,California DMV today issued autonomous vehicle...,"Tesla, Inc.",272,Positive,0.0000,-0.405396
4,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,@chamath Appreciate the clarification @chamath...,"Tesla, Inc.",196,Positive,0.4019,0.492063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48295,2022-08-30,18.280001,18.600000,17.950001,18.260000,18.260000,14484300,XPEV,"3 days before months end, license plate regist...",XPeng Inc.,258,Positive,-0.3182,-1.115950
48296,2022-09-07,15.750000,16.530001,15.630000,16.000000,16.000000,14641500,XPEV,"Insurance registrations: NIO sells 2,882 vehic...",XPeng Inc.,136,Neutral,0.0000,-0.405396
48297,2022-09-12,16.549999,16.639999,15.980000,16.360001,16.360001,21170100,XPEV,Notice how Chinese stocks rallied when the US ...,XPeng Inc.,259,Positive,0.1680,-0.030245
48298,2022-09-19,14.300000,15.725000,14.290000,15.640000,15.640000,18193100,XPEV,Analysts are projecting these 10 tech stocks t...,XPeng Inc.,193,Neutral,0.3818,0.447179


In [3]:

unwanted_columns = ['Tweet', 'Company Name', 'Tweet Length']
df = df.drop(unwanted_columns, axis=1)

In [4]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name,Sentiment,Sentiment_score,Z-Score
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Positive,0.8478,1.487776
1,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Neutral,-0.1531,-0.747275
2,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Positive,0.9083,1.622875
3,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Positive,0.0,-0.405396
4,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Positive,0.4019,0.492063


In [5]:
df['Date'] = pd.to_datetime(df['Date'])
df['DayOfWeek'] = df['Date'].dt.dayofweek

In [8]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name,Sentiment,Sentiment_score,Z-Score,DayOfWeek,Anomaly
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Positive,0.8478,1.487776,3,1
1,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Neutral,-0.1531,-0.747275,3,1
2,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Positive,0.9083,1.622875,3,1
3,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Positive,0.0,-0.405396,3,1
4,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,Positive,0.4019,0.492063,3,1


In [7]:
# Anomaly detection using Isolation Forest
def anomaly_detection(data):
    model = IsolationForest(contamination=0.05, random_state=42)
    model.fit(data[['Close']])
    data['Anomaly'] = model.predict(data[['Close']])
    return data

df = anomaly_detection(df)


In [40]:

# Feature scaling
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[['Open', 'High', 'Low', 'Close', 'Volume', 'Sentiment_score', 'Z-Score']])

In [41]:
# Split the dataset into train and test sets
train_size = int(len(df) * 0.8)
train_data, test_data = scaled_data[:train_size], scaled_data[train_size:]

In [42]:
train_data

array([[0.20373686, 0.20426687, 0.20513183, ..., 0.2689412 , 0.92482596,
        0.92482596],
       [0.20373686, 0.20426687, 0.20513183, ..., 0.2689412 , 0.42354886,
        0.42354886],
       [0.20373686, 0.20426687, 0.20513183, ..., 0.2689412 , 0.95512596,
        0.95512596],
       ...,
       [0.18315607, 0.18515904, 0.1799094 , ..., 0.50596486, 0.50022537,
        0.50022537],
       [0.18315607, 0.18515904, 0.1799094 , ..., 0.50596486, 0.68983823,
        0.68983823],
       [0.18315607, 0.18515904, 0.1799094 , ..., 0.50596486, 0.50022537,
        0.50022537]])

In [43]:
# Function to create sequences for time series forecasting
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length, :])
        y.append(data[i+seq_length, 3])  # Assuming 'Close' is the target variable
    return np.array(X), np.array(y)

In [44]:


# Create sequences for LSTM model
seq_length = 10 
X_train, y_train = create_sequences(train_data, seq_length)
X_test, y_test = create_sequences(test_data, seq_length)

In [45]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((38630, 10, 7), (38630,), (9650, 10, 7), (9650,))

In [37]:
X_train[0]

array([[0.2029598],
       [0.2029598],
       [0.2029598],
       [0.2029598],
       [0.2029598],
       [0.2029598],
       [0.2029598],
       [0.2029598],
       [0.2029598],
       [0.2029598]])

In [38]:
y_train[0]

array([0.2029598])

In [14]:

# Build the LSTM model
model_lstm = Sequential()
model_lstm.add(LSTM(units=50, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model_lstm.add(LSTM(units=50, activation='relu', return_sequences=True))
model_lstm.add(LSTM(units=50, activation='relu'))
model_lstm.add(Dense(units=1))  # Output layer

In [15]:


model_lstm.compile(optimizer='adam', loss='mean_squared_error')
model_lstm.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1a4cb035340>

In [16]:
import pickle
filename = 'lstm_model.sav'
pickle.dump(model_lstm, open(filename, 'wb'))

In [17]:

# Build the Random Forest model
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train.reshape(X_train.shape[0], -1), y_train)


In [18]:
pickle.dump(model_rf, open('rf_model.sav', 'wb'))

In [23]:
X_test.shape

(9650, 10, 7)

In [46]:
def evaluate_model(model, X, y, scaler):
    # Assuming X has shape (batch_size, time_steps, features)
    y_pred = model.predict(X)
    
    # Inverse transform the scaled values to original scale
    #y_true = scaler.inverse_transform(y)  # No need to reshape
    #y_pred_actual = scaler.inverse_transform(y_pred)  # No need to reshape
    y_true = scaler.inverse_transform(y.reshape(-1, 1))
    y_pred_actual = scaler.inverse_transform(y_pred.reshape(-1, 1))


    
    mae = mean_absolute_error(y_true, y_pred_actual)
    mse = mean_squared_error(y_true, y_pred_actual)
    
    return mae, mse

#mae_lstm, mse_lstm = evaluate_model(model_lstm, X_test, y_test, scaler)
mae_rf, mse_rf = evaluate_model(model_rf, X_test, y_test, scaler)


ValueError: Found array with dim 3. RandomForestRegressor expected <= 2.

In [None]:
# Print accuracy metrics
print(f"LSTM Model - MAE: {mae_lstm}, MSE: {mse_lstm}")
print(f"Random Forest Model - MAE: {mae_rf}, MSE: {mse_rf}")


In [48]:


# Visualize the forecast for the next 30 days

extended_data = np.concatenate((scaled_data[-seq_length:], np.zeros((30, scaled_data.shape[1]))))
for i in range(30):
    # Predict the next day using the last seq_length days
    input_data = np.expand_dims(extended_data[i:i+seq_length, :], axis=0)
    
    # Use the LSTM model for prediction
    lstm_prediction = model_lstm.predict(input_data)
    
    # Use the Random Forest model for prediction
    rf_prediction = model_rf.predict(input_data.reshape(1, -1))
    
  
    
    # Update the extended_data with the predicted values
    extended_data[i+seq_length, 3] = lstm_prediction
    extended_data[i+seq_length, 8] = rf_prediction
    extended_data[i+seq_length, 9] = ann_prediction

# Inverse transform the data to the original scale
forecast = scaler.inverse_transform(extended_data[:, 3].reshape(-1, 1))



IndexError: index 8 is out of bounds for axis 1 with size 7

In [None]:


# Plotting the forecast
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Close'], label='Historical Data', color='blue')
plt.plot(df['Date'].iloc[-1] + pd.to_timedelta(np.arange(30), 'D'), forecast, label='Forecast', color='red', linestyle='--')
plt.scatter(df[df['Anomaly'] == -1]['Date'], df[df['Anomaly'] == -1]['Close'], color='black', label='Anomalies')
plt.legend()
plt.title('Model Forecasting for the Next 30 Days')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.show()
