# Installing libraries

In [25]:
#!pip install pandas sqlalchemy pymysql pymongo
#!pip install numpy
#!pip install scikit-learn
#!pip install nltk
#!pip install pyspark
#!pip install pyspark pymongo
#!pip install --upgrade pymongo[tls,srv] pyspark
#!pip install nltk
#!pip install --upgrade tensorflow scikit-learn pandas numpy setuptools
#!pip install statsmodels
#!pip install matplotlib
#!pip install keras-tuner
#!pip install dash

# Data storing using MySLQ & MongoDB

### Check csv file

In [None]:
import pandas as pd
df = pd.read_csv('ProjectTweets.csv')
df

### Imports

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import pymongo
from pymongo import MongoClient

### Read & Write

In [None]:
csv_file_path = 'ProjectTweets.csv'
df = pd.read_csv(csv_file_path, encoding='latin1', names=["ids", "date", "flag", "user", "text"])

def load_to_mysql(df):
    mysql_user = 'fadiez'
    mysql_password = 'Hadoop123'
    mysql_host = 'localhost'
    mysql_db = 'CA2'
    engine = create_engine(f'mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}/{mysql_db}')

    df.to_sql('tweets', con=engine, if_exists='replace', index=False)
    print("Data loaded to MySQL successfully.")


def load_to_mongodb(df):
    mongo_host = 'localhost'
    mongo_port = 27017
    mongo_db = 'CA2'
    mongo_collection = 'tweets'
    
    client = MongoClient(mongo_host, mongo_port)
    
    db = client[mongo_db]
    collection = db[mongo_collection]
    
    data_dict = df.to_dict("records")
    collection.insert_many(data_dict)
    print("Data loaded to MongoDB successfully.")

load_to_mysql(df)
load_to_mongodb(df)

### Check results

In [None]:
from sqlalchemy import create_engine
from pymongo import MongoClient
import pandas as pd

mysql_user = 'root'
mysql_password = 'password'
mysql_host = 'localhost'
mysql_db = 'CA2'

mongo_host = 'localhost'
mongo_port = 27017
mongo_db = 'CA2'
mongo_collection = 'tweets'

def fetch_sample_from_mysql(sample_size=5):
    engine = create_engine(f'mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}/{mysql_db}')
    
    query = f"SELECT * FROM tweets LIMIT {sample_size}"
    df_mysql_sample = pd.read_sql(query, con=engine)
    return df_mysql_sample

def fetch_sample_from_mongodb(sample_size=5):
    client = MongoClient(mongo_host, mongo_port)
    
    db = client[mongo_db]
    collection = db[mongo_collection]
    
    cursor = collection.find({}).limit(sample_size)
    df_mongodb_sample = pd.DataFrame(list(cursor))
    return df_mongodb_sample

df_mysql_sample = fetch_sample_from_mysql()
print("Sample Data from MySQL:")
print(df_mysql_sample)

df_mongodb_sample = fetch_sample_from_mongodb()
print("\nSample Data from MongoDB:")
print(df_mongodb_sample)


# Sentiment Analysis

## Data preprocessing using PySpark

### Read data from MongoDB

In [None]:
import pandas as pd
from pymongo import MongoClient
from dateutil import parser
import re

mongo_uri = "mongodb://localhost:27017/"
mongo_db = "CA2"
mongo_collection_input = "tweets"
mongo_collection_output = "cleaned_tweets"

client = MongoClient(mongo_uri)
db = client[mongo_db]
collection_input = db[mongo_collection_input]

tweets = list(collection_input.find({}, {"text": 1, "date": 1, "_id": 0}))
df = pd.DataFrame(tweets)

### Text cleaning & date->timestamp conversion

In [None]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = text.lower()
    return text

def convert_date_to_timestamp(date_str):
    try:
        date_obj = parser.parse(date_str)
        return date_obj.timestamp()
    except Exception as e:
        print(f"Error parsing date: {e}")
        return None

df['text'] = df['text'].apply(clean_text)

df['timestamp'] = df['date'].apply(convert_date_to_timestamp)

df = df.drop(columns=['date'])

### Cleaned data to MongoDB

In [None]:
collection_output = db[mongo_collection_output]
collection_output.insert_many(df.to_dict(orient='records'))

client.close()

### Check inserted

In [None]:
from pymongo import MongoClient

mongo_uri = "mongodb://localhost:27017/"
mongo_db = "CA2"
mongo_collection = "cleaned_tweets"

client = MongoClient(mongo_uri)
db = client[mongo_db]
collection = db[mongo_collection]

document_count = collection.count_documents({})
print(f"Number of documents in the collection {mongo_collection}: {document_count}")

for document in collection.find().limit(5):
    print(document)

client.close()

### Save 2 CSV

In [None]:
df.to_csv('cleaned.csv', index=False)

## SA using Vader, write to MongoDB

In [None]:
import nltk
nltk.download('vader_lexicon')
from pymongo import MongoClient
from nltk.sentiment.vader import SentimentIntensityAnalyzer

mongo_uri = "mongodb://localhost:27017/"
mongo_db = "CA2"
mongo_source_collection = "cleaned_tweets"
mongo_target_collection = "sentiment_analysis"

client = MongoClient(mongo_uri)
db = client[mongo_db]
source_collection = db[mongo_source_collection]
target_collection = db[mongo_target_collection]

sid = SentimentIntensityAnalyzer()

total_documents = source_collection.count_documents({})

for idx, document in enumerate(source_collection.find(), start=1):
    text = document['text']
    scores = sid.polarity_scores(text)
    
    existing_document = target_collection.find_one({'_id': document['_id']})
    if existing_document:
        target_collection.update_one({'_id': document['_id']}, {'$set': {'vader_score': scores}})
    else:
        target_collection.insert_one({'_id': document['_id'], 'vader_score': scores})
    
    if idx % (total_documents // 100) == 0:
        print(f"Processed {idx / total_documents * 100:.2f}% of documents")

client.close()

### Check inserted

In [None]:
from pymongo import MongoClient

mongo_uri = "mongodb://localhost:27017/"
mongo_db = "CA2"
mongo_collection = "sentiment_analysis"

client = MongoClient(mongo_uri)
db = client[mongo_db]
collection = db[mongo_collection]

first_10_documents = list(collection.find().sort('_id', 1).limit(10))
print("First 10 documents:")
for doc in first_10_documents:
    print(doc)

last_10_documents = list(collection.find().sort('_id', -1).limit(10))
print("\nLast 10 documents:")
for doc in last_10_documents:
    print(doc)

client.close()

### Merge data from Mongo collections

In [None]:
import pandas as pd
from pymongo import MongoClient

mongo_uri = "mongodb://localhost:27017/"
mongo_db = "CA2"
mongo_source_collection = "cleaned_tweets"
mongo_target_collection = "sentiment_analysis"

client = MongoClient(mongo_uri)
db = client[mongo_db]
source_collection = db[mongo_source_collection]
target_collection = db[mongo_target_collection]

cleaned_tweets_df = pd.DataFrame(list(source_collection.find()))
sentiment_analysis_df = pd.DataFrame(list(target_collection.find()))

cleaned_tweets_df['_id'] = cleaned_tweets_df['_id'].astype(str)
sentiment_analysis_df['_id'] = sentiment_analysis_df['_id'].astype(str)

merged_df = pd.merge(cleaned_tweets_df, sentiment_analysis_df, on='_id')

keys = set()
merged_df['vader_score'].apply(keys.update)
for key in keys:
    merged_df[key] = merged_df['vader_score'].apply(lambda x: x.get(key, None))

merged_df.drop(columns=['vader_score'], inplace=True)

client.close()

merged_df

### Save merged data to CSV

In [None]:
merged_df.to_csv('merged.csv', index=False)

### Read merged data & Check

In [None]:
df = pd.read_csv('merged.csv')
df

## Training model for Sentiment Analises

### Additional text cleaning

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    clean_text = ' '.join(tokens)
    return clean_text

df['clean_text'] = df['text'].apply(clean_text)

print(df[['text', 'clean_text']])

### (Write &) Read results

In [None]:
import pandas as pd
#df.to_csv('cleaned2.csv', index=False)
df = pd.read_csv('cleaned2.csv')
df

### Imports

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

### Preparation

In [None]:
def get_sentiment_label(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

df['label'] = df['compound'].apply(get_sentiment_label)
print(df['label'].unique())
df = pd.concat([df, pd.get_dummies(df['label'])], axis=1)

print(df[['Negative', 'Neutral', 'Positive']].shape)

df = df.dropna(subset=['clean_text'])

X = df['clean_text'].values
y = df[['Negative', 'Neutral', 'Positive']].values

print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1, random_state=42)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

vect = CountVectorizer()
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_vect)
X_test_tfidf = tfidf.transform(X_test_vect)


In [None]:
print(df.columns)

### Model creating & training

In [None]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_shape=(X_train_tfidf.shape[1],)))
model.add(Dropout(0.5))

model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

history = model.fit(X_train_tfidf, y_train, batch_size=128, epochs=3, validation_data=(X_test_tfidf, y_test), callbacks=[early_stop], verbose=1)

### Check accuracy

In [None]:
loss, accuracy = model.evaluate(X_test_tfidf, y_test, verbose=1)
print(f'Test Accuracy: {accuracy}')

y_pred_probs = model.predict(X_test_tfidf)
y_pred = np.argmax(y_pred_probs, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

print(classification_report(y_test_labels, y_pred, target_names=['Negative', 'Neutral', 'Positive']))

print(confusion_matrix(y_test_labels, y_pred))

# Mean sentiment predictions

In [None]:
df

In [None]:
df_copy = df
import pandas as pd
#df.to_csv('cleaned2.csv', index=False)
mydf = pd.read_csv('cleaned2.csv')
df = mydf

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

df = df.dropna(subset=['clean_text'])

df['label'] = df['compound'].apply(get_sentiment_label)
df = pd.concat([df, pd.get_dummies(df['label'])], axis=1)

num_samples = int(len(df) * 0.1)
df = df.sample(n=num_samples, random_state=42)

X = df['clean_text'].values
y = df[['Negative', 'Neutral', 'Positive']].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

vect = CountVectorizer()
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_vect)
X_test_tfidf = tfidf.transform(X_test_vect)

model = Sequential()

model.add(Dense(units=512, activation='relu', input_shape=(X_train_tfidf.shape[1],)))
model.add(Dropout(0.5))

model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

history = model.fit(X_train_tfidf, y_train, batch_size=128, epochs=3, validation_data=(X_test_tfidf, y_test), callbacks=[early_stop], verbose=1)

loss, accuracy = model.evaluate(X_test_tfidf, y_test, verbose=1)
print(f'Test Accuracy: {accuracy}')

y_pred_probs = model.predict(X_test_tfidf)
y_pred = np.argmax(y_pred_probs, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

print(classification_report(y_test_labels, y_pred, target_names=['Negative', 'Neutral', 'Positive']))
print(confusion_matrix(y_test_labels, y_pred))

In [None]:
df

## ARIMA forecast

### Preparation

In [1]:
import pandas as pd
#df.to_csv('cleaned2.csv', index=False)
df = pd.read_csv('cleaned2.csv')
df

Unnamed: 0,_id,text,timestamp,neu,neg,pos,compound,clean_text
0,664aa4052b91f744d139db03,switchfoot awww thats a bummer you shoulda ...,1.239053e+09,0.843,0.157,0.000,-0.3818,switchfoot awww thats bummer shoulda got david...
1,664aa4052b91f744d139db04,is upset that he cant update his facebook by t...,1.239053e+09,0.705,0.295,0.000,-0.7269,upset cant update facebook texting might cry r...
2,664aa4052b91f744d139db05,kenichan i dived many times for the ball manag...,1.239053e+09,0.833,0.000,0.167,0.4939,kenichan dived many time ball managed save res...
3,664aa4052b91f744d139db06,my whole body feels itchy and like its on fire,1.239053e+09,0.500,0.321,0.179,-0.2500,whole body feel itchy like fire
4,664aa4052b91f744d139db07,nationwideclass no its not behaving at all im ...,1.239053e+09,0.759,0.241,0.000,-0.6597,nationwideclass behaving im mad cant see
...,...,...,...,...,...,...,...,...
1599995,664aa46a2b91f744d15244fe,just woke up having no school is the best feel...,1.245138e+09,0.503,0.138,0.358,0.5423,woke school best feeling ever
1599996,664aa46a2b91f744d15244ff,thewdbcom very cool to hear old walt intervie...,1.245138e+09,0.755,0.000,0.245,0.3804,thewdbcom cool hear old walt interview
1599997,664aa46a2b91f744d1524500,are you ready for your mojo makeover ask me fo...,1.245138e+09,0.800,0.000,0.200,0.3612,ready mojo makeover ask detail
1599998,664aa46a2b91f744d1524501,happy 38th birthday to my boo of alll time tup...,1.245138e+09,0.748,0.000,0.252,0.5719,happy th birthday boo alll time tupac amaru sh...


In [2]:
forecasting_df = df[['timestamp', 'compound']]
forecasting_df

Unnamed: 0,timestamp,compound
0,1.239053e+09,-0.3818
1,1.239053e+09,-0.7269
2,1.239053e+09,0.4939
3,1.239053e+09,-0.2500
4,1.239053e+09,-0.6597
...,...,...
1599995,1.245138e+09,0.5423
1599996,1.245138e+09,0.3804
1599997,1.245138e+09,0.3612
1599998,1.245138e+09,0.5719


In [3]:
import pandas as pd

aggregated_df = forecasting_df.groupby(pd.to_datetime(forecasting_df['timestamp'], unit='s').dt.date).mean()

aggregated_df['frequency_index'] = forecasting_df.groupby(pd.to_datetime(forecasting_df['timestamp'], unit='s').dt.date).size()
aggregated_df

Unnamed: 0_level_0,timestamp,compound,frequency_index
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-04-06,1239058000.0,0.143861,5300
2009-04-07,1239077000.0,0.150729,15371
2009-04-17,1240000000.0,0.189842,3084
2009-04-18,1240067000.0,0.169625,23769
2009-04-19,1240130000.0,0.173334,27447
2009-04-20,1240215000.0,0.161552,18447
2009-04-21,1240278000.0,0.165187,6607
2009-05-01,1241214000.0,0.16578,9720
2009-05-02,1241256000.0,0.169758,27376
2009-05-03,1241351000.0,0.160098,35405


In [4]:
aggregated_df.shape[0]

48

### Model creation, hyperparameters tuning

In [5]:
import itertools
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def evaluate_arima_model(data, order):
    train_size = int(len(data) * 0.8)
    train, test = data[:train_size], data[train_size:]
    history = [x for x in train]
    predictions = []
    for t in range(len(test)):
        model = ARIMA(history, order=order)
        model_fit = model.fit()
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    rmse = calculate_rmse(test, predictions)
    return rmse

def find_best_arima_model(data, p_values, d_values, q_values):
    best_score, best_cfg = float("inf"), None
    for p, d, q in itertools.product(p_values, d_values, q_values):
        order = (p, d, q)
        try:
            rmse = evaluate_arima_model(data, order)
            if rmse < best_score:
                best_score, best_cfg = rmse, order
            print('ARIMA%s RMSE=%.3f' % (order, rmse))
        except:
            continue
    print('Best ARIMA%s RMSE=%.3f' % (best_cfg, best_score))
    return best_cfg

p_values = range(0, 3)
d_values = range(0, 3)
q_values = range(0, 3)

sentiment_data = aggregated_df['compound']

best_params = find_best_arima_model(sentiment_data, p_values, d_values, q_values)
print(best_params)

  history.append(test[t])


ARIMA(0, 0, 0) RMSE=0.190


  history.append(test[t])
  warn('Non-invertible starting MA parameters found.'


ARIMA(0, 0, 1) RMSE=0.115


  history.append(test[t])
  warn('Non-invertible starting MA parameters found.'


ARIMA(0, 0, 2) RMSE=0.097


  history.append(test[t])


ARIMA(0, 1, 0) RMSE=0.049


  history.append(test[t])


ARIMA(0, 1, 1) RMSE=0.073


  history.append(test[t])


ARIMA(0, 1, 2) RMSE=0.067


  history.append(test[t])
  history.append(test[t])


ARIMA(0, 2, 0) RMSE=0.050
ARIMA(0, 2, 1) RMSE=0.054


  history.append(test[t])


ARIMA(0, 2, 2) RMSE=0.059


  history.append(test[t])
  warn('Non-stationary starting autoregressive parameters'


ARIMA(1, 0, 0) RMSE=0.068


  history.append(test[t])
  warn('Non-stationary starting autoregressive parameters'


ARIMA(1, 0, 1) RMSE=0.070


  history.append(test[t])
  history.append(test[t])


ARIMA(1, 0, 2) RMSE=0.074
ARIMA(1, 1, 0) RMSE=0.055


  history.append(test[t])


ARIMA(1, 1, 1) RMSE=0.066


  history.append(test[t])
  warn('Non-invertible starting MA parameters found.'


ARIMA(1, 1, 2) RMSE=0.079


  history.append(test[t])


ARIMA(1, 2, 0) RMSE=0.055


  history.append(test[t])
  warn('Non-invertible starting MA parameters found.'


ARIMA(1, 2, 1) RMSE=0.057


  history.append(test[t])
  warn('Non-invertible starting MA parameters found.'


ARIMA(1, 2, 2) RMSE=0.057


  history.append(test[t])


ARIMA(2, 0, 0) RMSE=0.069


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  history.append(test[t])


ARIMA(2, 0, 1) RMSE=0.072


  history.append(test[t])


ARIMA(2, 0, 2) RMSE=0.075


  history.append(test[t])


ARIMA(2, 1, 0) RMSE=0.057


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  history.append(test[t])


ARIMA(2, 1, 1) RMSE=0.066


  warn('Non-invertible starting MA parameters found.'
  history.append(test[t])
  warn('Non-stationary starting autoregressive parameters'


ARIMA(2, 1, 2) RMSE=0.067


  history.append(test[t])


ARIMA(2, 2, 0) RMSE=0.058


  history.append(test[t])
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'


ARIMA(2, 2, 1) RMSE=0.058


  warn('Non-invertible starting MA parameters found.'
  history.append(test[t])


ARIMA(2, 2, 2) RMSE=0.058
Best ARIMA(0, 1, 0) RMSE=0.049
(0, 1, 0)


### Training model with best hyperparameters

In [6]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

def build_arima_model(data, order):
    model = ARIMA(data, order=order)
    model_fit = model.fit()
    return model_fit

def forecast_sentiment(model_fit, steps):
    forecast = model_fit.forecast(steps=steps)
    return forecast

sentiment_data = aggregated_df['compound']

arima_model = build_arima_model(sentiment_data, order=(0, 2, 0))

forecast_7_days = forecast_sentiment(arima_model, steps=7)

last_date = aggregated_df.index[-1]

forecast_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=7)

arima_df = pd.DataFrame({'timestamp': forecast_dates, 'compound': forecast_7_days})

arima_df

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Unnamed: 0,timestamp,compound
48,2009-06-26,-0.066343
49,2009-06-27,-0.073789
50,2009-06-28,-0.081235
51,2009-06-29,-0.08868
52,2009-06-30,-0.096126
53,2009-07-01,-0.103572
54,2009-07-02,-0.111018


## LSTM forecasting

### Imports

In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from keras_tuner.tuners import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters
import shutil
import os

### Preparation

In [9]:
sentiment_data = aggregated_df['compound'].values.reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(sentiment_data)

def prepare_lstm_data(data, n_steps):
    X, y = [], []
    for i in range(len(data)):
        end_ix = i + n_steps
        if end_ix > len(data)-1:
            break
        seq_x, seq_y = data[i:end_ix], data[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

n_steps = 7
X, y = prepare_lstm_data(scaled_data, n_steps)

train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

### Model creation & tuning

In [10]:
def build_lstm_model(hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=128, step=32),
                   activation=hp.Choice('activation', values=['relu', 'tanh', 'sigmoid']),
                   input_shape=(n_steps, 1)))
    model.add(Dense(units=1))
    model.compile(optimizer=hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop']),
                  loss=hp.Choice('loss', values=['mean_squared_error', 'mean_absolute_error']))
    return model

tuner_directory = 'lstm_tuning_new'
if os.path.exists(tuner_directory):
    shutil.rmtree(tuner_directory)

tuner = RandomSearch(
    build_lstm_model,
    objective='val_loss',
    max_trials=5,
    executions_per_trial=3,
    directory=tuner_directory,
    project_name='sentiment_analysis'
)

tuner.search(X_train, y_train,
             epochs=10,
             validation_data=(X_test, y_test))

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"Best hyperparameters:")
print(f"Units: {best_hps.get('units')}")
print(f"Activation: {best_hps.get('activation')}")
print(f"Optimizer: {best_hps.get('optimizer')}")
print(f"Loss: {best_hps.get('loss')}")

Trial 5 Complete [00h 00m 07s]
val_loss: 0.0028788212997217975

Best val_loss So Far: 0.0028788212997217975
Total elapsed time: 00h 00m 39s
Best hyperparameters:
Units: 32
Activation: tanh
Optimizer: sgd
Loss: mean_squared_error


### Model training & 7 day forecast

In [11]:
model = Sequential()
model.add(LSTM(units=best_hps.get('units'), 
               activation=best_hps.get('activation'), 
               input_shape=(n_steps, 1)))
model.add(Dense(units=1))
model.compile(optimizer=best_hps.get('optimizer'), 
              loss=best_hps.get('loss'))

model.fit(X_train, y_train, epochs=500, batch_size=32, verbose=1)

forecast = []
input_data = scaled_data[-n_steps:].reshape((1, n_steps, 1))
for i in range(7):
    prediction = model.predict(input_data, verbose=0)
    forecast.append(prediction[0][0])
    input_data = np.append(input_data[:,1:,:], prediction.reshape(1,1,1), axis=1)

forecast = scaler.inverse_transform(np.array(forecast).reshape(-1,1))

forecast_dates = pd.date_range(start=aggregated_df.index[-1] + pd.Timedelta(days=1), periods=7)
lstm_df = pd.DataFrame({'timestamp': forecast_dates, 'compound': forecast.flatten()})

lstm_df

Epoch 1/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 621ms/step - loss: 1.0769
Epoch 2/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 0.8988
Epoch 3/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.7505
Epoch 4/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 0.6267
Epoch 5/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 0.5233
Epoch 6/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.4369
Epoch 7/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 0.3647
Epoch 8/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 0.3044
Epoch 9/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 0.2540
Epoch 10/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 0.2120
Epoch 11

Unnamed: 0,timestamp,compound
0,2009-06-26,0.10262
1,2009-06-27,0.114459
2,2009-06-28,0.123827
3,2009-06-29,0.131958
4,2009-06-30,0.139741
5,2009-07-01,0.146245
6,2009-07-02,0.15172


### Forecasts visualisation

In [24]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output

# Підготовка даних та прогнозів (як раніше було описано)
# ... (ваш код для підготовки даних та побудови моделей ARIMA та LSTM, а також отримання прогнозів)

# Припустимо, що у вас вже є aggregated_df з оригінальними даними
# arima_df та lstm_df з прогнозованими даними від ARIMA та LSTM відповідно

# Створення додатку Dash
app = Dash(__name__)

app.layout = html.Div([
    dcc.Checklist(
        id='forecast-selection',
        options=[
            {'label': 'ARIMA Forecast', 'value': 'ARIMA'},
            {'label': 'LSTM Forecast', 'value': 'LSTM'}
        ],
        value=['ARIMA', 'LSTM'],
        labelStyle={'display': 'inline-block'}
    ),
    dcc.Dropdown(
        id='forecast-days',
        options=[
            {'label': '1 Day', 'value': 1},
            {'label': '3 Days', 'value': 3},
            {'label': '7 Days', 'value': 7}
        ],
        value=1
    ),
    dcc.Graph(id='sentiment-forecast-graph')
])

@app.callback(
    Output('sentiment-forecast-graph', 'figure'),
    Input('forecast-selection', 'value'),
    Input('forecast-days', 'value')
)
def update_graph(selected_forecasts, forecast_days):
    fig = go.Figure()
    
    # Додавання оригінальних даних
    fig.add_trace(go.Scatter(x=aggregated_df.index, y=aggregated_df['compound'], 
                             mode='lines', name='Original Data', line=dict(color='blue')))
    
    # Додавання прогнозів відповідно до вибору
    if 'ARIMA' in selected_forecasts:
        arima_df_subset = arima_df.iloc[:forecast_days]
        fig.add_trace(go.Scatter(x=arima_df_subset['timestamp'], y=arima_df_subset['compound'], 
                                 mode='lines', name='ARIMA Forecast', line=dict(color='green')))
    
    if 'LSTM' in selected_forecasts:
        lstm_df_subset = lstm_df.iloc[:forecast_days]
        fig.add_trace(go.Scatter(x=lstm_df_subset['timestamp'], y=lstm_df_subset['compound'], 
                                 mode='lines', name='LSTM Forecast', line=dict(color='red')))
    
    # Налаштування осей та заголовків
    fig.update_layout(title='Original Data vs Forecasts',
                      xaxis_title='Date',
                      yaxis_title='Compound Sentiment',
                      hovermode='x unified')

    # Додавання слайдерів діапазону
    fig.update_xaxes(rangeslider_visible=True)

    return fig

if __name__ == '__main__':
    app.run_server(debug=True)
