# TIME SERIES FORECASTING OF PROJECT TWEETS BIG DATA PROCESSED WITH SPARK AND STORED IN MONGODB

# Dataset
The dataste is a large dataset gleaned from the twitter API that is called ProjectTweets.csv.

This dataset contains 1,600,000 tweets extracted using the twitter api. 


Content
It contains the following 5 fields:
- ids: The id of the tweet (eg. 4587)
- date: the date of the tweet (eg. Sat May 16 23:58:44 UTC 2009)
- flag: The query (eg. lyx). If there is no query, then this value is NO_QUERY.
- user: the user that tweeted (eg. bobthebuilder)
- text: the text of the tweet (eg. Lyx is cool)

## Install all Required Libraries

In [1]:
#Import all necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import col, lower, regexp_replace, trim, split, udf
from pyspark.sql.functions import isnull, to_timestamp
from pyspark.sql.functions import udf
from pyspark.sql.functions import to_timestamp
from pyspark.sql.types import StructType, StringType, TimestampType
from pyspark.sql.functions import col
import warnings
warnings.filterwarnings('ignore') # We can suppress the warnings

In [2]:
#Creating a pyspark session connecting to mongodb
uri = "mongodb://172.17.0.8:27017/DeeProject_mongo.Tweets"

spark = SparkSession.builder.appName("Write into MongoDB")\
    .config("spark.mongodb.input.uri", uri)\
    .config("spark.mongodb.output.uri", uri)\
    .config('spark.jars.packages','org.mongodb.spark:mongo-spark-connector_2.12:2.4.2')\
    .getOrCreate()

In [3]:
#Spark content
spark

# Step one: Writing Data into MongoDB using Apache Spark

## Loading data from local machine to SParkSQL

In [4]:
#Set legacy timeParserPolicy
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

#Define the schema for the csv file 
schema = StructType().add("_c0", StringType(), True).add("_c1", StringType(), True).add("_c2", StringType(), True).add("_c3", StringType(), True).add("_c4", StringType(), True).add("_c5", StringType(), True)

#Read the CSV into a DataFrame called df
df = spark.read.format("csv").option("header", False).schema(schema).load("file:///home/jovyan/Diana/ProjectTweets.csv")

#Rename the headers
df = df.withColumnRenamed("_c0", "PRIMARY KEY").withColumnRenamed("_c1", "ID").withColumnRenamed("_c2", "date").withColumnRenamed("_c3", "flag").withColumnRenamed("_c4", "user").withColumnRenamed("_c5", "text")

#Convert string date to TimestampType
df = df.withColumn("date", to_timestamp(df["date"], "EEE MMM dd HH:mm:ss zzzz yyyy"))

#Print schema
df.printSchema()

#Show DataFrame
df.show(5)


root
 |-- PRIMARY KEY: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)

+-----------+----------+-------------------+--------+---------------+--------------------+
|PRIMARY KEY|        ID|               date|    flag|           user|                text|
+-----------+----------+-------------------+--------+---------------+--------------------+
|          0|1467810369|2009-04-07 05:19:45|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|          1|1467810672|2009-04-07 05:19:49|NO_QUERY|  scotthamilton|is upset that he ...|
|          2|1467810917|2009-04-07 05:19:53|NO_QUERY|       mattycus|@Kenichan I dived...|
|          3|1467811184|2009-04-07 05:19:57|NO_QUERY|        ElleCTF|my whole body fee...|
|          4|1467811193|2009-04-07 05:19:57|NO_QUERY|         Karoli|@nationwideclass ...|
+-----------+----------+------------------

## Write data from spark to MongoDB

In [5]:
#Write data into MongoDB
df.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").option("uri", uri).save()

# Step Two: Read the Project Tweets data from MongoDB using Spark

In [6]:
#Read Data from MongoDB
from_mongo = spark.read.format('com.mongodb.spark.sql.DefaultSource').load()
print((from_mongo.count(), len(from_mongo.columns)))
from_mongo.printSchema()
from_mongo.show(5)

(1600000, 7)
root
 |-- ID: string (nullable = true)
 |-- PRIMARY KEY: string (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- flag: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user: string (nullable = true)

+----------+-----------+--------------------+-------------------+--------+--------------------+-------------+
|        ID|PRIMARY KEY|                 _id|               date|    flag|                text|         user|
+----------+-----------+--------------------+-------------------+--------+--------------------+-------------+
|1551363506|     816210|{66367bb0e048fa3c...|2009-04-18 15:51:40|NO_QUERY|@ctribe I hope yo...|prosario_2000|
|2059493951|     408810|{66367bb0e048fa3c...|2009-06-07 00:02:45|NO_QUERY|Kinda scared to s...|        l7l7v|
|1990436550|    1223636|{66367bb0e048fa3c...|2009-06-01 11:52:03|NO_QUERY|@karinhoegh  Didn...|         kmdk|
|1990436582|    122363

# EXPLORATORY DATA ANALYSIS

# Checking for Duplicates (based on ID, user and text) and Missing data

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum  # Import the 'col' and 'sum' functions

#Initialize Spark Session
spark = SparkSession.builder.appName("Duplicate and Missing Data Check").getOrCreate()

#Define the columns to check for duplicates and missing values
columns_to_check = ['PRIMARY KEY','ID', 'user', 'text']

#Create a pipeline to check for duplicates and missing values
pipeline_df = from_mongo

#Step 1: Remove duplicate records based on specified columns
pipeline_df = pipeline_df.dropDuplicates(subset=columns_to_check)

#Step 2: Check for missing values
missing_counts = pipeline_df.select([col(c).isNull().cast("int").alias(c) for c in columns_to_check]).agg(*[sum(c).alias(c) for c in columns_to_check]).collect()[0]

#Print the results
print("Number of duplicate records removed:", df.count() - pipeline_df.count())

print("Missing value counts:")
for col_name, missing_count in zip(columns_to_check, missing_counts):
    print(col_name, missing_count)

Number of duplicate records removed: 0
Missing value counts:
PRIMARY KEY 0
ID 0
user 0
text 0


In [8]:
#Checking for duplicates in ID and user name
#Count the number of rows before removing duplicates
count_before = from_mongo.count()

#Remove duplicates
df_no_duplicates = from_mongo.dropDuplicates()

#Count the number of rows after removing duplicates
count_after = df_no_duplicates.count()

#Calculate the number of duplicates
num_duplicates = count_before - count_after

print(f"Number of duplicate rows removed: {num_duplicates}")

Number of duplicate rows removed: 0


In [9]:
#Count the number of rows before removing duplicates
count_before = from_mongo.count()

#Remove duplicates based on a specific column
df_no_duplicates = from_mongo.dropDuplicates(subset=['user'])

#Count the number of rows after removing duplicates
count_after = df_no_duplicates.count()

#Calculate the number of duplicates
num_duplicates = count_before - count_after

print(f"Number of duplicate rows removed based on user: {num_duplicates}")

Number of duplicate rows removed based on user: 940225


# Summary Statistics

In [10]:
#Summary Statistics
from_mongo.describe().show()

+-------+--------------------+------------------+--------+--------------------+--------------------+
|summary|                  ID|       PRIMARY KEY|    flag|                text|                user|
+-------+--------------------+------------------+--------+--------------------+--------------------+
|  count|             1600000|           1600000| 1600000|             1600000|             1600000|
|   mean|1.9988175522956276E9|          799999.5|    NULL|                NULL| 4.325887521835714E9|
| stddev|1.9357607362267783E8|461880.35968924506|    NULL|                NULL|5.162733218454889E10|
|    min|          1467810369|                 0|NO_QUERY|                 ...|        000catnap000|
|    max|          2329205794|            999999|NO_QUERY|ï¿½ï¿½ï¿½ï¿½ï¿½ß§...|          zzzzeus111|
+-------+--------------------+------------------+--------+--------------------+--------------------+



# EXTRACTING TIME COMPONENTS

In [11]:
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second


#Initialize Spark Session
spark = SparkSession.builder.appName("DateTime Visualization").getOrCreate()

#Extract relevant time components including hours, minutes, and seconds
df = from_mongo.withColumn("year", year("date")).withColumn("month", month("date")).withColumn("day", dayofmonth("date")).withColumn("hour", hour("date")).withColumn("minute", minute("date")).withColumn("second", second("date"))

#Aggregate data
time_series_data = df.groupBy("date","year", "month", "day", "hour", "minute", "second").count().orderBy("year", "month", "day", "hour", "minute", "second")

In [12]:
#View the df DataFrame after extracting time components
df.show(5)

+----------+-----------+--------------------+-------------------+--------+--------------------+-------------+----+-----+---+----+------+------+
|        ID|PRIMARY KEY|                 _id|               date|    flag|                text|         user|year|month|day|hour|minute|second|
+----------+-----------+--------------------+-------------------+--------+--------------------+-------------+----+-----+---+----+------+------+
|1551363506|     816210|{66367bb0e048fa3c...|2009-04-18 15:51:40|NO_QUERY|@ctribe I hope yo...|prosario_2000|2009|    4| 18|  15|    51|    40|
|2059493951|     408810|{66367bb0e048fa3c...|2009-06-07 00:02:45|NO_QUERY|Kinda scared to s...|        l7l7v|2009|    6|  7|   0|     2|    45|
|1990436550|    1223636|{66367bb0e048fa3c...|2009-06-01 11:52:03|NO_QUERY|@karinhoegh  Didn...|         kmdk|2009|    6|  1|  11|    52|     3|
|1990436582|    1223637|{66367bb0e048fa3c...|2009-06-01 11:52:03|NO_QUERY|Need more FPS.......| jflinchbaugh|2009|    6|  1|  11|    52|

In [13]:
#View the Spark DataFrame Features
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- PRIMARY KEY: string (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- flag: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- second: integer (nullable = true)



In [14]:
from pyspark.sql.functions import count

#Group by the year variable and count the occurrences
year_counts = df.groupBy("year").agg(count("*").alias("count")).orderBy("year")

#Show the tabulated counts
year_counts.show()

+----+-------+
|year|  count|
+----+-------+
|2009|1600000|
+----+-------+



In [15]:
from pyspark.sql.functions import count

#Group by the month variable and count the occurrences
month_counts = df.groupBy("month").agg(count("*").alias("count")).orderBy("month")

#Show the tabulated counts
month_counts.show()

+-----+------+
|month| count|
+-----+------+
|    4|100025|
|    5|559073|
|    6|940902|
+-----+------+



In [16]:
from pyspark.sql.functions import count

#Group by the day variable and count the occurrences
day_counts = df.groupBy("day").agg(count("*").alias("count")).orderBy("day")

#Show the tabulated counts
day_counts.show()

+---+------+
|day| count|
+---+------+
|  1| 95449|
|  2|108872|
|  3| 86707|
|  4| 32938|
|  5| 34735|
|  6|104793|
|  7|132564|
|  8| 18566|
| 10| 31551|
| 11|  6217|
| 12|  4186|
| 14| 22026|
| 15| 83309|
| 16| 87524|
| 17| 85236|
| 18|105040|
| 19| 75612|
| 20| 64029|
| 21| 41782|
| 22| 49519|
+---+------+
only showing top 20 rows



In [17]:
from pyspark.sql.functions import count

# Group by the day variable and count the occurrences
day_counts = df.groupBy("month","day").agg(count("*").alias("count")).orderBy("month")

# Show the tabulated counts
day_counts.show()

+-----+---+------+
|month|day| count|
+-----+---+------+
|    4|  7| 20671|
|    4| 18| 17154|
|    4| 20| 18447|
|    4| 21| 11105|
|    4| 19| 32648|
|    5|  4| 28300|
|    5| 10| 31551|
|    5|  2| 31096|
|    5|  3| 26568|
|    5| 22| 41206|
|    5| 24|   169|
|    5| 17| 41205|
|    5| 12|  4186|
|    5| 29| 60227|
|    5| 18| 44564|
|    5| 27| 11619|
|    5| 11|  6217|
|    5| 14| 21526|
|    5| 25|   169|
|    5| 30|104484|
+-----+---+------+
only showing top 20 rows



In [18]:
from pyspark.sql.functions import count

# Group by the day variable and count the occurrences
ID_counts = df.groupBy("ID").agg(count("*").alias("count")).orderBy("ID")

# Show the tabulated counts
ID_counts.show()

+----------+-----+
|        ID|count|
+----------+-----+
|1467810369|    1|
|1467810672|    1|
|1467810917|    1|
|1467811184|    1|
|1467811193|    1|
|1467811372|    1|
|1467811592|    1|
|1467811594|    1|
|1467811795|    1|
|1467812025|    1|
|1467812416|    1|
|1467812579|    1|
|1467812723|    1|
|1467812771|    1|
|1467812784|    1|
|1467812799|    1|
|1467812964|    1|
|1467813137|    1|
|1467813579|    1|
|1467813782|    1|
+----------+-----+
only showing top 20 rows



In [19]:
time_series_data.show(5)

+-------------------+----+-----+---+----+------+------+-----+
|               date|year|month|day|hour|minute|second|count|
+-------------------+----+-----+---+----+------+------+-----+
|2009-04-07 05:19:45|2009|    4|  7|   5|    19|    45|    1|
|2009-04-07 05:19:49|2009|    4|  7|   5|    19|    49|    1|
|2009-04-07 05:19:53|2009|    4|  7|   5|    19|    53|    1|
|2009-04-07 05:19:57|2009|    4|  7|   5|    19|    57|    2|
|2009-04-07 05:20:00|2009|    4|  7|   5|    20|     0|    1|
+-------------------+----+-----+---+----+------+------+-----+
only showing top 5 rows



In [20]:
type(time_series_data)

pyspark.sql.dataframe.DataFrame

In [21]:
df.show(5)

+----------+-----------+--------------------+-------------------+--------+--------------------+-------------+----+-----+---+----+------+------+
|        ID|PRIMARY KEY|                 _id|               date|    flag|                text|         user|year|month|day|hour|minute|second|
+----------+-----------+--------------------+-------------------+--------+--------------------+-------------+----+-----+---+----+------+------+
|1551363506|     816210|{66367bb0e048fa3c...|2009-04-18 15:51:40|NO_QUERY|@ctribe I hope yo...|prosario_2000|2009|    4| 18|  15|    51|    40|
|2059493951|     408810|{66367bb0e048fa3c...|2009-06-07 00:02:45|NO_QUERY|Kinda scared to s...|        l7l7v|2009|    6|  7|   0|     2|    45|
|1990436550|    1223636|{66367bb0e048fa3c...|2009-06-01 11:52:03|NO_QUERY|@karinhoegh  Didn...|         kmdk|2009|    6|  1|  11|    52|     3|
|1990436582|    1223637|{66367bb0e048fa3c...|2009-06-01 11:52:03|NO_QUERY|Need more FPS.......| jflinchbaugh|2009|    6|  1|  11|    52|

In [22]:
type(df)

pyspark.sql.dataframe.DataFrame

In [23]:
df.show(5)

+----------+-----------+--------------------+-------------------+--------+--------------------+-------------+----+-----+---+----+------+------+
|        ID|PRIMARY KEY|                 _id|               date|    flag|                text|         user|year|month|day|hour|minute|second|
+----------+-----------+--------------------+-------------------+--------+--------------------+-------------+----+-----+---+----+------+------+
|1551363506|     816210|{66367bb0e048fa3c...|2009-04-18 15:51:40|NO_QUERY|@ctribe I hope yo...|prosario_2000|2009|    4| 18|  15|    51|    40|
|2059493951|     408810|{66367bb0e048fa3c...|2009-06-07 00:02:45|NO_QUERY|Kinda scared to s...|        l7l7v|2009|    6|  7|   0|     2|    45|
|1990436550|    1223636|{66367bb0e048fa3c...|2009-06-01 11:52:03|NO_QUERY|@karinhoegh  Didn...|         kmdk|2009|    6|  1|  11|    52|     3|
|1990436582|    1223637|{66367bb0e048fa3c...|2009-06-01 11:52:03|NO_QUERY|Need more FPS.......| jflinchbaugh|2009|    6|  1|  11|    52|

In [24]:
df = df.drop("_id", "ID", "PRIMARY KEY")

In [25]:
df.show()

+-------------------+--------+--------------------+---------------+----+-----+---+----+------+------+
|               date|    flag|                text|           user|year|month|day|hour|minute|second|
+-------------------+--------+--------------------+---------------+----+-----+---+----+------+------+
|2009-04-18 15:51:40|NO_QUERY|@ctribe I hope yo...|  prosario_2000|2009|    4| 18|  15|    51|    40|
|2009-06-07 00:02:45|NO_QUERY|Kinda scared to s...|          l7l7v|2009|    6|  7|   0|     2|    45|
|2009-06-01 11:52:03|NO_QUERY|@karinhoegh  Didn...|           kmdk|2009|    6|  1|  11|    52|     3|
|2009-06-01 11:52:03|NO_QUERY|Need more FPS.......|   jflinchbaugh|2009|    6|  1|  11|    52|     3|
|2009-06-01 11:52:04|NO_QUERY|@SteveOGallagher ...|      kittaykat|2009|    6|  1|  11|    52|     4|
|2009-04-18 15:51:39|NO_QUERY|@Boy_Kill_Boy Nop...|Chelsea_Volturi|2009|    4| 18|  15|    51|    39|
|2009-06-01 11:52:04|NO_QUERY|Can't wait for th...|          Mm_Ka|2009|    6|  1|

## Save the df Dataframe as a Pandas

In [26]:
data = df.collect()

In [27]:
#import pandas as pd
#Create pandas DataFrame from the list of rows
pandas_df = pd.DataFrame(data)

In [28]:
pandas_df.columns = df.columns 

In [29]:
pandas_df.shape

(1600000, 10)

In [31]:
#file path where you want to save the CSV file
file_path = "pandas_data.csv"

# Save the DataFrame to a CSV file
pandas_df.to_csv(file_path)

# Save the processed Data to MongoDB

In [32]:
#Write data into MongoDB
df.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").option("uri", uri).save()

## SENTIMENT ANALYSIS COMPARING VADER VS TEXTBLOB, THEN TIME SERIES FORECASTING

## Text/Tweets Processing
This includes the following steps:-
- Read and Load the Dataset
- Exploratory Data Analysis
- Data Visualization of Target Variables
- Data Preprocessing
- Splitting our data into Train and Test sets.
- Transforming Dataset using TF-IDF Vectorizer
- Function for Model Evaluation
- Model Building
- Model Evaluation

# Read the  pandas_data csv

In [None]:
import pandas as pd

#Load the data
df = pd.read_csv("C:/Users/Diana/Documents/Semester 2/sem two ca 2/pandas_data.csv")

## Exploratory Data Analysis

This process involves:-

a) View the first and last few observations of the df dataframe

b) View the number of observations and variables the df dataframe has

c) View the entire df dataframe to check the data types and any missing data in a particluar variable.
 
d) Checking for missing data/dates


In [None]:
#View the first few observations of the df DataFrame
df.head(5)

In [None]:
#View the last few observations of the df DataFrame
df.tail(5)

In [None]:
#View the shape of the df DataFrame
df.shape

In [None]:
#View the features
df.info()

In [None]:
# Convert the 'date_column' to datetime datatype
df['date'] = pd.to_datetime(df['date'])

In [None]:
#Check if the date has been changed to date time
df.info()

In [None]:
#Check the df
df.head(5)

## Drop Variables that will not be used

In [None]:
# Drop all columns except 'date' and 'text'
df = df[['date', 'text']]

In [None]:
#Check if they have been dropped
df.head(5)

In [None]:
# Sort the DataFrame by the 'date' column in ascending order
df = df.sort_values(by='date', ascending=True)

In [None]:
#Check if the date is sorted
df.head(5)

In [None]:
df.info()

In [None]:
# Convert 'date' column to datetime format if it's not already in datetime format
df['date'] = pd.to_datetime(df['date'])

# Extract month from 'date' column
df['month'] = df['date'].dt.to_period('M')

#Group by month and find minimum and maximum date for each month
monthly_date_range = df.groupby('month')['date'].agg([min, max])

#Display the result
print(monthly_date_range)

## The dates have some missing dates
- The data is for 3 months, April, May, June
- April (7/4/2009 - 21/4/2009)
- May (2/5/2009) - 31/5/2009
- June (1/6/2009 -25/6/2009)

In [None]:
# Count the number of dates by month
monthly_date_counts = df.groupby('month').size()

# Display the result
print(monthly_date_counts)


In [None]:
df['day'] = df['date'].dt.to_period('D')

# Count the number of dates by month
dm_date_counts = df.groupby(['month', 'day']).size()

# Display the result
print(dm_date_counts)


There are missing dates in each month

In [None]:
# Count the number of unique dates
unique_date_counts = df['date'].nunique()

# Display the result
print("Number of unique dates:", unique_date_counts)



In [None]:
#View the df
df.head(5)

# EDA of the Texts

In [None]:
#Count the number of words in the text
df['word_count'] = df['text'].apply(lambda x: len(str(x).split(" ")))

In [None]:
df[["text","word_count"]].head(5)

In [None]:
#Find the maximum number of words in the 'word_count' variable of the 'df' DataFrame
largest_word_count = df["word_count"].max()
largest_word_count

In [None]:
#Count the number of characters in the text variable
df['char_count'] = df['text'].str.len() 

In [None]:
#View the head of the char_count and text
df[["text","char_count"]].head(5)

In [None]:
#Find the maximum number of characters in the 'char_count' variable of the 'df' DataFrame
largest_char_count = df["char_count"].max()
largest_char_count

In [None]:
#define a function to calculate the average length of words in a sentence
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

In [None]:
#Apply the avg_word function on text
df['avg_word'] = df['text'].apply(lambda x: avg_word(x))

In [None]:
#View text and avg_word
df[['text','avg_word']].head()

In [None]:
#Find the highest average word count of the 'df' DataFrame
highest_avg_word = df["avg_word"].max()
highest_avg_word

In [None]:
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
#count number of stopwords in each text and store in a variable called stopwords
df['stopwords'] = df['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
df[['text','stopwords']].head(5)

In [None]:
#Find the maximum number of stopwords in the 'df' DataFrame
maximum_no_stopwords = df["stopwords"].max()
maximum_no_stopwords

In [None]:
#Count number of hashtags in each text and store in a variable called hastags
df['hashtags'] = df['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df[['text','hashtags']].head(5)

In [None]:
#Find the maximum number of hashtags in the 'df' DataFrame
maximum_no_hashtags = df["hashtags"].max()
maximum_no_hashtags

In [None]:
#count number of @ signs in text variable and store the value in a variable called at_sign
df['at_sign'] = df['text'].apply(lambda x: len([x for x in x.split() if x.startswith('@')]))
df[['text','at_sign']].head(5)

In [None]:
#Find the maximum number of at_sign in the 'df' DataFrame
maximum_no_atsign = df["at_sign"].max()
maximum_no_atsign

In [None]:
#count number of numerics in the text variable and store the value in a variable called numeric
df['numerics'] = df['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df[['text','numerics']].head(5)

In [None]:
#Find the maximum number of numerics in the 'df' DataFrame
maximum_no_numerics = df["numerics"].max()
maximum_no_numerics

In [None]:
#count number of uppercases in the text variable and store the value in a variable called upper
df['upper'] = df['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
df[['text','upper']].head(5)

In [None]:
#Find the maximum number of uppercases in the 'df' DataFrame
maximum_no_uppercases = df["upper"].max()
maximum_no_uppercases

## Tweets Processing
Tweet/text processing involves:-

a) Text Normalization 
- Remove special characters
- Change the upper cases to lower cases
- Remove numbers/integers
- Remove punctuations
- Remove white space
- Remove URLS/links

b) Tokenization
- Tokenization

c) Remove stopwords

d) Stemming/ lemmatization

e

## Text Normalization

### Remove user name from text @username

In [None]:
import re

#Define a function to remove user names from text
def remove_usernames(text):
    return re.sub(r'@\w+', '', text)

#Apply the function to the 'text' variable
df['text1'] = df['text'].apply(remove_usernames)

# Print the first few rows of the DataFrame with user names removed from text
print(df['text1'].head())


### Remove url/www.links

In [None]:
import re

def remove_urls(df):
    #Define regex pattern to match URLs
    url_pattern = r'https?://\S+|www\.\S+'
    
    #Apply regex substitution to each row of the 'text' column
    df['text1'] = df['text1'].apply(lambda text: re.sub(url_pattern, '', text))
    
    return df

# remove a url
df = remove_urls(df)


In [None]:
df.head(5)

### Remove all special characters

In [None]:
#Remove all special characters
df['text1'] = df['text1'].str.replace('[^\w\s]','')
df['text1'].head(5)

### Remove punctuations

In [None]:
#Remove punctuations
import string

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

#Apply remove_punctuation function to the 'text' column
df['text1'] = df['text1'].apply(remove_punctuation)


In [None]:
df.head(5)

### Convert all upper cases to lower

In [None]:
#Convert all uppercases to lower cases
df['text1'] = df['text1'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['text1'].head(5)

## Tokenization

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

## Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize

#Tokenize the text using NLTK's word tokenizer
df['tokenized_text'] = df['text1'].apply(lambda x: ' '.join(word_tokenize(x.lower())))

#Print the first few rows of the DataFrame with tokenized text
df[['text1', 'tokenized_text']].head(5)

## Remove stopwords

In [None]:
from nltk.corpus import stopwords

#Download NLTK resources for the first time
nltk.download('stopwords')

#Get the English stopwords list
stop_words = set(stopwords.words('english'))

#Function to remove stopwords
def remove_stopwords(tokenized_text):
    return ' '.join([word for word in tokenized_text.split() if word.lower() not in stop_words])

#Remove stopwords from the tokenized text column
df['tokenized_text'] = df['tokenized_text'].apply(remove_stopwords)

# Print the first few rows of the DataFrame with stopwords removed
print(df[['text1', 'tokenized_text']].head())

In [None]:
#Create a Frequency Distribution of the tokenized words
#Import Frequency Distribution
from nltk.probability import FreqDist

#Find frequency distribution of tokenized_text
fdist = FreqDist(df['tokenized_text'])

In [None]:
#Check top 5 common words
fdist.most_common(5)

In [None]:
#Create a frequency distribution plot
import matplotlib.pyplot as plt

#Plot Frequency Distribution
fdist.plot(20,cumulative=False)
plt.show()

## Stemming/ lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Initialize WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize text
def lemmatize_text(tokenized_text):
    #Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokenized_text.split()]
    
    #Join the lemmatized tokens back into a string
    lemmatized_text = ' '.join(lemmatized_tokens)
    
    return lemmatized_text

#Apply lemmatization to 'tokenized_text' column in DataFrame df
df['preprocessed_text'] = df['tokenized_text'].apply(lemmatize_text)

#Print the first few rows of the DataFrame with lemmatized text
print(df[['tokenized_text', 'preprocessed_text']].head(5))

# SENTIMENT ANALYSIS
There are various ways to perform sentiment analysis. These include:-
- Using Text Blob
- Using Vader
- Using Bag of Words Vectorization-based Models
- Using LSTM-based Models
- Using Transformer-based Models

## USING TEXTBLOB
-  It takes text as an input and can return polarity and subjectivity as outputs.

- Polarity determines the sentiment of the text. Its values lie in [-1,1] where -1 denotes a highly negative sentiment and 1 denotes a highly positive sentiment.

- Subjectivity determines whether a text input is factual information or a personal opinion. Its value lies between [0,1] where a value closer to 0 denotes a piece of factual information and a value closer to 1 denotes a personal opinion.

In [None]:
from textblob import TextBlob

#Apply sentiment analysis using TextBlob to the 'text' column and storing the polarity score
df['polarity_score'] = df['preprocessed_text'].apply(lambda x: TextBlob(x).sentiment[0] )
df[['preprocessed_text','polarity_score']].head(5)

In [None]:
#View the df, first 5 observations
df.head(5)

In [None]:
#View data types of the variables
df.info()

In [None]:
# Function to categorize sentiment polarities
def categorize_sentiment(polarity):
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the function to classify polarity scores and store the result in 'textblob_sentiment' column
df['textblob_sentiment'] = df['polarity_score'].apply(categorize_sentiment)

# Print the first few rows of the DataFrame with 'textblob_sentiment' column
print(df[['preprocessed_text', 'polarity_score', 'textblob_sentiment']].head())

In [None]:
# Function to categorize sentiment polarities
# Categorize sentiment polarities
def categorize_sentiment(polarity):
    if polarity > 0:
        return 1  # Positive
    elif polarity < 0:
        return -1  # Negative
    else:
        return 0  # Neutral

# Apply the function to classify polarity scores and store the result in 'textblob_sentiment' column
df['textblobsentiment'] = df['polarity_score'].apply(categorize_sentiment)

#View the first few rows of the DataFrame with 'textblob_sentiment' column
df[['preprocessed_text', 'polarity_score', 'textblob_sentiment','textblobsentiment']].head(5)

# USING VADER SENTIMENT ANALYSIS

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

In [None]:
#Create a VADER SentimentIntensityAnalyzer instance
twitter_sentiment = SentimentIntensityAnalyzer()

#Create a function to get compound sentiment score using VADER
def get_sentiment_scores(text):
    return twitter_sentiment.polarity_scores(text)

#Apply the function to get the compound scores and store the values in vader_sentiment
df['vader_sentiment'] = df['preprocessed_text'].apply(get_sentiment_scores)

#Extract the 'neg', 'neu', 'pos', 'compound' scores from 'vader_sentiment'
df['vader_neg'] = df['vader_sentiment'].apply(lambda x: x['neg'])
df['vader_neu'] = df['vader_sentiment'].apply(lambda x: x['neu'])
df['vader_pos'] = df['vader_sentiment'].apply(lambda x: x['pos'])
df['vader_compound'] = df['vader_sentiment'].apply(lambda x: x['compound'])

#Create a function to categorize sentiments based on compound scores
def categorize_sentiment(vader_compound):
    if vader_compound >= 0.05:
        return 'positive'
    elif vader_compound <= -0.05:
        return 'negative'
    else:
        return 'neutral'

#Apply the categorization function to create the 'sentiment' column
df['sentiment_vader'] = df['vader_compound'].apply(categorize_sentiment)

#Display the updated DataFrame with the 'sentiment' column
print(df[['preprocessed_text','vader_compound','sentiment_vader']])

In [None]:
#View the first 5 observations
df.head(5)

# Determine which one to use between vader and textblob by evaluating their performance using 

# Using TfidfVectorizer, countvectorizer AND MNB 

## Textblob sentiments

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

#Vectorize the Text Data using TF-IDF
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
                        stop_words='english', ngram_range=(1,1))
X = tfidf.fit_transform(df['text'])
y = df['textblob_sentiment']

#Split the data into Train-Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Create a classifier for MNB
classifier = MultinomialNB()

#Train the Classifier
classifier.fit(X_train, y_train)

#Evaluate the Model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

## vader sentiment

In [None]:
#Vectorize the Text Data using TF-IDF
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
                        stop_words='english', ngram_range=(1,1))
X = tfidf.fit_transform(df['text'])
y = df['sentiment_vader']

#Split the data into Train-Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Create a classifier for MNB
classifier = MultinomialNB()

#Train the Classifier
classifier.fit(X_train, y_train)

#Evaluate the Model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

## USING COUNTVECTORIZER

In [None]:
#Pre-Processing using Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv    = CountVectorizer(stop_words = 'english',ngram_range = (1, 1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(df['text'])

text_counts

## textblob sentiments using count vectorizer

In [None]:
#Split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, df['textblob_sentiment'], test_size=0.2, random_state=42)

#Training the model
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

#Caluclate the accuracy score of the model
from sklearn import metrics
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print("Accuracuy Score: ",accuracy_score)

## vader sentiments using count vectorizer

In [None]:
#Split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, df['sentiment_vader'], test_size=0.25, random_state=5)

#Training the model
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

#Caluclate the accuracy score of the model
from sklearn import metrics
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print("Accuracuy Score: ",accuracy_score)

In [None]:
#Check the variables data type
df.info()

Since the accuracy of sentiments extracted by textblob is higher sentiments extracted by textblob will be used

## Encode the Sentiments
Try 
label and one hot and see which one performs better

## Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Fit and transform the 'sentiment' column
df['label_encoded'] = label_encoder.fit_transform(df['textblob_sentiment'])

## One-hot encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder object
onehot_encoder = OneHotEncoder()

# Fit and transform the 'sentiment' column
sentiment_onehot = onehot_encoder.fit_transform(df['textblob_sentiment'].values.reshape(-1, 1))

# Convert the one-hot encoded result to a DataFrame
sentiment_onehot_df = pd.DataFrame(sentiment_onehot.toarray(), columns=onehot_encoder.categories_[0])

# Concatenate the one-hot encoded DataFrame with the original DataFrame
df = pd.concat([df, sentiment_onehot_df], axis=1)


In [None]:
df.info()

In [126]:
# Drop all columns that are unused
df2 = df[['preprocessed_text','sentiment_encoded','textblob_sentiment','sentiment_onehot_df']]

# Display the resulting DataFrame
df2.head(5)

Unnamed: 0_level_0,preprocessed_text,sentiment_encoded,textblob_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-04-07 05:19:45,awww thats bummer shoulda got david carr third...,2,Positive
2009-04-07 05:19:49,upset cant update facebook texting might cry r...,1,Neutral
2009-04-07 05:19:53,dived many time ball managed save 50 rest go b...,2,Positive
2009-04-07 05:19:57,whole body feel itchy like fire,2,Positive
2009-04-07 05:19:57,behaving im mad cant see,0,Negative


In [127]:
#Vie the data info
df2.head(5)

Unnamed: 0_level_0,preprocessed_text,sentiment_encoded,textblob_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-04-07 05:19:45,awww thats bummer shoulda got david carr third...,2,Positive
2009-04-07 05:19:49,upset cant update facebook texting might cry r...,1,Neutral
2009-04-07 05:19:53,dived many time ball managed save 50 rest go b...,2,Positive
2009-04-07 05:19:57,whole body feel itchy like fire,2,Positive
2009-04-07 05:19:57,behaving im mad cant see,0,Negative


In [128]:
#Check the df2 info
df2.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1600000 entries, 2009-04-07 05:19:45 to 2009-06-25 17:28:31
Data columns (total 3 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   preprocessed_text   1600000 non-null  object
 1   sentiment_encoded   1600000 non-null  int32 
 2   textblob_sentiment  1600000 non-null  object
dtypes: int32(1), object(2)
memory usage: 75.0+ MB


In [129]:
df2.head(5)

Unnamed: 0_level_0,preprocessed_text,sentiment_encoded,textblob_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-04-07 05:19:45,awww thats bummer shoulda got david carr third...,2,Positive
2009-04-07 05:19:49,upset cant update facebook texting might cry r...,1,Neutral
2009-04-07 05:19:53,dived many time ball managed save 50 rest go b...,2,Positive
2009-04-07 05:19:57,whole body feel itchy like fire,2,Positive
2009-04-07 05:19:57,behaving im mad cant see,0,Negative


In [130]:
df2.describe()

Unnamed: 0,sentiment_encoded
count,1600000.0
mean,1.220185
std,0.7618375
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,2.0


## Class imbalance

In [131]:
#Count occurrences of each sentiment class
sentiment_counts = df2['textblob_sentiment'].value_counts()

# Display the result as a table
print("Sentiment Counts:")
print(sentiment_counts)


Sentiment Counts:
Positive    679250
Neutral     593796
Negative    326954
Name: textblob_sentiment, dtype: int64


There is class imbalance, in the sentiment data. Class imbalance occurs when one class (or classes) has significantly more samples than the other classes. In this case, the counts of sentiment classes are as follows:


Class 2: 679,250 samples

Class 0: 593,796 samples

Class 1: 326,954 samples
    
The class imbalance can potentially affect the performance of the machine learning model, especially if the minority class (in this case, class 1) 

To address class imbalance here are some techniques to be considered:-

- Resampling: Either oversampling the minority class (creating more samples of the minority class) or undersampling the majority class (removing samples from the majority class).

- Class weights: Assigning higher weights to the minority class during model training to give it more importance.

- Synthetic data generation: Generating synthetic samples for the minority class using techniques like SMOTE (Synthetic Minority Over-sampling Technique).

- Different algorithms: Using algorithms that are less sensitive to class imbalance, such as decision trees or random forests.

    


I will explore two options
- Resampling
- Class weights

## Resampling
Resampling is a technique used to address class imbalance by either oversampling the minority class (creating more samples of the minority class) or undersampling the majority class (removing samples from the majority class). 

# Oversampling the minority class (i.e Sentiment = class -1)

# LSTM for sentiment analysis
The steps to be considered include:-

- Preprocess the text data- i am using already preprocessed text data
- Tokenize the preprocessed data
- Prepare data for training
- split the data into training and test set
- define and train the LSTM model
- Evaluate the model

In [132]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from sklearn.model_selection import train_test_split

In [133]:
#tokenize the preprocessed text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df2['preprocessed_text'])
sequences = tokenizer.texts_to_sequences(df2['preprocessed_text'])

In [134]:
#Find the maximum sequence length
max_sequence_length = max(len(seq) for seq in sequences)

print("Maximum sequence length:", max_sequence_length)

Maximum sequence length: 37


In [135]:
#Prepare the data for training
max_len = 37  # Max sequence length
X = pad_sequences(sequences, maxlen=max_len)
y = df2['sentiment_encoded'].values

In [136]:
print(X)

[[   0    0    0 ... 9052 1724    2]
 [   0    0    0 ...   12  179 1069]
 [   0    0    0 ...  369    6 2984]
 ...
 [   0    0    0 ...  954 2414   49]
 [   0    0    0 ... 1359   36   66]
 [   0    0    0 ...   99  104 1321]]


In [137]:
X.shape

(1600000, 37)

In [138]:
print(y)

[2 1 2 ... 0 2 2]


In [139]:
y.shape

(1600000,)

In [140]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape,y_test.shape

((1280000, 37), (320000, 37), (1280000,), (320000,))

In [141]:
print("X_train dtype:", X_train.dtype)
print("y_train dtype:", y_train.dtype)

X_train dtype: int32
y_train dtype: int32


In [142]:
#Define and train the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_len))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))  # Output layer with 3 units for 3 sentiment classes

In [None]:
#Model compilation and training
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=128, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10

In [80]:
#Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy:', accuracy)


MemoryError: Unable to allocate 17.9 GiB for an array with shape (1600000, 3000) and data type int32

In [None]:
#Check shape of train and test df
train.shape, test.shape

# LONG-SHORT-TERM MEMORY (LSTM)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from keras.models import Sequential
from keras.layers import LSTM, Dense

## Drop unused variables

# LSTM (Long Short Term Memory)

# ONE-HOT ENCODING

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the sentiment_vader variable
onehot_encoder = OneHotEncoder(sparse=False)
sentiment_encoded = onehot_encoder.fit_transform(df[['sentiment_vader']])
vader_df = pd.concat([df[['date']], pd.DataFrame(sentiment_encoded, columns=onehot_encoder.categories_[0])], axis=1)

In [None]:
vader_df.head(2)

# Split the data into training and test set

In [94]:
train.shape, test.shape

((1280000, 4), (320000, 4))

In [96]:
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)

In [97]:
#Reshape into X=t and Y=t+1
look_back = 1
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

InvalidIndexError: (slice(0, 1, None), 0)

# Implement LSTM

In [None]:
#Implement LSTM model
def create_dataset(X, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X[i:(i + time_steps)]
        Xs.append(v)
        ys.append(X[i + time_steps])
    return np.array(Xs), np.array(ys)

TIME_STEPS = 10
X_train, y_train = create_dataset(train_sentiment_scaled, TIME_STEPS)
X_test, y_test = create_dataset(test_sentiment_scaled, TIME_STEPS)

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

model_lstm = Sequential()
model_lstm.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model_lstm.add(LSTM(units=50))
model_lstm.add(Dense(units=1))

model_lstm.compile(optimizer='adam', loss='mean_squared_error')
model_lstm.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)

lstm_predictions_scaled = model_lstm.predict(X_test)
lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)

# Evaluate the models
arima_rmse = np.sqrt(mean_squared_error(test_sentiment, arima_predictions))
lstm_rmse = np.sqrt(mean_squared_error(test_sentiment, lstm_predictions))

print("ARIMA RMSE:", arima_rmse)
print("LSTM RMSE:", lstm_rmse)

# Make predictions for 1 day, 3 days, and 7 days ahead
def forecast_sentiment(model, data, steps):
    last_window = data[-TIME_STEPS:]
    forecast = []
    for _ in range(steps):
        prediction = model.predict(last_window.reshape(1, -1, 1))[0][0]
        forecast.append(prediction)
        last_window = np.roll(last_window, -1)
        last_window[-1] = prediction
    return forecast

# Forecast sentiment using ARIMA
arima_forecast_1day = model_arima_fit.forecast(steps=1)[0][0]
arima_forecast_3day = model_arima_fit.forecast(steps=3)[0][-1]
arima_forecast_7day = model_arima_fit.forecast(steps=7)[0][-1]

# Forecast sentiment using LSTM
lstm_forecast_1day = forecast_sentiment(model_lstm, test_sentiment_scaled[-TIME_STEPS:], 1)
lstm_forecast_3day = forecast_sentiment(model_lstm, test_sentiment_scaled[-TIME_STEPS:], 3)[-1]
lstm_forecast_7day = forecast_sentiment(model_lstm, test_sentiment_scaled[-TIME_STEPS:], 7)[-1]

print("ARIMA 1-day forecast:", arima_forecast_1day)
print("ARIMA 3-day forecast:", arima_forecast_3day)
print("ARIMA 7-day forecast:", arima_forecast_7day)
print("LSTM 1-day forecast:", lstm_forecast_1day)
print("LSTM 3-day forecast:", lstm_forecast_3day)
print("LSTM 7-day forecast:", lstm_forecast_7day)


In [None]:
!pip install dash

In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.arima.model import ARIMA
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Load the dataset
# Assuming df contains at least two columns: 'date' and 'sentiment'

# Preprocess the data
# Ensure 'date' column is in datetime format

# Implement ARIMA model
def train_arima_model(data):
    model_arima = ARIMA(data, order=(5,1,0))
    model_arima_fit = model_arima.fit(disp=0)
    return model_arima_fit

def forecast_arima(model, steps):
    forecast = model.forecast(steps=steps)[0]
    return forecast

# Implement LSTM model
def create_dataset(X, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X[i:(i + time_steps)]
        Xs.append(v)
        ys.append(X[i + time_steps])
    return np.array(Xs), np.array(ys)

def train_lstm_model(data, time_steps=10):
    scaler = MinMaxScaler(feature_range=(0, 1))
    data_scaled = scaler.fit_transform(data)
    X, y = create_dataset(data_scaled, time_steps)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))
    
    model_lstm = Sequential()
    model_lstm.add(LSTM(units=50, return_sequences=True, input_shape=(X.shape[1], 1)))
    model_lstm.add(LSTM(units=50))
    model_lstm.add(Dense(units=1))

    model_lstm.compile(optimizer='adam', loss='mean_squared_error')
    model_lstm.fit(X, y, epochs=100, batch_size=32, verbose=0)
    
    return model_lstm, scaler

def forecast_lstm(model, scaler, data, steps, time_steps=10):
    forecast = []
    last_window = data[-time_steps:]
    for _ in range(steps):
        prediction = model.predict(last_window.reshape(1, -1, 1))[0][0]
        forecast.append(prediction)
        last_window = np.roll(last_window, -1)
        last_window[-1] = prediction
    forecast = scaler.inverse_transform(np.array(forecast).reshape(-1, 1))
    return forecast

# Initialize the Dash app
app = dash.Dash(__name__)

# Define the layout of the dashboard
app.layout = html.Div([
    dcc.Graph(id='forecast-graph'),
    html.Label('Select Model:'),
    dcc.Dropdown(
        id='model-dropdown',
        options=[
            {'label': 'ARIMA', 'value': 'arima'},
            {'label': 'LSTM', 'value': 'lstm'}
        ],
        value='arima'
    ),
    html.Label('Select Forecast Period:'),
    dcc.Dropdown(
        id='period-dropdown',
        options=[
            {'label': '1 Day', 'value': 1},
            {'label': '3 Days', 'value': 3},
            {'label': '7 Days', 'value': 7}
        ],
        value=1
    )
])

# Define callback to update the graph based on user input
@app.callback(
    Output('forecast-graph', 'figure'),
    [Input('model-dropdown', 'value'),
     Input('period-dropdown', 'value')]
)
def update_graph(selected_model, forecast_period):
    if selected_model == 'arima':
        model = train_arima_model(df['sentiment'])
        forecast = forecast_arima(model, forecast_period)
    elif selected_model == 'lstm':
        lstm_model, scaler = train_lstm_model(df['sentiment'])
        forecast = forecast_lstm(lstm_model, scaler, df['sentiment'], forecast_period)
    
    # Generate x-axis values (dates)
    dates = pd.date_range(start=df['date'].iloc[-1], periods=forecast_period + 1)[1:]
    
    # Create the plot
    fig = {
        'data': [
            {'x': dates, 'y': forecast, 'type': 'line', 'name': 'Forecast'}
        ],
        'layout': {
            'title': f'{selected_model.upper()} Forecast for {forecast_period} Days',
            'xaxis': {'title': 'Date'},
            'yaxis': {'title': 'Sentiment'}
        }
    }
    return fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


# TIME SERIES FORECASTING