In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

## 1) Scraping Data Part

In [2]:
# It requires python 3.8 or higher
!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git

Collecting git+https://github.com/JustAnotherArchivist/snscrape.git
  Cloning https://github.com/JustAnotherArchivist/snscrape.git to c:\users\kirolos\appdata\local\temp\pip-req-build-e_wz1ezi
  Resolved https://github.com/JustAnotherArchivist/snscrape.git to commit d72b51953f0ec05ee18761ea31c1bb82f886f7a9
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: snscrape
  Building wheel for snscrape (setup.py): started
  Building wheel for snscrape (setup.py): finished with status 'done'
  Created wheel for snscrape: filename=snscrape-0.4.3.20220107.dev56+gd72b519-py3-none-any.whl size=68190 sha256=ff63d09143bf7e9e1aef6f7085e0b77f04a9069c9133ea3160e4d3786d0485f0
  Stored in directory: C:\Users\Kirolos\AppData\Local\Temp\pip-ephem-wheel-cache-zjwkjc_s\wheels\05\e9\f7\57056e7c7e44b1feed932fa49fdec9d706c4f563e37160ab74
Successfully built snscrape
Installing collected packages: snscrape
Successfully insta

  Running command git clone --filter=blob:none --quiet https://github.com/JustAnotherArchivist/snscrape.git 'C:\Users\Kirolos\AppData\Local\Temp\pip-req-build-e_wz1ezi'


In [3]:
# It spent 3 hours for scraping more than 180,000 tweets
text_query = "$NFLX"
since_date = "2018-01-01"
until_date = "2022-07-11"
os.system('snscrape --jsonl --since {} twitter-search "{} until:{}"> text-query-tweets.json'.format(since_date, text_query, until_date))

0

### a) Reading the output jason file by pandas as a Dataframe

In [5]:
# Reading jason files as dataframes
tweets_df = pd.read_json('text-query-tweets.json', lines=True)
# tweets_df2 = pd.read_json('text-query-tweets2.json', lines=True)
# Merging both dataframes as a single dataframe
# tweets_df = pd.concat([tweets_df1,tweets_df2],ignore_index=True)
# Selecting the important columns only wich are Data,renderContent and Lang
tweets_content = tweets_df.loc[:,['date','renderedContent','lang']]
# Choosing the tweets in english language only
tweets_content = tweets_content[tweets_content['lang']=='en']
# Dropping the lang column
tweets_content.drop("lang",axis=1,inplace=True)
# Download the CSV file result on the current folder.
tweets_content.to_csv('Ntweets.csv',index=False)

## 2) NLP Part

In [6]:
!pip install demoji
import demoji
import re
import string
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
import attr
import nltk



### a) Preprocessing

In [7]:
# This function is used to pass the POS tage for each word passed through clean_text function
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [8]:
# Cleaning tweets
def clean_text(text):
    # Initialization the twitter tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True) 
    # Initialization the lemmatizer
    lemmatizer = WordNetLemmatizer()  
    # Trying to avoid deleting the negative verbs as it affects the meaning of the tweets.
    stop_words = stopwords.words('english') + ["i'll","i'm", "should", "could"]
    negative_verbs = [ "shan't",'shouldn',"shouldn't",'wasn','weren','won','wouldn','aren','couldn','didn','doesn','hadn','hasn','haven','isn','ma','mightn','mustn',"mustn't",'needn',"needn't","wouldn't","won't","weren't","wasn't","couldn","not","nor","no","mightn't","isn't","haven't","hadn't","hasn't","didn't","doesn't","aren't","don't","couldn't","never"]
    stop_words =[word for word in stop_words if word not in negative_verbs ] 
    
    # Lowering tweets
    lower_tweet = text.lower() 
    # Removing hashtag and cashtag symbols
    tweet = re.sub(r"[#$]"," ",lower_tweet)
    # Removing links from tweets
    tweet = re.sub(r"https?:\/\/.*[\r\n]*"," ", tweet)
    # Translating emojies into thier descriptions
    tweet = demoji.replace_with_desc(tweet)
    # removing numerical values
    tweet = re.sub(r"[0-9]|-->","",tweet)
    # Tokenize the tweets by twitter tokenzier.
    tweet = tk.tokenize(tweet)
    # Choosing the words that don't exist in stopwords, thier lengths are more than 2 letters and then lemmatize them.
    tweet = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tweet if word not in stop_words and word not in string.punctuation and len(word)>2 and "." not in word]
    # return the tokens in one sentence 
    tweet = " ".join(tweet)
    
    return tweet

In [9]:
# Read the scrapped file
tweets = pd.read_csv("Ntweets.csv")
tweets.head()

Unnamed: 0,date,renderedContent
0,2019-12-31 23:54:41+00:00,Original content rather than licensed titles doninating $NFLX's 2020 viewership (Netflix data taken with a pinch of salt as ever).
1,2019-12-31 23:43:26+00:00,$FB $AMZN $GOOGL $NFLX #FANG 2019 https://t.co/RdVlnrPiR3
2,2019-12-31 23:13:37+00:00,i think many retail folks early next week will look back on this end of year tape paint in markets today as a missed opportunity to sell the longs they were waiting to sell due to 2021 tax deferrance. \n\nWe'll see.\n\n$SPX $SPY $TSLA $NFLX $AAPL $AMD $NVDA $VIX $VXX $VXXB $QQQ $IBB
3,2019-12-31 23:01:00+00:00,#Netflix continues to grow its global subscriber base but it's about to face some stiff competition from #Apple and #Disney. #stocks $NFLX $DIS $AAPL https://t.co/sIJu4fzAl0
4,2019-12-31 22:56:49+00:00,"If only... $NFLX and chill? No, $NFLX &amp; $DPZ. Both started the decade at 7.93 and 8.53 respectively. nearly 4000% gain."


In [10]:
# Applying text cleaning and then downloading it on the current folder
tweets['cleaned'] = tweets["renderedContent"].apply(lambda row:clean_text(row))
tweets.to_csv("CleanedNTweets.csv",index=False)

### b) Sentiment analysis by pretrained model

In [11]:
!pip install transformers
!pip install transformers[sentencepiece]



In [12]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

  example_input = torch.tensor([[-3, -2, -1], [0, 1, 2]])


In [13]:
tweets = pd.read_csv("CleanedNTweets.csv")
tweets.head()

Unnamed: 0,date,renderedContent,cleaned
0,2019-12-31 23:54:41+00:00,Original content rather than licensed titles doninating $NFLX's 2020 viewership (Netflix data taken with a pinch of salt as ever).,original content rather license title doninating nflx's viewership netflix data take pinch salt ever
1,2019-12-31 23:43:26+00:00,$FB $AMZN $GOOGL $NFLX #FANG 2019 https://t.co/RdVlnrPiR3,amzn googl nflx fang
2,2019-12-31 23:13:37+00:00,i think many retail folks early next week will look back on this end of year tape paint in markets today as a missed opportunity to sell the longs they were waiting to sell due to 2021 tax deferrance. \n\nWe'll see.\n\n$SPX $SPY $TSLA $NFLX $AAPL $AMD $NVDA $VIX $VXX $VXXB $QQQ $IBB,think many retail folk early next week look back end year tape paint market today miss opportunity sell longs wait sell due tax deferrance we'll see spx spy tsla nflx aapl amd nvda vix vxx vxxb qqq ibb
3,2019-12-31 23:01:00+00:00,#Netflix continues to grow its global subscriber base but it's about to face some stiff competition from #Apple and #Disney. #stocks $NFLX $DIS $AAPL https://t.co/sIJu4fzAl0,netflix continue grow global subscriber base face stiff competition apple disney stock nflx dis aapl
4,2019-12-31 22:56:49+00:00,"If only... $NFLX and chill? No, $NFLX &amp; $DPZ. Both started the decade at 7.93 and 8.53 respectively. nearly 4000% gain.",nflx chill nflx dpz start decade respectively nearly gain


In [16]:
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# TF
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

All the layers of TFXLMRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.


('cardiffnlp/twitter-xlm-roberta-base-sentiment\\tokenizer_config.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment\\special_tokens_map.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment\\sentencepiece.bpe.model',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment\\added_tokens.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment\\tokenizer.json')

In [17]:
def polarity(text):
    encoded_input = tokenizer(text, return_tensors='tf')
    output = model(encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)
    
    # Print labels and scores
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    l = config.id2label[ranking[0]]
    plrty = -1 if l == "Negative" else 1 if l == "Positive" else 0 
    s = np.round(float(scores[ranking[0]]), 4)
    return (l,plrty)

In [18]:
# downloading the file after applying sentiment analysis on the current folder
tweets['label'],tweets['Polarity'] = zip(*tweets['cleaned'].apply(lambda txt:polarity(txt)))
tweets.to_csv("polarizedTweets.csv",index=False)

## 3) Preparing Data for time series model

In [19]:
ptweets = pd.read_csv("polarizedTweets.csv")
ptweets.head()

Unnamed: 0,date,renderedContent,cleaned,label,Polarity
0,2019-12-31 23:54:41+00:00,Original content rather than licensed titles doninating $NFLX's 2020 viewership (Netflix data taken with a pinch of salt as ever).,original content rather license title doninating nflx's viewership netflix data take pinch salt ever,Negative,-1
1,2019-12-31 23:43:26+00:00,$FB $AMZN $GOOGL $NFLX #FANG 2019 https://t.co/RdVlnrPiR3,amzn googl nflx fang,Neutral,0
2,2019-12-31 23:13:37+00:00,i think many retail folks early next week will look back on this end of year tape paint in markets today as a missed opportunity to sell the longs they were waiting to sell due to 2021 tax deferrance. \n\nWe'll see.\n\n$SPX $SPY $TSLA $NFLX $AAPL $AMD $NVDA $VIX $VXX $VXXB $QQQ $IBB,think many retail folk early next week look back end year tape paint market today miss opportunity sell longs wait sell due tax deferrance we'll see spx spy tsla nflx aapl amd nvda vix vxx vxxb qqq ibb,Neutral,0
3,2019-12-31 23:01:00+00:00,#Netflix continues to grow its global subscriber base but it's about to face some stiff competition from #Apple and #Disney. #stocks $NFLX $DIS $AAPL https://t.co/sIJu4fzAl0,netflix continue grow global subscriber base face stiff competition apple disney stock nflx dis aapl,Neutral,0
4,2019-12-31 22:56:49+00:00,"If only... $NFLX and chill? No, $NFLX &amp; $DPZ. Both started the decade at 7.93 and 8.53 respectively. nearly 4000% gain.",nflx chill nflx dpz start decade respectively nearly gain,Neutral,0


In [20]:
# Extracting the date and polarized values from the previous dataframe
ptweets_df = ptweets.loc[:,["date","Polarity"]]
ptweets_df.head()

Unnamed: 0,date,Polarity
0,2019-12-31 23:54:41+00:00,-1
1,2019-12-31 23:43:26+00:00,0
2,2019-12-31 23:13:37+00:00,0
3,2019-12-31 23:01:00+00:00,0
4,2019-12-31 22:56:49+00:00,0


In [21]:
# Change the date format to match with the next csv file date format
ptweets_df['date'] =pd.to_datetime(ptweets_df['date'],infer_datetime_format=True)
ptweets_df['date'] =pd.to_datetime(ptweets_df['date'].dt.strftime("%m/%d/%y"))

# Aggregate the tweets polarization by avergae, sum and counts 
Pol_df = pd.DataFrame(ptweets_df.groupby('date')['Polarity'].mean())
Pol_df.rename(columns={"Polarity":"P_mean"},inplace=True)
Pol_df['P_sum'] = ptweets_df.groupby('date')['Polarity'].sum()
Pol_df['twt_count'] = ptweets_df.groupby('date')['Polarity'].count()
Pol_df.head()

Unnamed: 0_level_0,P_mean,P_sum,twt_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,0.007519,1,133
2018-01-02,0.020833,10,480
2018-01-03,0.071217,24,337
2018-01-04,-0.018519,-4,216
2018-01-05,-0.019737,-6,304


In [23]:
# Reading the netflix finance data and preparing it to fit with the polarized values
nflx_df = pd.read_csv("NFLX.csv")
nflx_df.rename(columns={"Date":"date"},inplace=True)
nflx_df['date'] = pd.to_datetime(nflx_df['date'],infer_datetime_format=True)
nflx_df.set_index("date")
# Adding the polarization column in the netflix dataframe.
final_df = nflx_df.join(Pol_df,on='date',how="inner")
final_df.head()

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
0,2018-01-02,196.100006,201.649994,195.419998,201.070007,201.070007,10966900,0.020833,10,480
1,2018-01-03,202.050003,206.210007,201.5,205.050003,205.050003,8591400,0.071217,24,337
2,2018-01-04,206.199997,207.050003,204.0,205.630005,205.630005,6029600,-0.018519,-4,216
3,2018-01-05,207.25,210.020004,205.589996,209.990005,209.990005,7033200,-0.019737,-6,304
4,2018-01-08,210.020004,212.5,208.440002,212.050003,212.050003,5580200,-0.007663,-2,261


In [24]:
# Downloading the final CSV file that has the finance data and tweets polarizations
final_df.to_csv("FinalNflx.csv",index=False)

In [29]:
df_2018=pd.read_csv('nflx2018-2020.csv')
df_2020=pd.read_csv('nflx2020-2022.csv')
df_all=pd.concat([df_2018,df_2020])
df_all=df_all.reset_index()
df_all.drop('index',inplace=True,axis=1)
df_all

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
0,2018-01-02,196.100006,201.649994,195.419998,201.070007,201.070007,10966900,0.020833,10,480
1,2018-01-03,202.050003,206.210007,201.500000,205.050003,205.050003,8591400,0.071217,24,337
2,2018-01-04,206.199997,207.050003,204.000000,205.630005,205.630005,6029600,-0.018519,-4,216
3,2018-01-05,207.250000,210.020004,205.589996,209.990005,209.990005,7033200,-0.019737,-6,304
4,2018-01-08,210.020004,212.500000,208.440002,212.050003,212.050003,5580200,-0.007663,-2,261
...,...,...,...,...,...,...,...,...,...,...
1132,2022-07-01,176.490005,180.100006,174.270004,179.949997,179.949997,5194700,-0.062315,-21,337
1133,2022-07-05,176.279999,185.919998,172.679993,185.880005,185.880005,7334300,-0.058824,-25,425
1134,2022-07-06,185.199997,186.220001,180.820007,184.059998,184.059998,5753400,-0.014870,-8,538
1135,2022-07-07,184.270004,190.210007,183.500000,189.270004,189.270004,6334500,-0.055427,-24,433


In [30]:
df_all.to_csv('Final_nflx_data_2018-2022',index=False)

In [11]:
import pandas as pd
sss=pd.read_csv('CleanedNTweets.csv')
sss.tail()

Unnamed: 0,date,renderedContent,cleaned
262788,2018-01-01 00:20:07+00:00,Updates on stocks from specstocks at Speculati...,update stock specstocks speculatingstocks puls...
262789,2018-01-01 00:15:03+00:00,"REAL TIME TRADE ALERTS via PRIVATE $TWTR FEED,...",real time trade alert via private twtr feed pe...
262790,2018-01-01 00:13:43+00:00,$nflx New Chappelle stand-up specials on Netfl...,nflx new chappelle stand-up special netflix today
262791,2018-01-01 00:08:10+00:00,Are You Bullish Or Bearish On #Netflix? Start ...,bullish bearish netflix start trading nflx bit...
262792,2018-01-01 00:00:06+00:00,LEVERAGE VOLATILITY! EXPLORE STRATEGIES IN OUR...,leverage volatility explore strategy research ...


In [13]:
import pandas as pd
sss=pd.read_csv('NFLX.csv')
sss.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-01-02,196.100006,201.649994,195.419998,201.070007,201.070007,10966900
1,2018-01-03,202.050003,206.210007,201.5,205.050003,205.050003,8591400
2,2018-01-04,206.199997,207.050003,204.0,205.630005,205.630005,6029600
3,2018-01-05,207.25,210.020004,205.589996,209.990005,209.990005,7033200
4,2018-01-08,210.020004,212.5,208.440002,212.050003,212.050003,5580200


In [15]:
import pandas as pd
sss=pd.read_csv('Final_nflx_data_2018-2022')
sss.head()

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
0,2018-01-02,196.100006,201.649994,195.419998,201.070007,201.070007,10966900,0.020833,10,480
1,2018-01-03,202.050003,206.210007,201.5,205.050003,205.050003,8591400,0.071217,24,337
2,2018-01-04,206.199997,207.050003,204.0,205.630005,205.630005,6029600,-0.018519,-4,216
3,2018-01-05,207.25,210.020004,205.589996,209.990005,209.990005,7033200,-0.019737,-6,304
4,2018-01-08,210.020004,212.5,208.440002,212.050003,212.050003,5580200,-0.007663,-2,261
