# Algo-trading: Sentiment Analysis Indicator
> Author: **Felipe Dominguez**
>
> Date: **06/17/2021**
>
> **Team 2:**
>Nicola Bini
<br>
>Felipe Dominguez
<br>
>Tri Dung Dinh
<br>
>Manuel Echazarra
<br>
## Summary
This notebook scrapes Twitter for each stock selected for this analysis. We scrape approximately 30k+ tweets per stock and then use the "TextBlob" package to assign a positive, neutral, or negative sentiment to each tweet. Finally, we normalize the sentiment counts per day.

In [2]:
import pandas as pd
import string
import nltk
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from textblob import TextBlob
import snscrape.modules.twitter as sntwitter

# Accumulator for the scraped rows: one [date, id, content, username] list per tweet.
tweets_list2 = []

# Search terms for the stock being analysed.
# NOTE(review): 'Oncugen' may be a misspelling of the company name -- confirm before re-running.
words_full = '$OCGN OR Oncugen'
search_words = words_full

# Date window, appended to the query through the since:/until: operators below.
date_since = "2010-01-01"
date_until = "2021-06-08"

# Full query string: the search terms followed by the date window specified above.
search = f"{search_words} since:{date_since} until:{date_until}"

# Walk the scraper's results in order and keep each tweet until the running
# index "i" goes past the cap, at which point the loop stops.
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(search).get_items()):
    if i <= 50000:
        tweets_list2.append([tweet.date, tweet.id, tweet.content, tweet.username])
    else:
        break

# Tabulate the collected rows, one tweet per row.
tweets_df2 = pd.DataFrame(
    data=tweets_list2,
    columns=['Datetime', 'Tweet Id', 'Text', 'Username'],
)

def groupbyday(tweets):
    """Return each sentiment's share of the tweets posted per calendar day.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must provide the columns ``date`` (anything ``pd.to_datetime`` accepts),
        ``sentiment`` and ``polarity``. The frame is left unmodified.

    Returns
    -------
    pandas.Series
        Named ``polarity_norm`` and indexed by ``(date2, sentiment)``, where
        ``date2`` is ``date`` floored to the day. Each value is the number of
        tweets carrying that sentiment on that day divided by the number of
        tweets with a non-null ``polarity`` on that day.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a narrow copy so the caller's frame does not gain helper columns.
    work = tweets[['sentiment', 'polarity']].copy()
    work['date2'] = pd.to_datetime(tweets['date']).dt.floor('D')

    # Denominator: tweets per day with a polarity score, broadcast to each row.
    tweets_per_day = work.groupby('date2')['polarity'].transform('count')

    # Every tweet contributes 1 / (tweets that day); summing those weights per
    # (day, sentiment) gives that sentiment's share of the day.
    work['polarity_norm'] = 1 / tweets_per_day
    return work.groupby(['date2', 'sentiment'])['polarity_norm'].sum()

# Patterns deleted from the raw text before it is scored, applied in this order.
# NOTE(review): the second pattern looks like a mis-decoded ellipsis character;
# it is kept exactly as found -- confirm what it was meant to match.
_TWEET_NOISE_PATTERNS = (r':', r'‚Ä¶', r'@', r'\n')

# Columns produced per tweet, before the 'date' column is attached.
_SCORED_TWEET_COLUMNS = ['tweet', 'words', 'sentiment_rating',
                         'subjectivity', 'polarity', 'sentiment']


def _strip_tweet_noise(text):
    """Delete the noise patterns from *text*, then turn each non-ASCII run into one space."""
    for pattern in _TWEET_NOISE_PATTERNS:
        text = re.sub(pattern, '', text)
    return re.sub(r'[^\x00-\x7F]+', ' ', text)


def _label_polarity(polarity):
    """Return 'positive' for polarity >= 0.1, 'negative' for polarity < -0.1, else 'neutral'."""
    if polarity >= 0.1:
        return "positive"
    if polarity < -0.1:
        return "negative"
    return "neutral"


def clean_tweets(tweets):
    """Tokenise and sentiment-score every tweet.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must provide the columns ``Text`` (tweet body) and ``Datetime``.

    Returns
    -------
    pandas.DataFrame
        One row per input tweet, in input order, indexed 0..n-1, with columns:
        ``tweet`` (lower-cased text), ``words`` (tokens of the original text
        minus stop words and punctuation), ``sentiment_rating`` (the TextBlob
        sentiment object), ``subjectivity``, ``polarity``, ``sentiment``
        ('positive' / 'neutral' / 'negative') and ``date`` (copied from
        ``Datetime``). An empty input yields an empty frame with these columns.
    """
    all_columns = _SCORED_TWEET_COLUMNS + ['date']
    if tweets.empty:
        # Nothing to score; still return the full schema so later steps find
        # the columns they expect.
        return pd.DataFrame(columns=all_columns)

    stop_words = set(stopwords.words('english'))
    rows = []
    for tweet in tweets['Text']:
        # Stop words are matched case-insensitively so that capitalised ones
        # ("The", "And") are dropped too; kept tokens retain their casing.
        words = [
            w for w in word_tokenize(tweet)
            if w.lower() not in stop_words and w not in string.punctuation
        ]

        # Only clear sentiments are labelled positive/negative; weak scores
        # fall into "neutral" (see _label_polarity).
        rating = TextBlob(_strip_tweet_noise(tweet)).sentiment
        rows.append({'tweet': tweet.lower(),
                     'words': words,
                     'sentiment_rating': rating,
                     'subjectivity': rating.subjectivity,
                     'polarity': rating.polarity,
                     'sentiment': _label_polarity(rating.polarity)})

    # Build the frame once instead of appending row by row.
    df_tweets = pd.DataFrame(rows, columns=_SCORED_TWEET_COLUMNS)
    # Attach dates by position, so the input's index labels cannot misalign them.
    df_tweets['date'] = tweets['Datetime'].reset_index(drop=True)
    return df_tweets



# Score every scraped tweet and save the per-tweet results.
tweets = clean_tweets(tweets_df2)

# The path uses a forward slash: the previous '.\s...' literal relied on an
# invalid escape sequence and only denoted the current directory on Windows.
tweets.to_csv(f'./sentimentanalysis{words_full}.csv', encoding='utf-8-sig')

# Reduce to the per-day sentiment shares and save those as well.
tweets = groupbyday(tweets)

tweets.to_csv(f'./sentimentanalysis_count{words_full}.csv', encoding='utf-8-sig')


In [3]:
tweets

date2                      sentiment
2021-06-07 00:00:00+00:00  neutral      0.545455
                           positive     0.454545
Name: polarity_norm, dtype: float64