In [None]:
# Import required modules
import pandas as pd
import re                                           # regular expressions
import nltk
import nltk.corpus as corpus                        # nltk package for reading files
nltk.download('stopwords')
from nltk.corpus import stopwords as stopwords      # words such as 'the','be','an', that don't significantly impact the sentiment
from nltk.stem import WordNetLemmatizer             # Groups words
from nltk.tokenize import word_tokenize             # Splits text into tokens
import collections

# Will be used later on
import matplotlib as mpl
import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set()
# import sklearn
# import numpy as np

In [None]:
# Filter out all unlabelled sentiment
p_df = pd.read_csv('/home/mia/Documents/College/CS4811/TwitterNLP/tweets_labelled.csv', sep=';',names=['created_at','text','sentiment']).query('sentiment == "positive"')
n_df = pd.read_csv('/home/mia/Documents/College/CS4811/TwitterNLP/tweets_labelled.csv', sep=';',names=['created_at','text','sentiment']).query('sentiment == "negative"')

# Debug Print
# display(p_df)
# display(n_df)

# Combine dataframes
frames = [p_df, n_df]
df = pd.concat(frames)

# Sort by date
# https://www.geeksforgeeks.org/how-to-sort-a-pandas-dataframe-by-date/
df['created_at'] = pd.to_datetime(df['created_at'])
df = df.sort_values(by="created_at",ascending=True)
# Debug Print
display(df)


In [84]:
# Grab the top 25 stocks
ticker_pattern = re.compile(r'(^\$[A-Z]+|^\$ES_F)')

# Break the string into a dictionary
ticker_freq_dict = collections.defaultdict(int)
# ticker_dict = collections.defaultdict(str)

# Count the frequencies of the mentions of each stock
for text in df['text']:
    for word in text.split():
        if ticker_pattern.fullmatch(word) is not None:
            if "$" + word not in ticker_freq_dict:
                ticker_freq_dict["$" + word[1:]] += 1
    


# Only store the top 25 mentioned stocks
ticker_freq_df = pd.DataFrame.from_dict(ticker_freq_dict, orient='index', columns=['freq']).nlargest(25, 'freq')
display(ticker_freq_df.sort_values('freq', ascending=False))


Unnamed: 0,freq
$SPX,132
$AMZN,80
$AAPL,79
$SPY,69
$FB,59
$TSLA,40
$MSFT,32
$QQQ,29
$NFLX,27
$JPM,22


### **`ticker_dict`** intended structure
There will be a key for each of the 25 most frequently mentioned stocks. Each Stock will have a list of all the sentiments about tweets relating to the stock, and the associated timestamp the tweet was made.
```
ticker_dict = {
    "$SPX": [
        [
            "Timestamp",
            "Sentiment" (positive or negative)
        ]
    ],
    ,
    "$GOOGL": [
        [
            "Timestamp",
            "Sentiment" (positive or negative)
        ]
    ]
}
```

In [85]:
# Create a dictionary using the above structure
ticker_dict = collections.defaultdict(str)

# Iterate over all the tweets
for text in df['text']:
    # Determine which stocks the tweet is associated with, and store with appropriate stock
    for ticker in ticker_freq_df.index:
        if text.find(ticker) > 0:
            # Put the store the sentiment and rating in a tuple to be stored in a list
            row_df = df.query('text == @text')
            timestamp = str(row_df.iat[0,0])
            sentiment = row_df.iat[0,2]
            entry = [timestamp, sentiment]
            # print(entry)

            if ticker not in ticker_dict:
                ticker_dict[ticker] = [entry]
                continue
            else:
                ticker_dict[ticker].append(entry)
                continue

In [86]:
# Print out the dictionary to json file for debugging purposes
import json

with open('ticker_dict.json','w') as fout:
    json.dump(ticker_dict, fout, indent=4)

# # Confirm the correct amount of tweets have been written to the file
# for ticker in ticker_freq_df.index:
#     if len(ticker_dict[ticker]) != ticker_freq_dict[ticker]:
#         print("ERROR: " + ticker + " expected: " + str(ticker_freq_dict[ticker]) + ", Actual: " + str(len(ticker_dict[ticker])))

# with open('entry.json', 'w') as fout:
#     json.dump(ticker_dict['$JNJ'], fout, indent=4)

In [87]:
# Convert to a ticker_dict toDataframe
sentiment_df = pd.DataFrame.from_dict(ticker_dict, orient='index')
display(sentiment_df.head(5))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,109,110,111,112,113,114,115,116,117,118
$AMZN,"[2020-04-09 00:00:03+00:00, positive]","[2020-04-13 17:30:05+00:00, positive]","[2020-04-13 18:12:46+00:00, negative]","[2020-04-13 19:54:14+00:00, positive]","[2020-04-14 01:34:20+00:00, positive]","[2020-04-14 11:22:42+00:00, positive]","[2020-04-14 13:15:40+00:00, positive]","[2020-04-14 17:07:34+00:00, negative]","[2020-04-15 13:49:55+00:00, positive]","[2020-04-16 20:45:19+00:00, positive]",...,,,,,,,,,,
$FB,"[2020-04-09 00:00:03+00:00, positive]","[2020-04-13 21:00:53+00:00, positive]","[2020-04-14 11:22:42+00:00, positive]","[2020-04-14 13:15:40+00:00, positive]","[2020-04-24 06:59:03+00:00, positive]","[2020-04-28 17:14:33+00:00, negative]","[2020-04-29 20:19:11+00:00, positive]","[2020-04-29 20:25:10+00:00, positive]","[2020-05-28 04:48:49+00:00, negative]","[2020-05-28 14:15:43+00:00, positive]",...,,,,,,,,,,
$TSLA,"[2020-04-09 00:00:03+00:00, positive]","[2020-04-13 21:00:53+00:00, positive]","[2020-04-27 23:35:41+00:00, positive]","[2020-05-08 14:50:13+00:00, positive]","[2020-05-29 03:22:36+00:00, negative]","[2020-05-30 19:23:15+00:00, negative]","[2020-06-01 18:37:04+00:00, positive]","[2020-06-04 18:21:58+00:00, positive]","[2020-06-05 09:34:45+00:00, positive]","[2020-06-09 07:44:58+00:00, negative]",...,,,,,,,,,,
$ZM,"[2020-04-09 00:00:03+00:00, positive]","[2020-04-17 19:07:37+00:00, positive]","[2020-04-23 09:30:37+00:00, positive]","[2020-05-05 16:16:42+00:00, negative]","[2020-05-30 19:23:15+00:00, negative]","[2020-06-02 15:33:15+00:00, positive]","[2020-06-05 09:34:45+00:00, positive]","[2020-06-23 15:33:22+00:00, positive]","[2020-07-06 02:31:16+00:00, positive]","[2020-07-11 06:06:23+00:00, positive]",...,,,,,,,,,,
$T,"[2020-04-09 00:00:03+00:00, positive]","[2020-04-09 20:17:34+00:00, positive]","[2020-04-10 15:56:50+00:00, negative]","[2020-04-13 20:18:54+00:00, negative]","[2020-04-13 21:00:53+00:00, positive]","[2020-04-17 13:50:02+00:00, positive]","[2020-04-17 14:54:41+00:00, negative]","[2020-04-17 15:01:31+00:00, negative]","[2020-04-18 15:00:57+00:00, positive]","[2020-04-21 12:39:06+00:00, positive]",...,,,,,,,,,,


### **`sentiment_dict`** intended structure
There will be a key for each of the 25 most frequently mentioned stocks. Each Stock have a list of days an the associated percentage of positive tweets.
```
ticker_dict = {
    "$SPX": [
        "Day": weight,
        "Day": weight,
        ...
    ],
}
```

In [None]:
# Return a weight from 0-1 based on the percentage of positive tweets
stock_weight_dict = dict()
round_time_pattern = " " + ".*"
current_time = ""
total = 0
total_positive = 0

created = []

for stock in sentiment_df.index:
    tweets = sentiment_df.loc[stock]
       
    for tweet in tweets:
        if tweet is not None:
            # print(tweet)
            # round tweet to the day
            # print(re.sub(round_time_pattern,'', tweet[0]))
                        
            current_time = re.sub(round_time_pattern, '', tweet[0])
            current_time = re.sub(r'-','/', current_time)
            
            if current_time not in stock_weight_dict:
                stock_weight_dict[current_time] = []
            
            # If it is a new date, reset the counter and calculate the percentage
            if current_time not in created:
                # If the total is 0, it is the first round
                if total != 0:
                    # print(current_time)
                    # print(total)
                    # print(total_positive)
                    weight = round(total_positive/total, 2)
                    if weight == .5:
                        weight = 0
                    elif weight < .5:
                        weight = -1
                    elif weight > .5:
                        weight = 1
                    stock_weight_dict[current_time].append({stock: weight})
                    total = 0
                    total_positive = 0
                else:
                    total += 1
                    if tweet[1] == "positive":
                        total_positive += 1
                created.append(current_time)
            else:
                total += 1
                if tweet[1] == "positive":
                    total_positive += 1
                
            # print(created)
    # break
    created.clear()

In [None]:
# Debug
import pprint
pp = pprint.PrettyPrinter(indent=4)           
pp.pprint(stock_weight_dict)

with open('sentiment.json', 'w') as fout:
    json.dump(stock_weight_dict, fout, indent=4)

In [88]:
def stock_to_csv(stock):
    # create a dataframe of just the stock and its date
    counter = 0
    single_stock_df = pd.DataFrame(columns=['Date', stock])
    for date in stock_weight_dict:
        for entries in stock_weight_dict[date]:
            if stock in entries:
                single_stock_df.loc[counter] = [date, entries[stock]]
                counter = counter + 1
                break
    stock = re.sub(r'\$','',stock)
    # print(stock)
    single_stock_df.to_csv("/home/mia/Documents/College/CS4811/TwitterNLP/twt_data/"+stock+".csv",index=False)
    return
    
for stock in ticker_freq_df.index:
    stock_to_csv(stock)

for ticker in ticker_freq_df.index:
    ticker_freq_df.rename(index={ticker: re.sub(r"\$","",ticker)}, inplace=True)
ticker_freq_df.to_csv("/home/mia/Documents/College/CS4811/TwitterNLP/twt_data/stocks_list.csv")