In [1]:
# Now that we've got our easy data, let's move to the two important factors: Sentiment Score and Stock History.
# For Stock History, we're going to grab the close value of every stock for the past two years.
#     I know I originally said the past six weeks, but this seems silly.  We have *centuries* of stock data.
#     Why not use more of it?  However, the Yahoo API has no closing value for days when the market is
#     closed, such as weekends or holidays.  We'll fill each such gap with the average of the closes
#     immediately before and after it.
# For sentiment score, it's time to calculate it.
#     See the 'Final Project - Sentiment Score' notebook for the training of the model.

# That notebook saved the trained model to disk, so we can load it here and use it again.  Let's test it.
import torch
import ml_collections
from transformers import RobertaForSequenceClassification, AutoTokenizer
import emoji
from wordcloud import WordCloud, STOPWORDS
import re,string, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from accelerate import Accelerator

def model_config():
    """Return the frozen (read-only) configuration used by this notebook.

    Returns:
        ml_collections.FrozenConfigDict with the keys ``data_path``,
        ``model_path``, ``model_type`` and ``model_checkpoint``.
    """
    return ml_collections.FrozenConfigDict(
        dict(
            data_path="sentiments.csv",
            model_path="models/bert_model.h5",
            model_type="transformer",
            model_checkpoint="roberta-base",
        )
    )
cfg = model_config()

# Load the classifier weights from cfg.model_path and the tokenizer of the base checkpoint.
model = RobertaForSequenceClassification.from_pretrained(cfg.model_path)
tokenizer = AutoTokenizer.from_pretrained(cfg.model_checkpoint, use_fast=True)

# Let Accelerate pick the device and move the model there, instead of hard-coding
# 'cuda' (which raises on machines without a GPU).
accelerator = Accelerator()
model = accelerator.prepare(model)

def score_sentiments(sentiments):
    """Sum a per-text sentiment vote over ``sentiments``.

    Each text is tokenized with the module-level ``tokenizer`` and classified by
    the module-level ``model``. The predicted class id (0 - negative,
    1 - neutral, 2 - positive) is shifted to -1 / 0 / +1 and accumulated.

    Args:
        sentiments: iterable of strings, classified one at a time.

    Returns:
        int: number of positive predictions minus number of negative ones
        (0 when ``sentiments`` is empty).
    """
    # Run on whatever device the model lives on rather than assuming 'cuda'.
    device = next(model.parameters()).device
    score = 0
    for sentiment in sentiments:
        # truncation=True keeps over-long texts within the model's maximum input length.
        encoded = tokenizer(sentiment, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            logits = model(**encoded).logits

        predicted_class_id = logits.argmax(dim=-1).item()
        # Since class id is 0 - negative, 1 - neutral, 2 - positive, subtracting 1 gives the vote.
        score += predicted_class_id - 1
    return score

In [2]:
# Text-processing helpers: an English stemmer, a WordNet lemmatizer, and extra
# stop words merged into the STOPWORDS set imported from wordcloud.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
STOPWORDS.update(
    (
        'rt', 'mkr', 'didn', 'bc', 'n', 'm', 'im', 'll', 'y', 've',
        'u', 'ur', 'don', 'p', 't', 's', 'aren', 'kp', 'o', 'kat',
        'de', 're', 'amp', 'will',
    )
)
    
def preprocess_text(text):
    """Expand apostrophe contractions in ``text`` and return the result.

    The substitutions run in the listed order, so the specific forms
    ("won't", "can't") are rewritten before the generic "n't" rule.
    """
    expansions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in expansions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_text(field):
    """Normalize a raw sentiment string before it is split and scored.

    Steps, in order: expand contractions (``preprocess_text``), strip URLs,
    spell out "@" as "at", strip hashtags, blank out square brackets, turn
    double quotes into single quotes, and lower-case the result.

    The original passed regex patterns to ``str.replace``, which matches
    literally, so URL and hashtag removal never happened; those steps now use
    ``re.sub``.

    Args:
        field: the raw text (str).

    Returns:
        str: the cleaned, lower-cased text.
    """
    field = preprocess_text(field)
    # Stop the URL match at whitespace or a quote so the "'," separators that the
    # caller later splits on are never swallowed by a URL ending an entry.
    field = re.sub(r"http[^\s'\"]*", " ", field)
    field = field.replace("@", "at")
    field = re.sub(r"#[A-Za-z0-9_]+", " ", field)
    # NOTE(review): the original also had a character whitelist
    # ([^A-Za-z()!?@'"_\n] -> " ") that never took effect for the same reason.
    # It is intentionally not activated here: it would erase the commas (and
    # digits/periods) that the downstream split on "'," relies on.
    field = field.replace("[", " ").replace("]", " ")
    field = field.replace("\"", "'")
    return field.lower()

In [3]:
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import requests
from tqdm.auto import tqdm

# Date window for the price history: from 110 weeks and one day ago up to now.
end_date = datetime.now()
start_date = end_date - relativedelta(weeks=110) - relativedelta(days=1)
end_date_formatted = end_date.strftime("%Y-%m-%d")
start_date_formatted = start_date.strftime("%Y-%m-%d")

df = pd.read_csv("Largest automakers by market capitalization.csv")
symbols = df['Symbol'].values
# One entry is recorded per calendar day from start_date through end_date inclusive.
days_per_symbol = (end_date - start_date).days + 1

print(f"Gathering stock information from {len(symbols)} symbols.")
# Size the bar from the real iteration count instead of a hard-coded 47 * 730.
progress_bar = tqdm(total=len(symbols) * days_per_symbol)

# stock_history[0] is a header row; every later row is [symbol, [close or 'ERR' per day]].
stock_history = [['Symbol',f'History starting at {start_date_formatted}']]
for symbol in symbols:
    print(symbol)
    data = yf.download(symbol, start=start_date_formatted, end=end_date_formatted)
    closing_values = []

    target_date = start_date
    while target_date <= end_date:
        target_date_formatted = target_date.strftime("%Y-%m-%d")
        try:
            target_row = data.loc[target_date_formatted]
        except KeyError:
            # No row for this day in the download; mark it so the gap can be filled later.
            closing_values.append('ERR')
        else:
            closing_values.append(target_row['Close'])
        progress_bar.update(1)
        target_date = target_date + relativedelta(days=1)
    stock_history.append([symbol,closing_values])
stock_history

Gathering stock information from 47 symbols.


  0%|          | 0/34310 [00:00<?, ?it/s]

TSLA
[*********************100%***********************]  1 of 1 completed
TM
[*********************100%***********************]  1 of 1 completed
VOW3.DE
[*********************100%***********************]  1 of 1 completed
002594.SZ
[*********************100%***********************]  1 of 1 completed
DAI.DE
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- DAI.DE: No data found, symbol may be delisted
GM
[*********************100%***********************]  1 of 1 completed
F
[*********************100%***********************]  1 of 1 completed
BMW.DE
[*********************100%***********************]  1 of 1 completed
STLA
[*********************100%***********************]  1 of 1 completed
HMC
[*********************100%***********************]  1 of 1 completed
RIVN
[*********************100%***********************]  1 of 1 completed
601633.SS
[*********************100%***********************]  1 of 1 completed
LCID
[*********************100%****

[['Symbol', 'History starting at 2020-03-15'],
 ['TSLA',
  ['ERR',
   89.01399993896484,
   86.04000091552734,
   72.24400329589844,
   85.52799987792969,
   85.50599670410156,
   'ERR',
   'ERR',
   86.85800170898438,
   101.0,
   107.8499984741211,
   105.63200378417969,
   102.87200164794922,
   'ERR',
   'ERR',
   100.4260025024414,
   104.80000305175781,
   96.31199645996094,
   90.89399719238281,
   96.00199890136719,
   'ERR',
   'ERR',
   103.24800109863281,
   109.08999633789062,
   109.76799774169922,
   114.5999984741211,
   'ERR',
   'ERR',
   'ERR',
   130.19000244140625,
   141.97799682617188,
   145.96600341796875,
   149.04200744628906,
   150.7779998779297,
   'ERR',
   'ERR',
   149.27200317382812,
   137.343994140625,
   146.4219970703125,
   141.12600708007812,
   145.02999877929688,
   'ERR',
   'ERR',
   159.75,
   153.82400512695312,
   160.1020050048828,
   156.37600708007812,
   140.26400756835938,
   'ERR',
   'ERR',
   152.23800659179688,
   153.6419982910156

In [4]:
# Now that we have the stock history, we need to average out errors and record symbols that are too erroneous.
MAX_GAP_DAYS = 30  # a run of this many consecutive 'ERR' days makes a symbol unusable


def _fill_gaps(symbol, history, max_gap=MAX_GAP_DAYS):
    """Fill runs of 'ERR' in ``history`` in place and report whether to drop the symbol.

    Every entry of a gap is overwritten with the average of the last value before
    the gap and the first value after it. A gap at the very start of the list has
    no earlier value, so it is filled with the first value after it. 'ERR' entries
    at the end of the list are left untouched (there is no later value to use).

    Args:
        symbol: ticker, used only in the diagnostic message.
        history: list of closes and 'ERR' markers; modified in place.
        max_gap: gap length at which the symbol is considered unusable.

    Returns:
        bool: True when a gap reached ``max_gap`` entries, otherwise False.
    """
    gap = 0  # length of the run of 'ERR' entries currently being tracked
    for record, value in enumerate(history):
        if gap >= max_gap:
            return True
        if value == 'ERR':
            gap += 1
            continue
        if gap == 0:
            continue

        before_gap = record - (gap + 1)
        try:
            newest_val = float(value)
            last_known = float(history[before_gap]) if before_gap >= 0 else newest_val
        except (TypeError, ValueError):
            print(f'Error on symbol {symbol} for record {before_gap}: should not be ERR but is')
            return False
        average = (last_known + newest_val) / 2.0
        history[record - gap:record] = [average] * gap
        gap = 0
    # Also catch a gap that reaches the limit exactly on the last entry.
    return gap >= max_gap


symbols_to_delist = []
for stock in stock_history[1:]:
    print(f'Now processing {stock[0]}.')
    if _fill_gaps(stock[0], stock[1]):
        symbols_to_delist.append(stock[0])
if "VOW3.DE" not in symbols_to_delist:
    symbols_to_delist.append("VOW3.DE") #Unfortunately, I'm appending this manually because we only have nine samples.
symbols_to_delist

Now processing TSLA.
Now processing TM.
Now processing VOW3.DE.
Now processing 002594.SZ.
Now processing DAI.DE.
Now processing GM.
Now processing F.
Now processing BMW.DE.
Now processing STLA.
Now processing HMC.
Now processing RIVN.
Now processing 601633.SS.
Now processing LCID.
Now processing RACE.
Now processing NIO.
Now processing HYMTF.
Now processing 600104.SS.
Now processing MARUTI.NS.
Now processing XPEV.
Now processing LI.
Now processing 000270.KS.
Now processing TTM.
Now processing VOLCAR-B.ST.
Now processing 0175.HK.
Now processing 7201.T.
Now processing 7269.T.
Now processing M&M.NS.
Now processing 000625.SZ.
Now processing 7270.T.
Now processing 2207.TW.
Now processing RNSDF.
Now processing 7202.T.
Now processing 0489.HK.
Now processing PII.
Now processing 000800.SZ.
Now processing 7261.T.
Now processing FSR.
Now processing NKLA.
Now processing ARVL.
Now processing A5SA.F.
Now processing GOEV.
Now processing REE.
Now processing RIDE.
Now processing SEV.
Now processing CEN

['DAI.DE',
 'RIVN',
 'LCID',
 'XPEV',
 'LI',
 'VOLCAR-B.ST',
 'REE',
 'SEV',
 'VOW3.DE']

In [5]:
# Display the history again: the previous cell overwrote the filled 'ERR' gaps in this same list in place.
stock_history

[['Symbol', 'History starting at 2020-03-15'],
 ['TSLA',
  [89.01399993896484,
   89.01399993896484,
   86.04000091552734,
   72.24400329589844,
   85.52799987792969,
   85.50599670410156,
   86.18199920654297,
   86.18199920654297,
   86.85800170898438,
   101.0,
   107.8499984741211,
   105.63200378417969,
   102.87200164794922,
   101.64900207519531,
   101.64900207519531,
   100.4260025024414,
   104.80000305175781,
   96.31199645996094,
   90.89399719238281,
   96.00199890136719,
   99.625,
   99.625,
   103.24800109863281,
   109.08999633789062,
   109.76799774169922,
   114.5999984741211,
   122.39500045776367,
   122.39500045776367,
   122.39500045776367,
   130.19000244140625,
   141.97799682617188,
   145.96600341796875,
   149.04200744628906,
   150.7779998779297,
   150.0250015258789,
   150.0250015258789,
   149.27200317382812,
   137.343994140625,
   146.4219970703125,
   141.12600708007812,
   145.02999877929688,
   152.38999938964844,
   152.38999938964844,
   159.75,
 

In [6]:
# Okay, so we've got about ~25 sentiment CSVs for a number of companies.  Now we need to combine all of them together,
# recording the country, name, symbol, date collected, and sentiments for each company.  For every usable row we also
# attach the sentiment score, a slice of the gap-filled stock history, and the close used as the "future" value.
import glob
import csv

alldata = [['Country','Name','Symbol','Date Collected','Sentiments','Sentiment Score', 'Stock History', 'Stock Future']]

filedir='FinalProjectData_*.csv'
# Glob once and sort so files are processed in a deterministic order.
data_files = sorted(glob.glob(filedir))
print(f'{len(data_files)} files to process.')

# Index the histories by symbol. The original linear search left `my_history`
# pointing at the previous row's data whenever a symbol was not found.
history_by_symbol = {}
for stock in stock_history[1:]:
    history_by_symbol.setdefault(stock[0], stock[1])

# Assumes one CSV row per tracked symbol -- TODO confirm; this only affects the bar's total.
progress_bar = tqdm(total=len(data_files) * len(history_by_symbol))

date_format = '%Y-%m-%d'
for file in data_files:
    print(f'Processing {file}...')
    # The collection date is the part of the file name between '_' and the first '.'.
    date_collected = file.split("_")[1].split('.')[0]
    date_collected_datetime = datetime.strptime(date_collected, date_format)
    # Measure offsets against the same start_date/end_date the histories were built
    # with, rather than a fresh datetime.now() per row, so indices cannot drift
    # if this cell runs later than the download cell.
    days_since_collection = (end_date - date_collected_datetime).days
    two_years_ago = date_collected_datetime - relativedelta(weeks=104)
    first_index = (two_years_ago - start_date).days

    with open(file, newline='',encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader) #Skip the first line (header information)
        for row in reader:
            progress_bar.update(1)
            country = row[0]
            name = row[1]
            symbol = row[2]
            sentiments = row[3]
            #Now, some of this data is unusable - not every stock gave us good sentiment data, or any at all.
            #We need to ignore these empty rows.
            if sentiments == 'No data!' or symbol in symbols_to_delist:
                continue
            my_history = history_by_symbol.get(symbol)
            if my_history is None:
                print(f"No stock history for {symbol}; skipping row.")
                continue

            #Calculate the sentiment score
            sentiments = clean_text(sentiments)
            sentiments_array = [entry.replace('\'','') for entry in sentiments.split('\',')]
            score = score_sentiments(sentiments_array)

            #Grab the stock history window and the future value
            last_index = len(my_history) - days_since_collection
            history = my_history[first_index:last_index]
            # NOTE(review): history ends at index last_index - 1 and the future value is
            # read from last_index + 1, so the entry at last_index is skipped. Kept as in
            # the original -- confirm this one-slot gap is intended.
            future_index = last_index + 1
            if future_index < len(my_history):
                future = my_history[future_index]
                if future == "ERR":
                    print(f"Stock future error on {symbol} for record {future_index}.")
                    future = "UNKNOWN"
            else:
                future = "UNKNOWN"
            alldata.append([country, name, symbol, date_collected, sentiments, score, history, future])
print('Done!')

24 files to process.


  0%|          | 0/1128 [00:00<?, ?it/s]

Processing FinalProjectData_2022-03-22.csv...
Processing FinalProjectData_2022-03-23.csv...
Processing FinalProjectData_2022-03-24.csv...
Processing FinalProjectData_2022-03-25.csv...
Processing FinalProjectData_2022-03-26.csv...
Processing FinalProjectData_2022-03-29.csv...
Processing FinalProjectData_2022-03-30.csv...
Processing FinalProjectData_2022-03-31.csv...
Processing FinalProjectData_2022-04-01.csv...
Processing FinalProjectData_2022-04-02.csv...
Processing FinalProjectData_2022-04-05.csv...
Processing FinalProjectData_2022-04-06.csv...
Processing FinalProjectData_2022-04-07.csv...
Processing FinalProjectData_2022-04-08.csv...
Processing FinalProjectData_2022-04-09.csv...
Processing FinalProjectData_2022-04-12.csv...
Processing FinalProjectData_2022-04-14.csv...
Processing FinalProjectData_2022-04-15.csv...
Processing FinalProjectData_2022-04-16.csv...
Processing FinalProjectData_2022-04-19.csv...
Processing FinalProjectData_2022-04-20.csv...
Processing FinalProjectData_2022-0

In [7]:
#Okay, we've got our dataset!  Let's move to pandas now that we're done iterating and save it locally.
import pandas as pd
# Row 0 of alldata holds the column names; every later row is one record.
df = pd.DataFrame(data=alldata[1:], columns=alldata[0])
# Save without the index column.
df.to_csv("FinalProjectDataALL.csv", index=False)
df

Unnamed: 0,Country,Name,Symbol,Date Collected,Sentiments,Sentiment Score,Stock History,Stock Future
0,United States,Tesla,TSLA,2022-03-22,'tesla opens first european gigafactory in be...,5,"[86.85800170898438, 101.0, 107.8499984741211, ...",1013.919983
1,Japan,Toyota,TM,2022-03-22,"'toyota pauses production. covid-19, not chip...",-2,"[111.0999984741211, 116.3499984741211, 121.290...",180.270004
2,United States,General Motors,GM,2022-03-22,'super luxury brands like lamborghini and ben...,6,"[17.600000381469727, 21.110000610351562, 21.48...",44.349998
3,United States,Ford,F,2022-03-22,'super luxury brands like lamborghini and ben...,-3,"[4.010000228881836, 4.949999809265137, 5.38999...",16.83
4,Netherlands,Stellantis,STLA,2022-03-22,"'maserati unveils new suv grecale, full elect...",2,"[6.349999904632568, 7.010000228881836, 7.42999...",15.98
...,...,...,...,...,...,...,...,...
475,United States,Canoo,GOEV,2022-04-23,'is canoo inc. is (nasdaq:goev) shareholder o...,1,"[10.140000343322754, 10.144999980926514, 10.14...",UNKNOWN
476,United States,Lordstown Motors,RIDE,2022-04-23,'analyst on tesla robotaxis: 'i will believe ...,6,"[10.039999961853027, 10.005000114440918, 10.00...",UNKNOWN
477,United States,Cenntro Electric Group,CENN,2022-04-23,'cenntro electric group to host 2021 year end...,9,"[7.2779998779296875, 9.488999843597412, 9.4889...",UNKNOWN
478,Canada,Electra Meccanica,SOLO,2022-04-23,'electrameccanica appoints automotive veteran...,7,"[1.0099999904632568, 1.0149999856948853, 1.014...",UNKNOWN
