In [1]:
import nltk
import pandas as pd
import re
import string

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [29]:
# Fetch the NLTK "stopwords" corpus; stopwords.words('english') is read from it
# when the notebook-level stop_words set is built.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kirsten\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [33]:
# Fetch the Punkt tokenizer models (presumably for the imported word_tokenize).
# NOTE(review): no visible cell calls word_tokenize — confirm this download is
# still needed.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kirsten\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
# English stop words kept as a set; words() filters tokens against it.
stop_words = set()
stop_words.update(stopwords.words("english"))

In [2]:
def words(text, stop=None) -> list:
    """
    Normalize a string into a list of lowercase content words.

    Every punctuation character (``string.punctuation``), digit, carriage
    return, tab and newline is replaced by a space; the result is split on
    the space character and the tokens are then filtered.

    Parameters
    ----------
    text : str
        Raw text to normalize.
    stop : iterable of str, optional
        Lowercase words to discard. When omitted, the notebook-level
        ``stop_words`` set is used.

    Returns
    -------
    list
        Lowercased tokens whose original length exceeds two characters and
        which are not stop words, in their original order.
    """
    # Test membership against a set. The earlier implementation rebuilt a
    # list from the set for every token, which made the filter quadratic.
    excluded = stop_words if stop is None else set(stop)

    separators = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    # Substitute a space instead of deleting so neighbouring words do not fuse.
    spaced = separators.sub(" ", text)

    # Splitting on " " yields empty strings for runs of spaces; the length
    # filter (applied before lowercasing) drops those along with short words.
    lowered = (token.lower() for token in spaced.split(" ") if len(token) > 2)
    return [token for token in lowered if token not in excluded]


In [3]:
def sentiment(content, analyzer=None):
    """
    Return the VADER compound sentiment score for a piece of text.

    The text is normalized with ``words`` and re-joined with single spaces
    before it is scored.

    Parameters
    ----------
    content : str
        Raw text to score.
    analyzer : optional
        Object exposing ``polarity_scores(text)`` that returns a mapping with
        a ``'compound'`` key. When omitted, one ``SentimentIntensityAnalyzer``
        is created on first use and reused by later calls.

    Returns
    -------
    The value stored under ``'compound'`` in the analyzer's result.
    """
    if analyzer is None:
        # Build the analyzer once and keep it on the function object; creating
        # a new one per call repeats its lexicon loading for every row scored.
        analyzer = getattr(sentiment, "_shared_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            sentiment._shared_analyzer = analyzer

    # NOTE(review): words() lowercases, strips punctuation and removes stop
    # words (with the NLTK English list this presumably includes negations
    # such as "not"). VADER documents using such cues — confirm this
    # preprocessing is intended.
    normalized = ' '.join(words(content))
    return analyzer.polarity_scores(normalized)['compound']

In [4]:
def main(filewithcontent, outputfile, data_dir="./Data/Tesla/Clean"):
    """
    Score every row of a content CSV and write the scored frame to disk.

    Reads ``{data_dir}/{filewithcontent}.csv``, drops rows whose ``content``
    is missing, adds a ``compound`` column holding ``sentiment(content)`` for
    each remaining row, and writes the result to
    ``{data_dir}/{outputfile}.csv`` without the index.

    Parameters
    ----------
    filewithcontent : str
        Input file name without the ``.csv`` extension.
    outputfile : str
        Output file name without the ``.csv`` extension.
    data_dir : str, optional
        Directory holding both files. Defaults to the location the notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        The filtered frame including the new ``compound`` column.
    """
    articles = pd.read_csv(f"{data_dir}/{filewithcontent}.csv")

    # dropna + assign builds a new frame, so the column is never written into
    # a filtered selection of the original (the pattern that triggers
    # SettingWithCopyWarning).
    scored = (
        articles
        .dropna(subset=["content"])
        .assign(compound=lambda frame: frame["content"].apply(sentiment))
    )

    scored.to_csv(f"{data_dir}/{outputfile}.csv", index=False)
    return scored

In [104]:
# Score the "1w" content file; as the cell's last expression, the returned
# frame is displayed below.
main(filewithcontent="1w_content", outputfile="1w_score")

Unnamed: 0,Title,url,content,date,compound
0,Tesla factory worker: Elon Musk doesn't have w...,https://www.cnn.com/2020/05/14/tech/tesla-work...,Afterwinning a public standoffwith California'...,2020-05-14,0.9324
1,Elon Musk is using his power to be selfish,https://www.cnn.com/2020/05/01/opinions/elon-m...,"His words were tone deaf and, for someone with...",2020-05-01,-0.2732
2,Jerome Powell is asking for help. Will anyone ...,https://www.cnn.com/2020/05/14/investing/prema...,A version of this story first appeared in CNN ...,2020-05-14,0.9048
3,Analysis: What happened to Elon Musk?,https://www.cnn.com/2020/05/13/business/elon-m...,Musk deftly played the part of a new kind of C...,2020-05-13,0.9911
4,"After years without turning a profit, Tesla is...",https://www.cnn.com/2020/05/13/business/tesla-...,It is a remarkable achievement for a company t...,2020-05-13,0.9985
...,...,...,...,...,...
94,Hot Wheels to launch a radio-controlled Cybert...,https://www.cnn.com/2020/02/21/business/tesla-...,Mattel has announced that it is releasing two ...,2020-02-21,0.4939
95,How Bernie Sanders could actually be helping t...,https://www.cnn.com/2020/02/20/investing/stock...,"Despite the Sanders surge, or perhaps because ...",2020-02-20,0.9977
96,The Dow and Nasdaq are approaching big milesto...,https://www.cnn.com/2020/02/19/investing/dow-n...,The Dow and Nasdaq are both approaching splash...,2020-02-19,0.9588
98,Solar might be Tesla's secret weapon,https://www.cnn.com/2020/02/19/investing/tesla...,"Alex Potter, an analyst with Piper Sandler, ra...",2020-02-19,0.9879


In [7]:
# Score the "3w" through "8w" content files, writing one "<n>w_score" file each.
# NOTE(review): "1w" is handled by its own cell and this range starts at 3, so
# no visible cell processes "2w" — confirm that is intentional.
for prefix in (f"{n}w" for n in range(3, 9)):
    main(f"{prefix}_content", f"{prefix}_score")