In [1]:
import pysentiment as ps
import nltk
from spacy.lang.en import English
# importing list of stop words
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
import time

nlp = English()

In [2]:
# gets rid of stop words and returns a string
def stop_words(text):
    doc = nlp(text)
    filtered_sent = []
    # filtering stop words
    for word in doc:
        if word.is_stop==False:
            filtered_sent.append(str(word))
    filtered_sent = ' '.join(filtered_sent)
    return filtered_sent

In [3]:
# Sentiment analyzer
lm = ps.LM()


<h1> Working with JSON files

In [4]:
#Imports
import pandas as pd
import numpy as np
import glob
import re
import pickle
from datetime import datetime


# Load all company transcripts
all_transcripts = glob.glob("data/company_transcripts/*")
df = [pd.read_json(f) for f in all_transcripts]

# Pre-processing functions


def get_quarter(title):
    quarter_reg_expr = "([A-Za-z]+[\d@]+[\w@]*|[\d@]+[A-Za-z]+[\w@]*)"
    words = re.findall(quarter_reg_expr, title)

    def has_numbers(inputString):
        return any(char.isdigit() for char in inputString)

    for word in words:
        if "q" in word.lower() and has_numbers(word):
            return word
            # Handle Q1 vs 1Q case
            if len(word) == 2:
                if word[0] in "1234":
                    return word[1] + word[0]
                else:
                    return word
            # Handle case with F
            elif word[0] == "F":
                return word[2] + word[1]
    return None


def get_ticker(row, _id):
    title = row['title'][_id]
    body = ' '.join(row['body'][_id])

    try:
        # Try to get ticker from title
        open_paren = title.index("(")
        close_paren = title.index(")")
        return title[open_paren+1:close_paren]

    except:
        # find ticker in body
        colon = body.index(":")
        close_paren = body.index(")")
        return body[colon+1:close_paren]


# Clean
rows_list = []

for row in df:
    ids = list(row['title'].keys())

    for _id in ids:
        title = row['title'][_id]
        full_date = row['date'][_id]

        ticker = get_ticker(row, _id)
        quarter = get_quarter(title)
        date = full_date.date().strftime("%Y-%m-%d")
        time = full_date.time().strftime("%H:%M")
        body = ' '.join(row['body'][_id])

        # There are 10 titles without an explicit quarter
        if quarter is None:
            quarter = "FILL ME IN"

        new_row = [ticker, quarter, date, time, body]

        rows_list.append(new_row)

df_transcripts = pd.DataFrame(
    rows_list, columns=["ticker", "quarter", "date", "time", "body"])


with open("transcripts.p", "wb") as f:
    pickle.dump(df_transcripts, f)

In [5]:
# creating new instance of JSON dataframe
new = df_transcripts

Unnamed: 0,ticker,quarter,date,time,body
0,T,Q1,2019-04-24,08:30,AT&T Inc. (NYSE:T) Q1 2019 Earnings Conference...
1,T,Q4,2018-01-31,16:30,"AT&T, Inc. (NYSE:T) Q4 2017 Results Earnings C..."
2,T,Q3,2016-10-24,08:30,"AT&T, Inc. (NYSE:T) Q3 2016 Earnings Call Octo..."
3,T,Q2,2015-07-23,16:30,AT&T Inc. (NYSE:T) Q2 2015 Earnings Conference...
4,T,Q3,2015-10-22,16:30,"AT&T, Inc. (NYSE:T) Q3 2015 Earnings Call Octo..."
5,T,Q3,2014-10-22,16:30,AT&T Inc. (NYSE:T) Q3 2014 Results Earnings Co...
6,T,Q4,2015-01-27,16:30,AT&T Inc. (NYSE:T) Q4 2014 Earnings Conference...
7,T,Q3,2013-10-23,16:30,AT&T Inc. (NYSE:T) Q3 2013 Earnings Call Octob...
8,T,Q4,2014-01-28,16:30,AT&T Inc. (NYSE:T) Q4 2013 Earnings Conference...
9,T,Q2,2013-07-23,16:30,AT&T Inc. (NYSE:T) Q2 2013 Earnings Call July ...


In [6]:
# Add new columns to df
new["sentiment"] = ""
new["positive"] = ""
new["negative"] = ""
new["subjectivity"] = ""

In [7]:
import time
t0 = time.time()

for i in range(new.shape[0]):
    text = new['body'][i]
    text_no_stop_words = stop_words(text)
    
    tokens = lm.tokenize(text_no_stop_words)
    score = lm.get_score(tokens)
    
    new["sentiment"][i] = score['Polarity']
    new["positive"][i] = score["Positive"]
    new["negative"][i] = score["Negative"]
    new["subjectivity"][i] = score["Subjectivity"]

time.time() - t0

1551.0550129413605

In [8]:
new

Unnamed: 0,ticker,quarter,date,time,body,sentiment,positive,negative,subjectivity
0,T,Q1,2019-04-24,08:30,AT&T Inc. (NYSE:T) Q1 2019 Earnings Conference...,0.178707,155,108,0.0747159
1,T,Q4,2018-01-31,16:30,"AT&T, Inc. (NYSE:T) Q4 2017 Results Earnings C...",0.477551,181,64,0.0693462
2,T,Q3,2016-10-24,08:30,"AT&T, Inc. (NYSE:T) Q3 2016 Earnings Call Octo...",0.259434,267,157,0.103063
3,T,Q2,2015-07-23,16:30,AT&T Inc. (NYSE:T) Q2 2015 Earnings Conference...,0.531746,193,59,0.0809249
4,T,Q3,2015-10-22,16:30,"AT&T, Inc. (NYSE:T) Q3 2015 Earnings Call Octo...",0.494424,201,68,0.0761824
5,T,Q3,2014-10-22,16:30,AT&T Inc. (NYSE:T) Q3 2014 Results Earnings Co...,0.317365,110,57,0.0575862
6,T,Q4,2015-01-27,16:30,AT&T Inc. (NYSE:T) Q4 2014 Earnings Conference...,0.257732,122,72,0.0576866
7,T,Q3,2013-10-23,16:30,AT&T Inc. (NYSE:T) Q3 2013 Earnings Call Octob...,0.366516,151,70,0.0720339
8,T,Q4,2014-01-28,16:30,AT&T Inc. (NYSE:T) Q4 2013 Earnings Conference...,0.298701,150,81,0.0681818
9,T,Q2,2013-07-23,16:30,AT&T Inc. (NYSE:T) Q2 2013 Earnings Call July ...,0.333333,158,79,0.065651


In [9]:
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np

adj_closed_df = pd.read_csv('data/adj_close.csv')

new = new.astype({'positive': np.float64, 'negative': np.float64, 'sentiment': np.float64})


adj_closed_df.head(10)

Unnamed: 0,Ticker symbol,Date,Close,Returns
0,AAL,1/3/2012,5.12,0.0
1,AAL,1/4/2012,5.03,-0.017578
2,AAL,1/5/2012,5.47,0.087475
3,AAL,1/6/2012,5.6,0.023766
4,AAL,1/9/2012,5.72,0.021429
5,AAL,1/10/2012,5.72,0.0
6,AAL,1/11/2012,5.91,0.033217
7,AAL,1/12/2012,6.05,0.023689
8,AAL,1/13/2012,5.97,-0.013223
9,AAL,1/17/2012,5.88,-0.015075


In [76]:
# setting new instances of new and adj_closed.df
sentiment = new
stocks = adj_closed_df

In [77]:
sentiment['date'] = pd.to_datetime(sentiment['date'])
stocks['Date'] = pd.to_datetime(stocks['Date'])

In [78]:
# sentiment_df = sentiment.set_index('date')
# # stocks_df = stocks.set_index("Date")
stocks_df = stocks
stocks_df.reset_index()

Unnamed: 0,index,Ticker symbol,Date,Close,Returns
0,0,AAL,2012-01-03,5.120000,0.000000
1,1,AAL,2012-01-04,5.030000,-0.017578
2,2,AAL,2012-01-05,5.470000,0.087475
3,3,AAL,2012-01-06,5.600000,0.023766
4,4,AAL,2012-01-09,5.720000,0.021429
5,5,AAL,2012-01-10,5.720000,0.000000
6,6,AAL,2012-01-11,5.910000,0.033217
7,7,AAL,2012-01-12,6.050000,0.023689
8,8,AAL,2012-01-13,5.970000,-0.013223
9,9,AAL,2012-01-17,5.880000,-0.015075


In [79]:
data = pd.DataFrame(columns=["ticker", "quarter", "date", "sentiment", "returns"])

In [96]:
companies = pd.read_csv("data/companies.csv")

399    Telecommunication Services
Name: GICS Sector, dtype: object

In [99]:
all_rows = []

t0 = time.time()
for row in sentiment_df.iterrows():
    date = row[0]
    ticker = row[1]['ticker']
    quarter = row[1]['quarter']
    sentiment = row[1]['sentiment']
    subjectivity = row[1]['subjectivity']
    pos = row[1]['positive']
    neg = row[1]['negative']
    
    pos_neg_ratio = pos / neg

    company = companies[companies['Ticker symbol'] == ticker]
    
    sector = company['GICS Sector']
    sub_industry = company['GICS Sub Industry']
    
    try:
        idx_to_get = stocks_df[(stocks_df['Ticker symbol'] == ticker) & (stocks_df["Date"] == date)].index[0] + 1
    except: 
        continue

    returns = stocks_df.loc[idx_to_get, ]['Returns']

    new_row = [ticker, quarter, date, sentiment, subjectivity, pos_neg_ratio, sector, sub_industry, returns]

    all_rows.append(new_row)


data = pd.DataFrame(all_rows, columns=["ticker", "quarter", "date", "sentiment", "subjectivity", "pos_neg_ratio", "sector", "sub_industry", "returns"])

print(time.time() - t0)

data
    

500.89223074913025


Unnamed: 0,ticker,quarter,date,sentiment,subjectivity,pos_neg_ratio,sector,sub_industry,returns
0,T,Q1,2019-04-24,0.178707,0.074716,1.435185,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,-0.014615
1,T,Q4,2018-01-31,0.477551,0.069346,2.828125,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,0.045661
2,T,Q3,2016-10-24,0.259434,0.103063,1.700637,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,-0.004341
3,T,Q2,2015-07-23,0.531746,0.080925,3.271186,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,0.010610
4,T,Q3,2015-10-22,0.494424,0.076182,2.955882,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,-0.006478
5,T,Q3,2014-10-22,0.317365,0.057586,1.929825,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,-0.024348
6,T,Q4,2015-01-27,0.257732,0.057687,1.694444,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,-0.003962
7,T,Q3,2013-10-23,0.366516,0.072034,2.157143,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,-0.018424
8,T,Q4,2014-01-28,0.298701,0.068182,1.851852,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,-0.011573
9,T,Q2,2013-07-23,0.333333,0.065651,2.000000,399 Telecommunication Services Name: GICS S...,399 Integrated Telecommunication Services N...,-0.011449


In [100]:
with open("all_data.p", "wb") as f:
    pickle.dump(data, f)