In [1]:
#import modules
import re
from collections import Counter

import numpy as np
import pandas as pd

# Vectorization methods
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Custom preprocessing function
from utils import preprocess_text

In [2]:
# Load the earnings-call transcripts from CSV; the full text of each call is
# in the 'transcript' column.
transcribed_list = pd.read_csv('transcribed_transcript_list.csv')

# Print the first transcript in full as a sanity check on the raw text.
print(transcribed_list.loc[0, 'transcript'])

Apple Inc. (NASDAQ:AAPL) Q4 2020 Earnings Conference Call October 29, 2020  5:00 PM ET
Company Participants 
Tejas Gala - Senior Manager, Corporate Finance and Investor Relations
Tim Cook - Chief Executive Officer
Luca Maestri - Senior Vice President and Chief Financial Officer
Conference Call Participants
Shannon Cross - Cross Research
Jeriel Ong - Deutsche Bank
Katy Huberty - Morgan Stanley
Amit Daryanani - Evercore
Samik Chatterjee - JP Morgan
Krish Sankar - Cowen and Company
Kyle McNealy - Jefferies
Chris Caso - Raymond James
Operator
Good day everyone and welcome to the Apple Inc. Fourth Quarter Fiscal Year 2020 Earnings Conference Call. Today’s call is being recorded.
At this time for opening remarks and introductions, I would like to turn things over to Tejas Gala, Senior Analyst, Corporate Finance and Investor Relations. Please go ahead, sir.
Tejas Gala
Thank you. Good afternoon and thank you for joining us. Speaking first today is Apple’s CEO, Tim Cook, and he will be followed

In [3]:
# Add a 'char_count' column holding the total number of characters in each
# earnings call transcript.
transcribed_list = transcribed_list.assign(
    char_count=transcribed_list['transcript'].str.len()
)
transcribed_list[['transcript', 'char_count']].head()

Unnamed: 0,transcript,char_count
0,Apple Inc. (NASDAQ:AAPL) Q4 2020 Earnings Conf...,49717
1,Apple Inc. (NASDAQ:AAPL) Q3 2020 Results Confe...,46305
2,Apple Inc. (NASDAQ:AAPL) Q2 2020 Results Confe...,46869
3,Apple Inc. (NASDAQ:AAPL) Q1 2020 Results Confe...,46419
4,AbbVie Inc. (NYSE:ABBV) Q3 2020 Results Earnin...,75442


In [4]:
# reference link-https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
# function to remove punctuation
import re,string
#('[^\w\s]','')
# Compiled once instead of on every call. Written as a raw string so the regex
# escapes (\w, \d, \+, \\) reach the regex engine as written rather than
# relying on Python passing unknown string escapes through (a SyntaxWarning on
# recent Python versions). The pattern itself is unchanged.
_LINK_REGEX = re.compile(
    r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)


def strip_links(text):
    """Replace every http/https link in ``text`` with ``', '``.

    Each regex match is substituted in a single pass. The previous
    find-then-``str.replace`` loop replaced link *strings* one after another,
    so a link that was a prefix of a later link (``http://a.com`` followed by
    ``http://a.com/b``) damaged the longer one and left ``/b`` behind.

    Parameters
    ----------
    text : str
        Text that may contain links.

    Returns
    -------
    str
        ``text`` with each matched link replaced by ``', '``.
    """
    return _LINK_REGEX.sub(', ', text)

def strip_all_entities(text):
    """Blank out punctuation and drop words that start with '@' or '#'.

    Every character of ``string.punctuation`` except '@' and '#' is turned
    into a space; the text is then split on whitespace, words whose first
    character is '@' or '#' are discarded, and the remaining words are joined
    with single spaces.
    """
    entity_prefixes = ['@', '#', ""]
    # One translation table does the same job as replacing each punctuation
    # character in turn.
    to_space = {
        ord(mark): ' '
        for mark in string.punctuation
        if mark not in entity_prefixes
    }
    tokens = text.translate(to_space).split()
    return ' '.join(tok for tok in tokens if tok[0] not in entity_prefixes)

In [5]:
# Clean every transcript in two passes: links first, then punctuation and
# @/# entities. The result is stored in a new 'no_punctuation' column.
transcribed_list['no_punctuation'] = (
    transcribed_list['transcript']
    .map(strip_links)
    .map(strip_all_entities)
)

print(transcribed_list['no_punctuation'].head(6))

0    Apple Inc NASDAQ AAPL Q4 2020 Earnings Confere...
1    Apple Inc NASDAQ AAPL Q3 2020 Results Conferen...
2    Apple Inc NASDAQ AAPL Q2 2020 Results Conferen...
3    Apple Inc NASDAQ AAPL Q1 2020 Results Conferen...
4    AbbVie Inc NYSE ABBV Q3 2020 Results Earnings ...
5    AbbVie Inc NYSE ABBV Q2 2020 Earnings Conferen...
Name: no_punctuation, dtype: object


In [6]:
#remove quotes and apostrophes
# string.punctuation only covers ASCII, so characters such as the curly
# apostrophe survive the previous step; strip anything that is neither a word
# character nor whitespace.
# regex=True is required: Series.str.replace treats the pattern as a literal
# string by default in pandas 2.x, in which case nothing would be removed.
transcribed_list['no_punctuation'] = transcribed_list['no_punctuation'].str.replace(
    r'[^\w\s]', '', regex=True)
print(transcribed_list['no_punctuation'][0:6])

0    Apple Inc NASDAQ AAPL Q4 2020 Earnings Confere...
1    Apple Inc NASDAQ AAPL Q3 2020 Results Conferen...
2    Apple Inc NASDAQ AAPL Q2 2020 Results Conferen...
3    Apple Inc NASDAQ AAPL Q1 2020 Results Conferen...
4    AbbVie Inc NYSE ABBV Q3 2020 Results Earnings ...
5    AbbVie Inc NYSE ABBV Q2 2020 Earnings Conferen...
Name: no_punctuation, dtype: object


In [None]:
# Preprocess the punctuation-free transcripts with preprocess_text from
# utils.py. The helper returns an iterable of tokens, which is joined back
# into one space-separated string per transcript.
# NOTE(review): min_word_length=4 presumably drops shorter tokens -- confirm
# against utils.py.
transcribed_list['preprocess_trans'] = [
    ' '.join(preprocess_text(doc, min_word_length=4))
    for doc in transcribed_list['no_punctuation']
]
print(transcribed_list['preprocess_trans'].head(6))

In [None]:
# Infrequent words: count every token across all preprocessed transcripts and
# keep the 400 entries at the bottom of the descending frequency table.
# NOTE(review): the cutoff of 400 is hard-coded; the original comment relates
# it to 10% of the number of transcripts -- confirm.
corpus_tokens = ' '.join(transcribed_list['preprocess_trans']).split()
in_freq_words = pd.Series(corpus_tokens).value_counts().tail(400)
in_freq_words.head()

In [None]:
# list frequent words: the 400 most common tokens across all transcripts.
# value_counts() sorts in descending order, so the frequent words are the
# FIRST 400 rows; the previous slice [400:] skipped exactly those rows and
# returned everything else.

freq_words = pd.Series(' '.join(transcribed_list['preprocess_trans']).split()).value_counts().head(400)
freq_words.head()

In [None]:
# Removing the infrequent words from the transcript file.
# The isinstance guard keeps this cell re-runnable: after the first run
# in_freq_words is already a plain list of words and has no .index attribute.
if isinstance(in_freq_words, pd.Series):
    in_freq_words = list(in_freq_words.index)

# Set membership is O(1) per token; scanning the list for every token of every
# transcript was needlessly slow.
infrequent_vocab = set(in_freq_words)
transcribed_list['clean_trans'] = transcribed_list['preprocess_trans'].apply(
    lambda doc: " ".join(word for word in doc.split() if word not in infrequent_vocab))
transcribed_list['clean_trans'].head()

In [None]:
# Flatten the cleaned transcripts into a single list of lower-cased tokens.
all_words = [
    word.lower()
    for line in transcribed_list['clean_trans']
    for word in line.split()
]
print(all_words[10])

# Feature vocabulary: the 1000 most common distinct words.
# The previous list(all_words)[:1000] simply took the first 1000 tokens of the
# first transcript(s), duplicates included, not the most common words.
featured_words = [word for word, _ in Counter(all_words).most_common(1000)]
print(featured_words)

In [None]:
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(12,5))
plt.xticks(fontsize=13, rotation=90)
fd = nltk.FreqDist(all_words)
#display top word frequency of the top 40 words
fd.plot(40,cumulative=False)

# Exports graph as a png image file
plt.savefig("Fig1Top40Words.png")

In [None]:
# Log-log plot of word frequency for every distinct word.
from collections import Counter

# Occurrence count of each distinct word, in descending order.
word_counts = sorted(Counter(all_words).values(), reverse=True)

fig, ax = plt.subplots(figsize=(12, 5))
ax.loglog(word_counts, linestyle='-', linewidth=1.5)
ax.set_ylabel("Word Frequency")
ax.set_xlabel("Word Ranking")

# Write the figure to disk as a PNG image.
fig.savefig("Fig2WordFreqandRank.png")

In [None]:
# Display the first rows of the DataFrame to check the columns added by the
# earlier cleaning cells.
transcribed_list.head()

In [None]:
# Load English stop-word lists from scikit-learn and from NLTK.
from sklearn.feature_extraction import text #import package

# NOTE(review): skl_stopwords is not referenced by any other cell visible in
# this notebook; only the NLTK list below is used afterwards.
skl_stopwords = text.ENGLISH_STOP_WORDS
#print(skl_stopwords)

from nltk.corpus import stopwords # for excluding the stopwords

# NLTK's default English stop words; later cells use this for filtering.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to have been
# downloaded beforehand -- confirm for fresh environments.
nltk_stopwords = stopwords.words("english")
print(nltk_stopwords)

In [None]:
#remove stop words from latest cleaned transcript
# Build the lookup set once: nltk_stopwords is a list, so testing membership
# against it for every token of every transcript scanned the whole list each
# time.
nltk_stopword_set = set(nltk_stopwords)
transcribed_list['clean_trans2'] = transcribed_list['clean_trans'].apply(
    lambda doc: " ".join(word for word in doc.split() if word not in nltk_stopword_set))
transcribed_list['clean_trans2'][:6]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts over the stop-word-filtered transcripts. The NLTK
# stop list is passed to the vectorizer as well, so any stop word still present
# after tokenisation is excluded from the vocabulary.
count_vect = CountVectorizer(binary=False, stop_words=nltk_stopwords)

count_vect_custom_stopwords = count_vect.fit_transform(transcribed_list['clean_trans2'])
print(count_vect_custom_stopwords.shape)
print(count_vect_custom_stopwords)

# get_feature_names() was removed in scikit-learn 1.2. Use
# get_feature_names_out() when it exists and fall back to the old method on
# releases that predate it.
if hasattr(count_vect, 'get_feature_names_out'):
    names = list(count_vect.get_feature_names_out())
else:
    names = count_vect.get_feature_names()

# Sum each vocabulary column directly on the sparse matrix; converting the
# whole document-term matrix to a dense array first only costs memory.
count = np.asarray(count_vect_custom_stopwords.sum(axis=0)).ravel()
count2 = count.tolist()

#dataframe of output
count_df = pd.DataFrame(count2, index=names, columns=['Word Count'])
#  top 20 features by count
count_df.sort_values(['Word Count'], ascending=False)[:20]



In [None]:
# Display the feature vocabulary built in an earlier cell.
featured_words

In [None]:
#attempt Naive Bayes Classifier to predict sentiments

# Feature vocabulary: the 1000 most common distinct words in the corpus.
# list(all_words)[:1000] only took the first 1000 tokens in document order,
# duplicates included, because all_words is a plain list of tokens.
featured_words = [word for word, _ in Counter(all_words).most_common(1000)]
print(featured_words)

def document_features(document, feature_words=None):
    """Build a boolean "contains(word)" feature dict for one document.

    Parameters
    ----------
    document : str or iterable of str
        The document to describe. A string is split on whitespace into words
        first; previously a string was turned into a set of its *characters*,
        so no multi-character feature word could ever match.
    feature_words : iterable of str, optional
        Words to test for. Defaults to the notebook-level ``featured_words``
        list, which keeps existing ``document_features(document)`` calls
        working unchanged.

    Returns
    -------
    dict
        Maps ``'contains(<word>)'`` to True when the word occurs in the
        document and False otherwise, for every feature word.
    """
    if feature_words is None:
        feature_words = featured_words
    if isinstance(document, str):
        document = document.split()
    # Set lookup keeps each membership test O(1).
    document_words = set(document)
    return {
        'contains({})'.format(word): (word in document_words)
        for word in feature_words
    }


# Earlier attempt, kept for reference: pair each document's feature dict with
# its class label and split into train/test.
#featuresets = [(document_features(d), c) for (d,c) in transcribed_list['clean_trans2']]
#train_set, test_set = featuresets[200:], featuresets[:200]
# NOTE(review): train_set/test_set here are slices of featured_words, i.e.
# plain word strings, not labelled feature dicts. NLTK documents
# NaiveBayesClassifier.train as taking (featureset, label) pairs, so this call
# is expected to fail. No sentiment/label column is visible anywhere in this
# notebook -- a label source is needed before the classifier can be trained.
train_set, test_set = featured_words[100:], featured_words[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Second training attempt: hold out the first 200 rows as the test set and
# train on the rest.
# NOTE(review): featuresets is the raw 'clean_trans2' column (one cleaned
# string per transcript), not (feature dict, label) pairs, so train() is
# expected to fail here as well. The documents need to go through
# document_features and be paired with a label first -- confirm where the
# labels come from.
featuresets = transcribed_list['clean_trans2']
train_set, test_set = featuresets[200:], featuresets[:200]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Report the classifier's accuracy on the held-out test_set; depends on the
# classifier and test_set created by the preceding training cell.
print(nltk.classify.accuracy(classifier, test_set))

In [None]:
# Print the 5 most informative features of the trained classifier.
classifier.show_most_informative_features(5)