This notebook performs a sentiment analysis on GE earnings call transcripts.
Sentiment analysis is run on a publicly traded company's earnings conference calls, to get an overall sentiment of the earnings calls from GE (General Electric) company executives.

The two conference calls are:

Title: General Electric Company (GE) Presents At Electrical Products Group Conference (Transcript) 
Date of document: May 23, 2018 4:42 PM ET 

Title: General Electric's (GE) CEO John Flannery on Q1 2018 Results - Earnings Call Transcript 
Date of document: Apr. 20, 2018 3:04 PM ET 

# The Web Scrape

This file web scrapes Seeking Alpha for GE earnings calls.  I picked GE because it has been (or was) a solid
company for the last 30+ years. 

This program scrapes the site using a random User-Agent from a list (300 User-Agents) from Chrome, Firefox and IE.

In [7]:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re
from urllib.request import Request
from time import sleep
import pandas as pd
import random

## Base URL >> Seeking Alpha
url_base = "https://seekingalpha.com/"
## Dictionary >> short label -> article path (relative to url_base) of each transcript
trans_dict = dict(
    GE_conf="article/4176676-general-electric-company-ge-presents-electrical-products-group-conference-transcript",
    GE_earn_Q1_18="article/4164470-general-electrics-ge-ceo-john-flannery-q1-2018-results-earnings-call-transcript",
)

## Full URL of the first transcript listed in trans_dict
first_article_path = next(iter(trans_dict.values()))
url = url_base + first_article_path


### Function to clean up HTML tags inside body
def html_clean(text):
    """Return ``text`` with HTML tags removed.

    A tag is any run that starts with ``<``, contains one or more characters
    other than ``<``, and ends at the nearest following ``>``. Everything
    else, including entities such as ``&amp;``, is left untouched.
    """
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub("", text)

### Text file the scraped transcripts are written to
file_name = "seek_alpha_ws_test.txt"
### Separator line >> written once at the top of the file and again after every transcript
doc_title = "+-+-+-+-+-+-+-+-+-+\n"

#### Reading in the list of user agents >> first "|"-delimited column of user_agents.txt
# NOTE(review): read_csv treats the first line as a header by default -- confirm
# user_agents.txt starts with a header row, otherwise the first User-Agent is dropped.
user_agent_list = pd.read_csv("user_agents.txt", delimiter = "|")
user_agent_list = list(user_agent_list.iloc[:,0])

#### This loop goes through the different urls in the dictionary >> one record per transcript
#### The with-block closes the output file even if a request or a parsing step raises
with open(file_name, "w") as f:
    f.write(doc_title)

    for article_path in trans_dict.values():
        my_url = url_base + article_path
        print(my_url)

        # A fresh random User-Agent is picked for every request
        req = Request(my_url, headers={
            'User-Agent': (random.choice(user_agent_list))})
        with uReq(req) as uClient:  # sends GET request to URL, connection is closed on exit
            page_html = uClient.read()  # reads returned data and puts it in a variable

        ### Getting the article >> the container holding title, date and body
        page_soup = soup(page_html, "html.parser")
        articles = page_soup.find("article")  # For article body
        if articles is None:
            # Fail with a clear message instead of an opaque TypeError/NameError further down
            raise RuntimeError("No <article> element found at " + my_url)

        title_container = articles.find("h1", {"itemprop":"headline"})  # Title container
        title = title_container.text  # Extracts title from container

        time_container = articles.find("time", {"itemprop":"datePublished"})  # Time container
        time = time_container.text  # Extracts the Date and Time article was published

        # The body is deliberately kept as the string form of the whole result set
        # ("[<p>..</p>, <p>..</p>]"): the sentiment cell recognises content lines by that "[".
        body_container = articles.find_all("p")
        body = str(body_container)  # Converts body_container resultset >> to a string
        body = html_clean(body)  # Cleaning out the HTML tags
        body = body.replace("&amp;", "&")

        f.write("Title: \n"+title+"\nDate of document: \n"+time+"\nContent: \n"+body+"\n+-+-+-+-+-+-+-+-+-+\n")
        sleep(7)  # pause 7 seconds before the next request

https://seekingalpha.com/article/4176676-general-electric-company-ge-presents-electrical-products-group-conference-transcript
https://seekingalpha.com/article/4164470-general-electrics-ge-ceo-john-flannery-q1-2018-results-earnings-call-transcript


# The Sentiment Analysis

In [8]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as SIA
import re


analyzer = SIA()

#### Reading the whole document >> VADER scores it piece by piece
#### sent_sentence  >> one polarity_scores() dict per "."-separated piece, all documents in file order
#### doc_line_count >> number of "."-separated pieces found on each scored line
sent_sentence = []
doc_line_count = []
file = "seek_alpha_ws_test.txt"
with open(file, "r") as fileinput:
    for line in fileinput:
        # Only lines containing "[" are scored: the scraping cell writes each article body
        # as a stringified list, so title, date and separator lines are skipped.
        if "[" in line:
            line = line.rstrip()  # Strips the trailing white space
            split_line = line.split(".")
            doc_line_count.append(len(split_line))
            sent_sentence.extend(
                analyzer.polarity_scores(sent_line) for sent_line in split_line
            )


#### Getting the docs from sent_sentence >> indexing by doc_line_count
# NOTE(review): the 446 boundary is hard-coded; presumably it is doc_line_count[0] from the
# scrape this notebook was written against -- confirm and re-derive it after re-scraping.
ge_doc_1 = sent_sentence[0:446] #General Electric Company (GE) Presents At Electrical Products Group Conference (Transcript)
ge_doc_2 = sent_sentence[446:] #General Electric's (GE) CEO John Flannery on Q1 2018 Results - Earnings Call Transcript


#### This function walks a list of sentiment dictionaries
#### It averages each score (pos, neg, neu, compound) across the list and reports it as a percent

###### Used for both ge_doc_1 and ge_doc_2
def doc_sent(doc):
    """Average the sentiment scores of a document and express them as percentages.

    Args:
        doc: Non-empty sequence of dicts, each providing the keys ``"pos"``,
            ``"neg"``, ``"neu"`` and ``"compound"`` (the shape produced by
            ``analyzer.polarity_scores`` elsewhere in this file).

    Returns:
        A tuple of four ``(label, value)`` pairs, in the order
        ``"Positive: "``, ``"Negative: "``, ``"Neutral: "``, ``"Compound: "``.
        Each value is the mean of that score over ``doc``, multiplied by 100
        and rounded to two decimals.

    Raises:
        ValueError: If ``doc`` is empty, since there is nothing to average.
    """
    if not doc:
        # Without this guard the averages below fail with a bare ZeroDivisionError.
        raise ValueError("doc_sent() needs at least one sentiment score to average")

    # Output label paired with the score key it summarises, in output order.
    labelled_keys = (
        ("Positive: ", "pos"),
        ("Negative: ", "neg"),
        ("Neutral: ", "neu"),
        ("Compound: ", "compound"),
    )
    count = len(doc)

    # mean -> percent -> two decimals, the same arithmetic order for every score
    return tuple(
        (label, round((sum(scores[key] for scores in doc) / count) * 100, 2))
        for label, key in labelled_keys
    )

## One averaged-sentiment summary per document, each produced by doc_sent
GE_doc_1_sentiment, GE_doc_2_sentiment = doc_sent(ge_doc_1), doc_sent(ge_doc_2)

## Showing each document's sentiment, first document first
for doc_sentiment in (GE_doc_1_sentiment, GE_doc_2_sentiment):
    print(doc_sentiment)

(('Positive: ', 9.88), ('Negative: ', 1.3), ('Neutral: ', 88.82), ('Compound: ', 19.43))
(('Positive: ', 10.0), ('Negative: ', 2.1), ('Neutral: ', 86.62), ('Compound: ', 17.16))
