<a href="https://colab.research.google.com/github/Gaukhar-ai/DecisionTree/blob/master/OIL_companies_Sentimental_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
import nltk
import spacy
import re



In [2]:
# Get Data
# Base URL of a finviz quote page; a ticker symbol is appended to it to build each request URL.
finviz_url = 'https://finviz.com/quote.ashx?t='
# Filled by the scraping cell: maps a ticker to the parsed `news-table` element of its page.
news_tables = {}

In [3]:
# Parameters
n = 5 # number of article headlines displayed per ticker
tickers = ['SNP', 'PTR', 'BP', 'XOM', 'TOT', 'CVX', 'MPC']

# These tickers were picked deliberately: they are oil companies, chosen because the
# oil market is unusual at the moment and offers many opportunities.

In [4]:
# Scrape the finviz quote page of every ticker and keep its news table.

for ticker in tickers:
    url = finviz_url + ticker
    req = Request(url=url, headers={'user-agent': 'my-app/0.0.1'})
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(req) as resp:
        html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    if news_table is None:
        # The page has no news table: report it instead of storing None.
        print('No news table found for {}'.format(ticker))
        continue
    news_tables[ticker] = news_table

# Preview the first `n` headlines of each ticker.
for ticker in tickers:
    ticker_table = news_tables.get(ticker)
    if ticker_table is None:
        # Skip only this ticker; the remaining tickers are still previewed.
        continue

    print ('\n')
    print ('Recent News Headlines for {}: '.format(ticker))

    shown = 0
    for table_row in ticker_table.find_all('tr'):
        if shown >= n:
            break
        if table_row.a is None or table_row.td is None:
            # Not a headline row (no link or no timestamp cell).
            continue
        a_text = table_row.a.text
        td_text = table_row.td.text.strip()
        print(a_text,'(',td_text,')')
        shown += 1



Recent News Headlines for SNP: 
Sinopec (SNP) Jumps More Than 13% on Q3 Earnings Improvement ( Nov-10-20 08:55AM )
China Ready to Pick Up Slack With Global Oil Demand Wavering ( Nov-09-20 10:17PM )
Sinopec's Net Profit for 2020 Q3 Reached RMB 23.507 Billion Campaign of Continuously Tiding Over Difficulties and Improving Performances Achieves Favorable Results with Significant Improvement of Operation and Profitability ( Oct-28-20 11:05PM )
China's Sinopec flips to quarterly profit on robust refining business ( 06:44AM )
China Recruits a South Korean Conglomerate to Advise on ESG ( Oct-18-20 10:25PM )


Recent News Headlines for PTR: 
China Ready to Pick Up Slack With Global Oil Demand Wavering ( Nov-09-20 10:17PM )
PetroChina (PTR) Q3 Earnings Jump on Pipeline Spin-Off Gains ( 08:32AM )
PetroChina Company Limited (PTR) Q3 2020 Earnings Call Transcript ( Oct-30-20 11:01PM )
Zero Hour Is Coming for Emissions. Believe It ( Oct-25-20 06:00PM )
PetroChina Company (PTR) Enters Oversold Ter

In [26]:
#a_text = re.sub(r'Chapter \d+', '', a_text)
#print('Chapter headings removed:', url[0:100])

In [25]:
#url = ' '.join(url.split())

#print('extra whitespace removed:', a_text[0:100])

In [23]:
# Remove newlines and other extra whitespace by splitting and rejoining
#url = ' '.join(a_text.split())
#print('Extra whitespace removed:', url)

In [24]:
# Tokenization
# Later cells read `url_doc`, so it has to be built here rather than left commented out.
# The text parsed is the most recently scraped headline (`a_text`), with runs of
# whitespace collapsed to single spaces.
nlp = spacy.load('en_core_web_sm')  # the bare 'en' shortcut is not available in spaCy v3+
url_doc = nlp(' '.join(a_text.split()))

In [9]:
# Inspect the parsed document: its type, its length, and its leading tokens.
# `url_doc` must already have been created by an earlier cell.
doc_type = type(url_doc)
print("The url_doc object is a {} object.".format(doc_type))
token_count = len(url_doc)
print("It is {} tokens long".format(token_count))
leading_tokens = url_doc[:3]
print("The first three tokens are '{}'".format(leading_tokens))
token_type = type(url_doc[0])
print("The type of each token is {}".format(token_type))

The url_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 12 tokens long
The first three tokens are 'Marathon (MPC'
The type of each token is <class 'spacy.tokens.token.Token'>


In [10]:
# Drop stop words: keep only the tokens whose `is_stop` flag is false.
url_without_stopwords = list(filter(lambda token: not token.is_stop, url_doc))


In [11]:
# Utility function to calculate how frequently words appear in the text
def word_frequencies(text):
    """Count how often each non-punctuation token's text occurs.

    `text` is any iterable of token-like objects exposing the attributes
    `is_punct` and `text`. Tokens flagged as punctuation are ignored.

    Returns a `Counter` mapping token text to its number of occurrences.
    """
    return Counter(token.text for token in text if not token.is_punct)

# Keep the ten most frequent remaining tokens as (word, count) pairs
token_counts = word_frequencies(url_without_stopwords)
url_word_freq = token_counts.most_common(10)
print('\nurl:', url_word_freq)


url: [('Marathon', 1), ('MPC', 1), ('Q3', 1), ('Loss', 1), ('Narrower', 1), ('Expected', 1), ('Sales', 1), ('Miss', 1)]


In [12]:
 #Iterate through the news
# Build one [ticker, date, time, headline] record per news-table row.
parsed_news = []
for ticker, news_table in news_tables.items():
    if news_table is None:
        # Nothing was scraped for this ticker.
        continue

    # A row that shows only a time reuses the date of the nearest dated row
    # above it. Reset per ticker so a date never leaks in from the previous table.
    date = None
    for row in news_table.find_all('tr'):
        if row.a is None or row.td is None:
            # Not a headline row (no link or no timestamp cell).
            continue

        date_scrape = row.td.text.split()
        if not date_scrape:
            # Empty timestamp cell: there is nothing to record for this row.
            continue

        if len(date_scrape) == 1:
            time = date_scrape[0]
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        # The dict key is already the ticker symbol, so it is used as-is.
        parsed_news.append([ticker, date, time, row.a.get_text()])

In [13]:
# Download the VADER lexicon used by SentimentIntensityAnalyzer.
# `nltk` is already imported in the first cell, so it is not re-imported here.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [14]:
# Download the Punkt models used by nltk's sentence tokenizer.
# `nltk` is already imported in the first cell, so it is not re-imported here.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [22]:
from nltk import sent_tokenize
# Split the text held in `url` into sentences.
# NOTE(review): every visible cell that rebinds `url` to headline text is commented
# out, so on a fresh top-to-bottom run `url` is still the last finviz request URL
# assigned in the scraping loop — confirm this is the text meant to be tokenized.
# NOTE(review): `sents` is not read again in the visible part of the notebook.
sents = sent_tokenize(url)

In [16]:
# Sentiment Analysis
# Score every headline with VADER and attach the scores as extra columns.
analyzer = SentimentIntensityAnalyzer()

columns = ['Ticker', 'Date', 'Time', 'Headline']
news = pd.DataFrame(data=parsed_news, columns=columns)

# One dict of polarity scores per headline, in row order
scores = [analyzer.polarity_scores(headline) for headline in news['Headline']]
df_scores = pd.DataFrame(scores)

# `rsuffix` only takes effect if a score column collides with an existing column name
news = news.join(df_scores, rsuffix='_right')

In [17]:
# Parse the scraped date strings and keep only the calendar date
news['Date'] = pd.to_datetime(news['Date']).dt.date

In [18]:
# Split the scored headlines into one sub-frame per ticker, keyed by ticker symbol
unique_ticker = news['Ticker'].unique().tolist()
news_dict = dict(
    (name, news.loc[news['Ticker'] == name])
    for name in unique_ticker
)

In [19]:

# Print a preview of each ticker's scores and collect its mean compound score.
values = []
for ticker in tickers:
    dataframe = news_dict.get(ticker)
    if dataframe is None:
        # No headlines were parsed for this ticker. Record NaN so that `values`
        # stays aligned, position by position, with `tickers` (the two lists
        # are zipped together afterwards).
        print ('\n')
        print ('No headlines available for {}'.format(ticker))
        values.append(float('nan'))
        continue

    dataframe = dataframe.set_index('Ticker')
    dataframe = dataframe.drop(columns = ['Headline'])
    print ('\n')
    print (dataframe.head())

    # Average compound score over all of the ticker's headlines, rounded to 2 decimals
    mean = round(dataframe['compound'].mean(), 2)
    values.append(mean)



              Date     Time    neg    neu    pos  compound
Ticker                                                    
SNP     2020-11-10  08:55AM  0.000  0.750  0.250    0.4588
SNP     2020-11-09  10:17PM  0.228  0.588  0.184    0.1027
SNP     2020-10-28  11:05PM  0.055  0.521  0.424    0.9169
SNP     2020-10-28  06:44AM  0.000  0.602  0.398    0.6486
SNP     2020-10-18  10:25PM  0.000  1.000  0.000    0.0000


              Date     Time    neg    neu    pos  compound
Ticker                                                    
PTR     2020-11-09  10:17PM  0.228  0.588  0.184    0.1027
PTR     2020-11-09  08:32AM  0.000  0.769  0.231    0.3400
PTR     2020-10-30  11:01PM  0.192  0.808  0.000   -0.2263
PTR     2020-10-25  06:00PM  0.000  1.000  0.000    0.0000
PTR     2020-10-21  06:36AM  0.000  1.000  0.000    0.0000


              Date     Time    neg    neu    pos  compound
Ticker                                                    
BP      2020-11-10  12:18PM  0.106  0.794  0.101  

In [20]:
# Rank the tickers by mean compound sentiment, most positive first
df = (
    pd.DataFrame(list(zip(tickers, values)), columns=['Ticker', 'Mean Sentiment'])
    .set_index('Ticker')
    .sort_values('Mean Sentiment', ascending=False)
)
print ('\n')
print (df)



        Mean Sentiment
Ticker                
TOT               0.13
CVX               0.03
SNP               0.02
BP                0.01
MPC               0.00
PTR              -0.02
XOM              -0.07
