<a href="https://colab.research.google.com/github/Gaukhar-ai/for_my_Thinkful_work/blob/master/OIL_companies_Sentimental_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [5]:
# Parameters
# n       -> how many headlines to print per ticker in the preview
# tickers -> symbols appended to the quote URL, one request each
n = 5
tickers = [
    'SNP',
    'PTR',
    'BP',
    'XOM',
    'TOT',
    'CVX',
    'MPC',
]

In [6]:
# Get Data
# Base quote URL; the ticker symbol is appended to it for each request.
finwiz_url = 'https://finviz.com/quote.ashx?t='

# Filled by the fetch loop: ticker -> parsed 'news-table' element.
news_tables = dict()

In [7]:
# Fetch the quote page of every ticker and keep its 'news-table' element.
for ticker in tickers:
    url = finwiz_url + ticker
    req = Request(url=url, headers={'user-agent': 'my-app/0.0.1'})
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(req) as resp:
        html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    if news_table is None:
        # find() returns None when the element is absent; storing it would
        # only defer the failure to the first attribute access below.
        print('No news table found for {}; skipping.'.format(ticker))
        continue
    news_tables[ticker] = news_table

# Preview: print up to `n` headlines (with their timestamp cell) per ticker.
for ticker in tickers:
    news_table = news_tables.get(ticker)
    if news_table is None:
        # Nothing was stored for this ticker; move on instead of aborting
        # the preview for every remaining ticker.
        continue

    print ('\n')
    print ('Recent News Headlines for {}: '.format(ticker))

    shown = 0
    for table_row in news_table.find_all('tr'):
        if shown >= n:
            break
        # Skip rows that have no link or no cell instead of raising
        # AttributeError on `None.text`.
        if table_row.a is None or table_row.td is None:
            continue
        a_text = table_row.a.text
        td_text = table_row.td.text.strip()
        print(a_text,'(',td_text,')')
        shown += 1



Recent News Headlines for SNP: 
Sinopec's Net Profit for 2020 Q3 Reached RMB 23.507 Billion Campaign of Continuously Tiding Over Difficulties and Improving Performances Achieves Favorable Results with Significant Improvement of Operation and Profitability ( Oct-28-20 11:05PM )
China's Sinopec flips to quarterly profit on robust refining business ( 06:44AM )
China Recruits a South Korean Conglomerate to Advise on ESG ( Oct-18-20 10:25PM )
Sinopec Group Overseas Development (2015) Ltd -- Moody's affirms A1 ratings of Sinopec Group and Sinopec Corp; outlook stable ( Oct-16-20 07:20PM )
U.S. State Department may add Ant Group to trade blacklist: RPT ( 03:11PM )


Recent News Headlines for PTR: 
PetroChina (PTR) Q3 Earnings Jump on Pipeline Spin-Off Gains ( Nov-09-20 08:32AM )
PetroChina Company Limited (PTR) Q3 2020 Earnings Call Transcript ( Oct-30-20 11:01PM )
Zero Hour Is Coming for Emissions. Believe It ( Oct-25-20 06:00PM )
PetroChina Company (PTR) Enters Oversold Territory ( Oct-21

In [8]:
 #Iterate through the news
# Flatten every stored news table into rows of [ticker, date, time, headline].
parsed_news = []
for ticker, news_table in news_tables.items():
    if news_table is None:
        # No table was found for this ticker; nothing to parse.
        continue

    # Reset per ticker so a time-only first row can neither raise NameError
    # nor silently inherit the previous ticker's date.
    date = None

    for x in news_table.find_all('tr'):
        # Rows without a link or a cell carry no headline; skip them rather
        # than failing on `None.get_text()`.
        if x.a is None or x.td is None:
            continue

        text = x.a.get_text()
        date_scrape = x.td.text.split()
        if not date_scrape:
            continue

        if len(date_scrape) == 1:
            # Only a time is present: keep the date of the preceding row.
            time = date_scrape[0]
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        # The dict key is the ticker itself; it is used unmodified so it
        # matches the later per-ticker lookups.
        parsed_news.append([ticker, date, time, text])

In [9]:
# Download the 'vader_lexicon' NLTK data package.
# NOTE(review): presumably required by the SentimentIntensityAnalyzer that is
# instantiated in a later cell -- confirm before removing.
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [10]:
# Sentiment Analysis
# Score each headline and attach the resulting score columns to the news frame.
analyzer = SentimentIntensityAnalyzer()

columns = ['Ticker', 'Date', 'Time', 'Headline']
news = pd.DataFrame(parsed_news, columns=columns)

# One polarity-score dict per headline, in row order.
scores = [analyzer.polarity_scores(headline) for headline in news['Headline']]
df_scores = pd.DataFrame(scores)

# Both frames share the default positional index, so join aligns row by row.
news = news.join(df_scores, rsuffix='_right')

In [11]:
# Convert the scraped date strings to datetime.date objects.
news['Date'] = pd.to_datetime(news['Date']).dt.date

In [12]:
# Split the news frame into one sub-frame per ticker, keyed by ticker symbol.
unique_ticker = news['Ticker'].unique().tolist()
news_dict = {}
for name in unique_ticker:
    news_dict[name] = news.loc[news['Ticker'] == name]

In [13]:

# Mean compound score per ticker, in the same order as `tickers` so the two
# lists can be zipped together afterwards.
values = []
for ticker in tickers:
    ticker_news = news_dict.get(ticker)
    if ticker_news is None:
        # A requested ticker with no parsed headlines has no entry in
        # news_dict; record NaN so `values` stays aligned with `tickers`
        # instead of raising KeyError.
        print ('\n')
        print ('No headlines available for {}'.format(ticker))
        values.append(float('nan'))
        continue

    dataframe = ticker_news.set_index('Ticker')
    dataframe = dataframe.drop(columns = ['Headline'])
    print ('\n')
    print (dataframe.head())

    mean = round(dataframe['compound'].mean(), 2)
    values.append(mean)



              Date     Time    neg    neu    pos  compound
Ticker                                                    
SNP     2020-10-28  11:05PM  0.055  0.521  0.424    0.9169
SNP     2020-10-28  06:44AM  0.000  0.602  0.398    0.6486
SNP     2020-10-18  10:25PM  0.000  1.000  0.000    0.0000
SNP     2020-10-16  07:20PM  0.000  0.891  0.109    0.2960
SNP     2020-10-16  03:11PM  0.000  1.000  0.000    0.0000


              Date     Time    neg    neu    pos  compound
Ticker                                                    
PTR     2020-11-09  08:32AM  0.000  0.769  0.231    0.3400
PTR     2020-10-30  11:01PM  0.192  0.808  0.000   -0.2263
PTR     2020-10-25  06:00PM  0.000  1.000  0.000    0.0000
PTR     2020-10-21  06:36AM  0.000  1.000  0.000    0.0000
PTR     2020-10-16  03:11PM  0.000  1.000  0.000    0.0000


              Date     Time    neg    neu    pos  compound
Ticker                                                    
BP      2020-11-09  09:42AM  0.000  1.000  0.000  

In [14]:
# Summary table: one row per ticker, highest mean sentiment first.
ticker_means = [(symbol, score) for symbol, score in zip(tickers, values)]
df = (
    pd.DataFrame(ticker_means, columns=['Ticker', 'Mean Sentiment'])
    .set_index('Ticker')
    .sort_values('Mean Sentiment', ascending=False)
)
print ('\n')
print (df)



        Mean Sentiment
Ticker                
TOT               0.13
CVX               0.03
SNP               0.01
BP                0.01
MPC               0.00
PTR              -0.02
XOM              -0.07
