In [None]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go
from scipy.stats import pearsonr
from statsmodels.tsa.arima_model import ARIMA
from fbprophet import Prophet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from datetime import timedelta

import warnings
warnings.filterwarnings('ignore')

import logging
logging.getLogger('fbprophet').setLevel(logging.WARNING)

%matplotlib inline
init_notebook_mode(connected = True)

In [114]:
class PPPredictor:
    
    def __init__(self):
        self.model = Prophet()
    
    def prepare_data(self, data, actual_date):
        df = data.copy()
        df.index = pd.to_datetime(df['Date'])
        df.drop(['Date'], axis=1, inplace=True)
        for col in df.columns:
            df[col].interpolate(method='time', inplace=True)
        df.index = pd.to_datetime(df.index)
        return df[df.index <= pd.to_datetime(actual_date)]

    def fit(self, prepared_data, use_text_model=False):
        prepared_df = pd.DataFrame({'ds': prepared_data.index, 'y': prepared_data['PPSpotAvgPrice']}).reset_index().drop(['Date'], axis=1)
        self.model.fit(prepared_df)
        return self.model
    
    def predict(self, date):
        date = pd.to_datetime(date)
        forecast = self.model.predict(pd.DataFrame({'ds': [date]}))
        return forecast['yhat'][0]

In [171]:
 def get_next_monday(df, date):
    first = True
    while len(df[df.index == str(date).split()[0]]) == 0:
        if first:
            date += timedelta(days=((7 - date.weekday()) % 7))
            first = False
        else:
            date += timedelta(7)
    return date

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def test_sol(date):
    df = pd.read_csv('./data/retrieved_data.csv')
    ppp = PPPredictor()
    prepared = ppp.prepare_data(df, date)
    ppp.fit(prepared)
    
    df.index = pd.to_datetime(df['Date'])
    df.drop(['Date'], axis=1, inplace=True)
    for col in df.columns:
        df[col].interpolate(method='time', inplace=True)
    
    start = pd.to_datetime(date) + timedelta(3 * 30)
    end = pd.to_datetime(date) + timedelta(4 * 30)
    pred = []
    actual = []
    for dt in pd.date_range(start, end, freq='W'):
        pred.append(ppp.predict(str(dt).split()[0]))
        actual_dt = get_next_monday(df, dt)
        right_val = df[df.index == str(actual_dt).split()[0]]['PPSpotAvgPrice'].iloc[0]
        actual.append(right_val)
    print('MAPE for {}: {:.2f}%'.format(date, mean_absolute_percentage_error(pred, actual)))
    

In [172]:
for dt in pd.date_range(start=pd.to_datetime('2013-12-01'), 
                        end=pd.to_datetime('2018-07-01'), freq='M'):
    test_sol(str(dt).split()[0])

MAPE for 2013-12-31: 26.24%
MAPE for 2014-01-31: 23.48%
MAPE for 2014-02-28: 32.62%
MAPE for 2014-03-31: 11.96%
MAPE for 2014-04-30: 10.26%
MAPE for 2014-05-31: 6.99%
MAPE for 2014-06-30: 11.23%
MAPE for 2014-07-31: 13.65%
MAPE for 2014-08-31: 17.79%
MAPE for 2014-09-30: 22.17%
MAPE for 2014-10-31: 15.65%
MAPE for 2014-11-30: 3.12%
MAPE for 2014-12-31: 27.55%
MAPE for 2015-01-31: 54.70%
MAPE for 2015-02-28: 65.78%
MAPE for 2015-03-31: 45.31%
MAPE for 2015-04-30: 22.76%
MAPE for 2015-05-31: 10.54%
MAPE for 2015-06-30: 22.29%
MAPE for 2015-07-31: 17.63%
MAPE for 2015-08-31: 17.99%
MAPE for 2015-09-30: 16.49%
MAPE for 2015-10-31: 11.26%
MAPE for 2015-11-30: 54.29%
MAPE for 2015-12-31: 43.75%
MAPE for 2016-01-31: 18.47%
MAPE for 2016-02-29: 3.56%
MAPE for 2016-03-31: 3.40%
MAPE for 2016-04-30: 1.32%
MAPE for 2016-05-31: 8.98%
MAPE for 2016-06-30: 17.54%
MAPE for 2016-07-31: 26.42%
MAPE for 2016-08-31: 30.91%
MAPE for 2016-09-30: 31.63%
MAPE for 2016-10-31: 31.22%
MAPE for 2016-11-30: 5.11%

In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

f = open("news-links.txt", 'r')

db = pd.DataFrame(columns=['SiteName', 'Text', 'Date'])


links = f.readlines()

for link in links:
    if link.replace(' ', '') == '':
        continue
    print('link -> ', link)
    text = requests.get(link[:-1]).text
    # print(text)
    soup = BeautifulSoup(text, "html.parser")
    # print(soup.prettify())
    # print(soup)
    texts = soup.findAll("div", "StandardArticleBody_body")[0].text
    #print(texts)
    t = soup.find("div", "ArticleHeader_date").text
    raw_date = t[:t.find('/')]
    date = datetime.strptime(raw_date, "%B %d, %Y ")
    article_date = datetime.strftime(date, "%d-%m-%Y")
    print(article_date)
    db = db.append({'SiteName': "https://www.reuters.com",'Text': texts,'Date': article_date}, ignore_index=True)

db.to_csv("./data/retrieved_articles.csv")


link ->  https://www.reuters.com/article/brief-exxonmobil-considers-polypropylene/brief-exxonmobil-considers-polypropylene-production-expansion-along-u-s-gulf-coast-idUSFWN1R20N2

20-03-2018
link ->  https://www.reuters.com/article/brief-jacobs-awarded-contract-to-complet/brief-jacobs-awarded-contract-to-complete-feasibility-study-for-expansion-of-borealis-polypropylene-plants-in-belgium-idUSFWN1Q30YY

13-02-2018
link ->  https://www.reuters.com/article/brief-srf-commissions-first-phase-of-bi/brief-srf-commissions-first-phase-of-bi-axially-oriented-polypropylene-film-line-metallizer-project-idUSFWN1OT032

29-12-2017
link ->  https://www.reuters.com/article/brief-braskem-approves-construction-of-p/brief-braskem-approves-construction-of-polypropylene-production-line-idUSFWN1JJ0F8

22-06-2017
link ->  https://www.reuters.com/article/brief-inter-pipeline-to-build-canadas-fi/brief-inter-pipeline-to-build-canadas-first-integrated-propane-dehydrogenation-and-polypropylene-complex-idUSFWN1OI0K

23-04-2012
link ->  https://www.reuters.com/article/dowchemical-assetsales/dow-chemical-looking-to-raise-1-5-billion-from-asset-sales-idUSL3N0C62JS20130314

14-03-2013
link ->  https://www.reuters.com/article/us-tanzania-agriculture-foodsecurity/pest-proof-bags-and-bins-slim-tanzanias-lean-season-idUSKCN1LN1MJ

07-09-2018
link ->  https://www.reuters.com/article/tanzania-agriculture-foodsecurity/corrected-feature-pest-proof-bags-and-bins-slim-tanzanias-lean-season-idUSL5N1UM7PP

07-09-2018
link ->  https://www.reuters.com/article/dowchemical-assetsales/update-1-dow-chemical-looking-to-raise-1-5-bln-from-asset-sales-idUSL3N0C62LU20130314

14-03-2013
link ->  https://www.reuters.com/article/idUSFWN1C30LG

27-09-2016
link ->  https://www.reuters.com/article/india-press/india-press-reliance-industries-to-build-towers-for-4g-ops-economic-times-idUSL3E8FN1L720120423

23-04-2012
link ->  https://www.reuters.com/article/china-futures-dalian/chinas-dalian-exchange-raises-transaction-fees-for-ir

MissingSchema: Invalid URL '': No schema supplied. Perhaps you meant http://?

In [6]:
!pip install bs4 --user

Collecting bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
