In [8]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.svm import SVC
import re

In [9]:
# Read the labelled headline dataset (training data) from the working directory
dataset_path = "gold-dataset-sinha-khandait.csv"
df = pd.read_csv(dataset_path)
df

Unnamed: 0,Dates,URL,News,Price Direction Up,Price Direction Constant,Price Direction Down,Asset Comparision,Past Information,Future Information,Price Sentiment
0,28-01-2016,http://www.marketwatch.com/story/april-gold-do...,"april gold down 20 cents to settle at $1,116.1...",0,0,1,0,1,0,negative
1,13-09-2017,http://www.marketwatch.com/story/gold-prices-s...,gold suffers third straight daily decline,0,0,1,0,1,0,negative
2,26-07-2016,http://www.marketwatch.com/story/gold-futures-...,Gold futures edge up after two-session decline,1,0,0,0,1,0,positive
3,28-02-2018,https://www.metalsdaily.com/link/277199/dent-r...,dent research : is gold's day in the sun comin...,0,0,0,0,0,1,none
4,06-09-2017,http://www.marketwatch.com/story/gold-steadies...,"Gold snaps three-day rally as Trump, lawmakers...",0,0,1,0,1,0,negative
...,...,...,...,...,...,...,...,...,...,...
11407,07-01-2013,https://www.moneycontrol.com/news/business/mar...,gold seen falling from 3-week high this week,0,0,1,0,1,0,negative
11408,27-09-2018,https://www.metalsdaily.com/link/284468/domini...,dominic frisby : now looks like a good time to...,0,0,0,0,0,1,none
11409,03-03-2017,https://www.thehindubusinessline.com/markets/g...,Gold heading for worst week since November on ...,0,0,1,0,1,0,negative
11410,11-06-2008,http://www.marketwatch.com/story/august-gold-u...,august gold up $7.60 at $878.80 an ounce on nymex,1,0,0,0,1,0,positive


In [3]:
# The Price Sentiment column contains 4 classes (positive, neutral, negative and none)
# Positive denotes upward movement in price
# Negative denotes downward movement in price
# Neutral denotes sideways (steady) movement in price
# None denotes that no assessment about prices can be made from the news headline

In [10]:
# Keep only the rows whose 'Price Sentiment' label is something other than 'none'
has_price_signal = df["Price Sentiment"].ne("none")
df = df.loc[has_price_signal]
df

Unnamed: 0,Dates,URL,News,Price Direction Up,Price Direction Constant,Price Direction Down,Asset Comparision,Past Information,Future Information,Price Sentiment
0,28-01-2016,http://www.marketwatch.com/story/april-gold-do...,"april gold down 20 cents to settle at $1,116.1...",0,0,1,0,1,0,negative
1,13-09-2017,http://www.marketwatch.com/story/gold-prices-s...,gold suffers third straight daily decline,0,0,1,0,1,0,negative
2,26-07-2016,http://www.marketwatch.com/story/gold-futures-...,Gold futures edge up after two-session decline,1,0,0,0,1,0,positive
4,06-09-2017,http://www.marketwatch.com/story/gold-steadies...,"Gold snaps three-day rally as Trump, lawmakers...",0,0,1,0,1,0,negative
5,16-08-2016,http://www.marketwatch.com/story/dec-gold-clim...,"Dec. gold climbs $9.40, or 0.7%, to settle at ...",1,0,0,0,1,0,positive
...,...,...,...,...,...,...,...,...,...,...
11406,23-12-2016,http://www.marketwatch.com/story/february-gold...,"February gold up $3.70, or 0.3%, at $1,134.40/oz.",1,0,0,0,1,0,positive
11407,07-01-2013,https://www.moneycontrol.com/news/business/mar...,gold seen falling from 3-week high this week,0,0,1,0,1,0,negative
11409,03-03-2017,https://www.thehindubusinessline.com/markets/g...,Gold heading for worst week since November on ...,0,0,1,0,1,0,negative
11410,11-06-2008,http://www.marketwatch.com/story/august-gold-u...,august gold up $7.60 at $878.80 an ounce on nymex,1,0,0,0,1,0,positive


In [11]:
# headlines cleaner
def cleaner(impure_data):
    """Strip @mentions, http(s) links and ASCII punctuation from each text.

    Parameters
    ----------
    impure_data : iterable of str
        Raw texts to clean (for example a list or a pandas Series of headlines).

    Returns
    -------
    list of str
        One cleaned string per input item, in input order. Whitespace is not
        normalised, so removed tokens can leave leading or doubled spaces.
    """
    # Raw strings avoid the invalid "\S" / "\]" escape sequences, and the
    # patterns are built once instead of once per item.
    mention_re = re.compile(r'@\S+')
    link_re = re.compile(r'http\S+\s*')
    # Same characters as string.punctuation; non-ASCII symbols such as emoji are kept.
    punct_re = re.compile('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""))

    temp_list = []
    for item in impure_data:
        # Order matters: mentions and links must go before their '@', ':' and '/'
        # characters are stripped by the punctuation pass.
        item = mention_re.sub('', item)
        item = link_re.sub('', item)
        item = punct_re.sub('', item)
        temp_list.append(item)
    return temp_list

In [12]:
# a simple SVM model with tfidf vectorizer
def headline_sentiment(df, test_size=0.3, random_state=42):
    """Train and evaluate a linear-kernel SVM on TF-IDF features of headlines.

    The headlines in ``df["News"]`` are passed through ``cleaner``, split into
    train and test sets, vectorised with TF-IDF (fitted on the training part
    only) and used to fit an ``SVC(kernel='linear')`` against
    ``df["Price Sentiment"]``. Test accuracy, a confusion matrix and the test
    predictions are shown, and the predictions are written to ``predicted.csv``
    in the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "News" (text) and "Price Sentiment" (label).
    test_size : float, default 0.3
        Share of the rows held out for testing.
    random_state : int or None, default 42
        Seed for the train/test split so that the reported metrics and the
        saved predictions are reproducible. Pass None for a different random
        split on every call.

    Returns
    -------
    tuple
        ``(fitted TfidfVectorizer, fitted SVC)``.
    """
    headlines = df["News"]
    polarity = df["Price Sentiment"].tolist()

    # cleaning headlines i.e. removing @mentions, http(s) links and punctuation
    clean_headline = cleaner(headlines)

    # initializing tf-idf vectorizer
    tf_idfvectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)

    # seeded random split so re-running the notebook gives the same numbers
    X_train, X_test, Y_train, Y_test = train_test_split(
        clean_headline, polarity, test_size=test_size, random_state=random_state
    )

    # the vocabulary and idf weights are learnt from the training texts only
    train_corpus_tf_idf = tf_idfvectorizer.fit_transform(X_train)
    test_corpus_tf_idf = tf_idfvectorizer.transform(X_test)

    # classifier with linear kernel and otherwise default parameters
    SVM_L = SVC(kernel='linear')
    SVM_L.fit(train_corpus_tf_idf, Y_train)

    # predicting the sentiments for the test dataset
    Y_pred = SVM_L.predict(test_corpus_tf_idf)

    print("Testing Accuracy:", accuracy_score(Y_test, Y_pred))

    # Use the classes the model was trained on, so a class that is predicted but
    # happens to be missing from Y_test still gets its row and column.
    labels = SVM_L.classes_
    m = confusion_matrix(Y_test, Y_pred, labels=labels)
    print("\nConfusion matrix on test data")
    cm = pd.DataFrame(m, index=labels, columns=labels)
    cm.index = "Actual: " + cm.index
    cm.columns = "Predicted: " + cm.columns
    display(cm)

    # saving the test predictions into a csv file in the current folder
    temp_df = pd.DataFrame({
        "News": X_test,
        "Actual Price Sentiment": Y_test,
        "Predicted Sentiment": Y_pred,
    })
    temp_df.to_csv("predicted.csv")

    print('Predictions on Test Data are as follows:')
    display(temp_df)

    return (tf_idfvectorizer, SVM_L)

In [13]:
# Fit the TF-IDF vectorizer and the SVM on the filtered headlines; keep both for later predictions
vectorizer, model = headline_sentiment(df)

Testing Accuracy: 0.9226105563480742

Confusion matrix on test data


Unnamed: 0,Predicted: negative,Predicted: neutral,Predicted: positive
Actual: negative,1120,8,88
Actual: neutral,21,84,26
Actual: positive,66,8,1383


Predictions on Test Data are as follows:


Unnamed: 0,News,Actual Price Sentiment,Predicted Sentiment
0,Gold steady investors await cues from Fed,neutral,neutral
1,Gold Prices Slip for Two Consecutive Days on S...,negative,negative
2,gold ends up 08 platinum palladium rally,positive,positive
3,Gold drops on upbeat US data Fed language,negative,negative
4,gold futures at fresh twoweek high,positive,positive
...,...,...,...
2799,gold silver futures continue lower in afternoo...,negative,negative
2800,gains for gold futures intensify as us dollar ...,positive,negative
2801,august gold trades at 90070oz up 920 or 1 in ny,positive,positive
2802,april gold climbs 4 to 65550oz in morning trading,positive,positive


In [14]:
#Trying sample headlines
# Run the text through cleaner() first so it is preprocessed exactly like the
# headlines the vectorizer was fitted on.
vector = vectorizer.transform(cleaner(["Gold expected to beat expectations."]))
sentiment = model.predict(vector)
print(sentiment)

['positive']


In [15]:
# Load the scraped articles JSON from the working directory
articles_path = "scraped_articles.json"
data = pd.read_json(articles_path)
data

Unnamed: 0,newspapers
Google_news_Chile,{'rss': 'https://news.google.com/rss/search?q=...
Google_news_Norway,{'rss': 'https://news.google.com/rss/search?q=...
Google_news_Peru,{'rss': 'https://news.google.com/rss/search?q=...


In [16]:
# One DataFrame per scraped source, stacked into a single frame with a fresh 0..n-1 index
source_names = ("Google_news_Chile", "Google_news_Norway", "Google_news_Peru")
articles = [
    pd.DataFrame(data["newspapers"][source]["articles"])
    for source in source_names
]
news = pd.concat(articles, ignore_index=True)
news

Unnamed: 0,link,published,title,text
0,https://www.digitaljournal.com/pr/fish-oil-mar...,2022-08-15T11:07:03,Fish Oil Market to Witness a Pronounce Growth ...,Fish Oil Market 2022 research provides accurat...
1,https://www.einnews.com/pr_news/583589445/fish...,2022-07-29T11:33:00,"Fish Oil Market Size, Share, Growth, Analysis,...",The global fish oil market is expected to exhi...
2,https://www.signatureluxurytravel.com.au/best-...,2022-08-15T09:37:30,Bon Appétit: The World’s 50 Best Restaurants a...,40. Schloss Schauenstein – Fürstenau\n\nSchlos...
3,https://www.washingtonpost.com/food/2022/07/25...,2022-07-25T08:00:00,Tom Sietsema’s 7 favorite places to eat right now,Placeholder while article actions load\n\nWith...
4,https://www.oregonlive.com/travel/2022/07/the-...,2022-07-22T08:00:00,The Gold Room serves fresh pizza from a beauti...,"Don’t look now, but The Gold Room is quickly b..."
5,https://www.bakersfield.com/entertainment/food...,2022-08-05T08:00:00,THE DISH: Pizza my heart: You must try these l...,"Thank you for reading!\n\nPlease log in, or si..."
6,https://www.perthnow.com.au/lifestyle/heres-wh...,2022-08-03T08:00:00,Here’s where to taste Akoya this winter,You’ve seen the Leeuwin Coast Akoya feature on...
7,https://www.salon.com/2022/07/17/a-cheesy-omel...,2022-07-17T08:00:00,A cheesy omelet duvet with gochujang fried ric...,This recipe is brought to you from the mystic ...
8,https://www.seafoodsource.com/news/supply-trad...,2022-07-27T20:14:07,Peru closes north-central region anchovy seaso...,Peru closes north-central region anchovy seaso...
9,https://www.naturalhistorymag.com/features/013...,2022-08-01T17:38:07,Natural History Magazine,Features: July-August 2022\n\nSalmon The way t...


In [17]:
# Full text of the title in the row labelled 0
news.loc[0, 'title']

'Fish Oil Market to Witness a Pronounce Growth during 2022-2028 by Top Key Players BASF SE, Camanchaca SA, China Fishery Group Limited, Copeinca ASA'

In [18]:
#Trying on our data
# Clean the titles with the same cleaner() that was applied to the training
# headlines so the tokens match the fitted TF-IDF vocabulary, then vectorise and
# predict all titles in one batch instead of one call per title.
# NOTE(review): the model was fitted with the 'none' rows removed, so every title
# is forced into one of the remaining classes even when it says nothing about
# prices -- confirm this is acceptable for these scraped articles.
titles = news['title'].tolist()
predictions = model.predict(vectorizer.transform(cleaner(titles)))
sentiment_list = predictions.tolist()
for item, sentiment in zip(titles, sentiment_list):
    # the one-element list keeps the printed form "['positive']" unchanged
    print(item, [sentiment])

Fish Oil Market to Witness a Pronounce Growth during 2022-2028 by Top Key Players BASF SE, Camanchaca SA, China Fishery Group Limited, Copeinca ASA ['positive']
Fish Oil Market Size, Share, Growth, Analysis, Price, Trends and Forecast 2022-2027 ['positive']
Bon Appétit: The World’s 50 Best Restaurants announced ['positive']
Tom Sietsema’s 7 favorite places to eat right now ['negative']
The Gold Room serves fresh pizza from a beautiful space in remote northeast Oregon ['positive']
THE DISH: Pizza my heart: You must try these local pies ['positive']
Here’s where to taste Akoya this winter ['positive']
A cheesy omelet duvet with gochujang fried rice that oozes warmth and comfort ['negative']
Peru closes north-central region anchovy season at just 84 percent of TAC ['positive']
Natural History Magazine ['positive']
Rabobank: Fishmeal market finely balanced amid reduced Chinese demand ['negative']
Salmon Farming’s Dirty Business • The Revelator ['positive']
3 Reasons to Avoid Farmed Salmon 

In [20]:
# Display the predicted label for every scraped title, in the order of news['title']
sentiment_list

['positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative']

In [21]:
# Attach the predicted label of each title as a new column, then show the frame
news["Price Sentiment"] = sentiment_list
news

Unnamed: 0,link,published,title,text,Price Sentiment
0,https://www.digitaljournal.com/pr/fish-oil-mar...,2022-08-15T11:07:03,Fish Oil Market to Witness a Pronounce Growth ...,Fish Oil Market 2022 research provides accurat...,positive
1,https://www.einnews.com/pr_news/583589445/fish...,2022-07-29T11:33:00,"Fish Oil Market Size, Share, Growth, Analysis,...",The global fish oil market is expected to exhi...,positive
2,https://www.signatureluxurytravel.com.au/best-...,2022-08-15T09:37:30,Bon Appétit: The World’s 50 Best Restaurants a...,40. Schloss Schauenstein – Fürstenau\n\nSchlos...,positive
3,https://www.washingtonpost.com/food/2022/07/25...,2022-07-25T08:00:00,Tom Sietsema’s 7 favorite places to eat right now,Placeholder while article actions load\n\nWith...,negative
4,https://www.oregonlive.com/travel/2022/07/the-...,2022-07-22T08:00:00,The Gold Room serves fresh pizza from a beauti...,"Don’t look now, but The Gold Room is quickly b...",positive
5,https://www.bakersfield.com/entertainment/food...,2022-08-05T08:00:00,THE DISH: Pizza my heart: You must try these l...,"Thank you for reading!\n\nPlease log in, or si...",positive
6,https://www.perthnow.com.au/lifestyle/heres-wh...,2022-08-03T08:00:00,Here’s where to taste Akoya this winter,You’ve seen the Leeuwin Coast Akoya feature on...,positive
7,https://www.salon.com/2022/07/17/a-cheesy-omel...,2022-07-17T08:00:00,A cheesy omelet duvet with gochujang fried ric...,This recipe is brought to you from the mystic ...,negative
8,https://www.seafoodsource.com/news/supply-trad...,2022-07-27T20:14:07,Peru closes north-central region anchovy seaso...,Peru closes north-central region anchovy seaso...,positive
9,https://www.naturalhistorymag.com/features/013...,2022-08-01T17:38:07,Natural History Magazine,Features: July-August 2022\n\nSalmon The way t...,positive
