In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.svm import SVC
import re

In [3]:
# Load the gold-news headline dataset into a DataFrame.
# The path is absolute; adjust it if the CSV lives elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/gold-dataset-sinha-khandait.csv")

In [5]:
# Keep only the rows whose "Price Sentiment" value is not the string 'none'.
df = df.loc[df["Price Sentiment"].ne('none')]

In [6]:
# Show each headline next to its price-sentiment label.
print("Commodity News Headlines")
display(df.loc[:, ["News", "Price Sentiment"]])

Commodity News Headlines


Unnamed: 0,News,Price Sentiment
0,"april gold down 20 cents to settle at $1,116.1...",negative
1,gold suffers third straight daily decline,negative
2,Gold futures edge up after two-session decline,positive
4,"Gold snaps three-day rally as Trump, lawmakers...",negative
5,"Dec. gold climbs $9.40, or 0.7%, to settle at ...",positive
...,...,...
11406,"February gold up $3.70, or 0.3%, at $1,134.40/oz.",positive
11407,gold seen falling from 3-week high this week,negative
11409,Gold heading for worst week since November on ...,negative
11410,august gold up $7.60 at $878.80 an ounce on nymex,positive


In [7]:
def cleaner(impure_data):
    """Strip @mentions, http(s) links and ASCII punctuation from each text.

    Parameters
    ----------
    impure_data : iterable of str
        Raw texts, e.g. news headlines.

    Returns
    -------
    list of str
        One cleaned string per input item, in the input order.

    Notes
    -----
    Matched spans are deleted, not replaced by a space, so neighbouring
    characters fuse: "3-week" becomes "3week" and "$1,116.10" becomes
    "111610". Whitespace is otherwise left as it is.
    """
    # Raw string literals are required here: "\S" and "\]" are invalid escape
    # sequences in a normal string literal and raise SyntaxWarning on
    # Python 3.12+. The patterns are compiled once, outside the loop.

    # an "@" followed by the rest of the non-whitespace run (mentions)
    mention_re = re.compile(r'@\S+')

    # anything starting with "http" up to the next whitespace, plus the
    # whitespace that follows it
    link_re = re.compile(r'http\S+\s*')

    # ASCII special characters, but not "emoji"
    punct_re = re.compile(
        '[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""")
    )

    temp_list = []
    for item in impure_data:
        # Order matters: mentions and links are removed while their
        # punctuation ("@", "://", ".") is still present to be matched.
        item = mention_re.sub('', item)
        item = link_re.sub('', item)
        item = punct_re.sub('', item)
        temp_list.append(item)
    return temp_list

In [8]:
def headline_sentiment(df, test_size=0.3, random_state=42, output_path="predicted.csv"):
    """Train and evaluate a linear-kernel SVM on TF-IDF features of headlines.

    The headlines in ``df["News"]`` are cleaned with ``cleaner``, split into
    train and test sets, vectorised with TF-IDF (fitted on the training part
    only) and used to fit an ``SVC(kernel='linear')`` against
    ``df["Price Sentiment"]``.

    Side effects: prints the test accuracy, displays the confusion matrix and
    the per-headline predictions, and writes those predictions to
    ``output_path``. ``display`` is expected to be available, as it is in an
    IPython/Jupyter session.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "News" and "Price Sentiment".
    test_size : float, default 0.3
        Passed to ``train_test_split``; 0.3 gives a 70 : 30 split.
    random_state : int or None, default 42
        Seed passed to ``train_test_split`` so that the split, the reported
        metrics and the written CSV are reproducible. Pass ``None`` for a
        different random split on every call.
    output_path : str, default "predicted.csv"
        Where the test-set predictions are written as CSV.

    Returns
    -------
    tuple
        ``(fitted TfidfVectorizer, fitted SVC)``.
    """
    headlines = df["News"]
    polarity = df["Price Sentiment"].tolist()

    # cleaning headlines i.e. removing @mentions, http(s) links and special
    # characters such as punctuation
    clean_headline = cleaner(headlines)

    # initializing tf-idf vectorizer
    tf_idfvectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)

    # splitting the data into train and test datasets; seeded so that repeated
    # runs give the same split
    X_train, X_test, Y_train, Y_test = train_test_split(
        clean_headline, polarity, test_size=test_size, random_state=random_state
    )

    # the vocabulary and idf weights are learnt from the training split only,
    # then applied unchanged to the test split
    train_corpus_tf_idf = tf_idfvectorizer.fit_transform(X_train)
    test_corpus_tf_idf = tf_idfvectorizer.transform(X_test)

    # using SVC package to initialize a classifier with Linear kernel and
    # other default parameters
    SVM_L = SVC(kernel='linear')

    # fitting the sparse matrix in the classifier with their respective sentiments
    SVM_L.fit(train_corpus_tf_idf, Y_train)

    # predicting the sentiments for the test dataset
    Y_pred = SVM_L.predict(test_corpus_tf_idf)

    # this prints accuracy score for the test dataset
    print("Testing Accuracy:", accuracy_score(Y_test, Y_pred))

    # this prints confusion matrix for the test dataset; labels are the union
    # of actual and predicted classes so that a class which is predicted but
    # absent from the test split is still counted
    labels = np.union1d(Y_test, Y_pred)
    m = confusion_matrix(Y_test, Y_pred, labels=labels)
    print("\nConfusion matrix on test data")
    cm = pd.DataFrame(m, index=labels, columns=labels)
    cm.index = "Actual: " + cm.index
    cm.columns = "Predicted: " + cm.columns
    display(cm)

    # saving the test-set predictions into a csv file
    temp_df = pd.DataFrame(
        {
            "News": X_test,
            "Actual Price Sentiment": Y_test,
            "Predicted Sentiment": Y_pred,
        }
    )
    temp_df.to_csv(output_path)

    print('Predictions on Test Data are as follows:')
    display(temp_df)

    return tf_idfvectorizer, SVM_L

In [9]:
# Train on the filtered dataset; keep the fitted vectorizer and classifier
# for the ad-hoc predictions in the cells below.
vectorizer, model = headline_sentiment(df=df)


Testing Accuracy: 0.927960057061341

Confusion matrix on test data


Unnamed: 0,Predicted: negative,Predicted: neutral,Predicted: positive
Actual: negative,1193,9,84
Actual: neutral,16,89,27
Actual: positive,62,4,1320


Predictions on Test Data are as follows:


Unnamed: 0,News,Actual Price Sentiment,Predicted Sentiment
0,Gold prices settle at a nearly 3week high,positive,positive
1,gold futures rise above 1600 an ounce,positive,positive
2,june gold settles 11 higher at 123620 an ounce,positive,positive
3,gold reverses losses on equity econ concerns stay,negative,negative
4,gold silver turn higher as ukraine tension boo...,positive,positive
...,...,...,...
2799,Gold prices to trade positive Angel Commodities,positive,positive
2800,Gold futures tip lower as the US dollar streng...,negative,negative
2801,gold settles slightly higher after touching mu...,positive,positive
2802,Dec gold settles at 125660oz up 110 or 01,positive,positive


In [10]:
# Run the headline through the same cleaner() that was applied to the training
# text, so inference preprocessing matches what the vectorizer was fitted on.
vector = vectorizer.transform(cleaner(["Gold expected to beat expectations."]))
sentiment = model.predict(vector)
print(sentiment)

['positive']


In [11]:
# Run the headline through the same cleaner() that was applied to the training
# text, so inference preprocessing matches what the vectorizer was fitted on.
vector = vectorizer.transform(cleaner(["The price of gold continues declining."]))
sentiment = model.predict(vector)
print(sentiment)

['negative']


In [12]:
# Run the headline through the same cleaner() that was applied to the training
# text, so inference preprocessing matches what the vectorizer was fitted on.
vector = vectorizer.transform(cleaner(["Gold price continues to improve."]))
sentiment = model.predict(vector)
print(sentiment)

['positive']


In [13]:
# Run the headline through the same cleaner() that was applied to the training
# text, so inference preprocessing matches what the vectorizer was fitted on.
vector = vectorizer.transform(cleaner(["Gold price expected to remain steady."]))
sentiment = model.predict(vector)
print(sentiment)


['neutral']
