## Import Data

In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mandarin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Mandarin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
df = pd.read_csv("News Sentiment Analysis for Stock Data by Company.csv", encoding="latin-1")
df

Unnamed: 0,Label,Ticker,Headline
0,0,A,@TotesTravel : Airline shares tumble as New Yo...
1,1,A,@TotesTravel : American United call off Hong K...
2,0,A,@TotesTravel : U.S. airline stocks hit highest...
3,1,A,@TotesTravel : American Airlines reaches deal ...
4,1,A,@TotesTravel : US airlines Treasury Department...
...,...,...,...
15557,0,WMT,Walmart dumps e-cigarettes: Largest store in U...
15558,0,WMT,Walmart makes a $16 billion bet on India's boo...
15559,0,WMT,Walmart raises minimum age to buy tobacco to 2...
15560,0,WMT,Walmart Took Over Chile In Only Three Years An...


### SVM (Support Vector Machine)

#### SVM Models Preparation

In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score

In [4]:
def clean_message(message):
    '''
    Input:
        message: a string containing a message.
    Output:
        messages_cleaned: a list of words containing the processed message.
    '''
    #remove http(s)://+string, replace with nothing
    remove_link = re.sub(r'https?://\S+', '', message)
    #remove any non-word and non-space character
    remove_punct = re.sub(r'[^\w\s]','',remove_link)
    lower_message = remove_punct.lower()

    #tokenize words, then append the non-stopwords to messages_cleaned
    tokenized = word_tokenize(lower_message)
    remove_stopwds = [word for word in tokenized if not word in stopwords.words('english')]

    #Porter stemming
    stemmer = PorterStemmer()
    messages_cleaned = [stemmer.stem(word) for word in remove_stopwds]
    return messages_cleaned

In [5]:
# Use a list to store separate words of cleaned headlines
cleaned_headlines = []

# Add cleaned headlines to data frame as strings.
for i in range(len(df)):
    tokens = clean_message(df.loc[i, "Headline"])
    df.loc[i, 'Cleaned_Headline'] = " ".join(tokens)
    cleaned_headlines.append(tokens)

In [6]:
df

Unnamed: 0,Label,Ticker,Headline,Cleaned_Headline
0,0,A,@TotesTravel : Airline shares tumble as New Yo...,totestravel airlin share tumbl new york impos ...
1,1,A,@TotesTravel : American United call off Hong K...,totestravel american unit call hong kong fligh...
2,0,A,@TotesTravel : U.S. airline stocks hit highest...,totestravel us airlin stock hit highest price ...
3,1,A,@TotesTravel : American Airlines reaches deal ...,totestravel american airlin reach deal boe 737...
4,1,A,@TotesTravel : US airlines Treasury Department...,totestravel us airlin treasuri depart reach ag...
...,...,...,...,...
15557,0,WMT,Walmart dumps e-cigarettes: Largest store in U...,walmart dump ecigarett largest store us longer...
15558,0,WMT,Walmart makes a $16 billion bet on India's boo...,walmart make 16 billion bet india boom economi
15559,0,WMT,Walmart raises minimum age to buy tobacco to 2...,walmart rais minimum age buy tobacco 21 pressu...
15560,0,WMT,Walmart Took Over Chile In Only Three Years An...,walmart took chile three year countri terrifi ...


#### SVM Implementation

In [7]:
np.random.seed(300)

In [8]:
#Split the cleaned data into train and test sets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['Cleaned_Headline'], df['Label'], test_size=0.3)

In [20]:
type(Train_X)

pandas.core.series.Series

In [14]:
#Vectorize the data using TF-IDF
data_vectorizer = TfidfVectorizer(max_features=5000)
data_vectorizer.fit(df['Cleaned_Headline'])

TrainX_TFIDF = data_vectorizer.transform(Train_X)
TestX_TFIDF = data_vectorizer.transform(Test_X)

In [15]:
print(data_vectorizer.vocabulary_)

{'totestravel': 4534, 'airlin': 323, 'share': 3995, 'tumbl': 4619, 'new': 3057, 'york': 4976, 'impos': 2326, 'quarantin': 3577, 'florida': 1859, 'case': 876, 'spike': 4187, 'american': 375, 'group': 2067, 'inc': 2332, 'plane': 3383, 'prepar': 3468, 'land': 2614, 'airport': 325, 'us': 4720, 'tuesday': 4618, 'april': 443, '18': 40, 'travelnewsinsight': 4576, 'unit': 4689, 'call': 831, 'hong': 2231, 'kong': 2592, 'flight': 1855, 'crew': 1218, 'test': 4444, 'rule': 3839, 'boe': 704, 'co': 1032, 'aircraft': 322, 'sit': 4071, 'intern': 2425, 'china': 961, 'stock': 4259, 'hit': 2205, 'highest': 2186, 'price': 3481, 'sinc': 4061, 'june': 2536, 'travel': 4575, 'hope': 2235, 'airway': 328, 'corp': 1179, 'taxi': 4400, 'next': 3062, 'delta': 1334, 'air': 319, 'line': 2697, 'reach': 3627, 'deal': 1293, '737': 179, 'max': 2816, 'ground': 2064, '30': 114, 'million': 2896, 'employe': 1597, 'mainten': 2765, 'worker': 4933, 'cover': 1198, 'engin': 1613, 'outsid': 3212, 'treasuri': 4578, 'depart': 1345, 

In [16]:
print(TrainX_TFIDF)

  (0, 4738)	0.25926204086868837
  (0, 4025)	0.33923724864975724
  (0, 3894)	0.19651693995009809
  (0, 3882)	0.29428729043889934
  (0, 3506)	0.3039140292127123
  (0, 2633)	0.26926609543941643
  (0, 2168)	0.2664172886466029
  (0, 1326)	0.3741324211252504
  (0, 1203)	0.2884140660681217
  (0, 1185)	0.2558511004962307
  (0, 1089)	0.40989474213327415
  (1, 4943)	0.26750121142090055
  (1, 4940)	0.21947731430520503
  (1, 4386)	0.2507711991188963
  (1, 4065)	0.23537814494651998
  (1, 3721)	0.21947731430520503
  (1, 3542)	0.24094249576105112
  (1, 3314)	0.15433809542924715
  (1, 2947)	0.18478125900488993
  (1, 2649)	0.2030002061648897
  (1, 2573)	0.16753137981621513
  (1, 2086)	0.29020242357305326
  (1, 1536)	0.22421248345904685
  (1, 1138)	0.24217852645735768
  (1, 1081)	0.13475407089250022
  :	:
  (10892, 3645)	0.32452188717094466
  (10892, 3624)	0.15011262620139404
  (10892, 3449)	0.3625317943626292
  (10892, 3381)	0.18900871022435203
  (10892, 3164)	0.11734708000689091
  (10892, 3141)	0.1835

In [23]:
#Fit the training dataset on the classifier, using linear kernel
svm_clf = svm.SVC(kernel='linear', degree=3)
svm_clf.fit(TrainX_TFIDF, Train_Y)

svm_predict = svm_clf.predict(TestX_TFIDF)

#Print the accuracy score
svm_acc = accuracy_score(svm_predict, Test_Y)
print(svm_acc)

0.6650246305418719


In [24]:
#Use Radial Basis Function kernel
rbf_clf = svm.SVC(kernel='rbf', gamma=2)
rbf_clf.fit(TrainX_TFIDF, Train_Y)

rbf_predict = rbf_clf.predict(TestX_TFIDF)

#Print accuracy
rbf_acc = accuracy_score(rbf_predict, Test_Y)
print(rbf_acc)

0.719640179910045


In [28]:
test_sentence = ["research unveil artifici leg capabl simul feel real limb abc news australian broadcast corpor"]
test_lab = 0
test_vec = data_vectorizer.transform(test_sentence)
#print(test_vec)

test_predict = rbf_clf.predict(test_vec)
print(test_predict)

[0]


#### SVM Results
After tuning the hyperparameters, the RBF kernel function gives a better prediction result of 71.96% accuracy.