# Sentiment Classifier

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import time
from sklearn import svm
from sklearn.metrics import classification_report
import pickle

In [3]:
# Load the cleaned financial phrase bank; the path is relative to this notebook's folder.
df_fin_phrase = pd.read_csv("../../Data/Prepared/CleanDatasets/fin_phrase_bank_clean.csv")

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df_fin_phrase.head(5)

Unnamed: 0,sentence,label,clean_text,sentiment
0,"According to Gran , the company has no plans t...",1,according gran company plan move production ru...,Neutral
1,Technopolis plans to develop in stages an area...,1,technopolis plan develop stage area le square ...,Neutral
2,The international electronic industry company ...,0,international electronic industry company elco...,Negative
3,With the new production plant the company woul...,2,new production plant company would increase ca...,Positive
4,According to the company 's updated strategy f...,2,according company updated strategy year baswar...,Positive


In [5]:
# Number of rows per sentiment class (shows how balanced the classes are).
df_fin_phrase['sentiment'].value_counts()

sentiment
Neutral     2879
Positive    1363
Negative     604
Name: count, dtype: int64

In [12]:
# Number of missing values in each column.
df_fin_phrase.isnull().sum()

sentence      0
label         0
clean_text    1
sentiment     0
dtype: int64

In [13]:
# Discard every row that has a missing value in any column.
df_fin_phrase = df_fin_phrase.dropna()

## Training with Simple Random Sample

In [14]:
# Shuffle all rows (frac=1 keeps every row). The fixed seed makes the shuffle, and
# therefore the train/validation split and the metrics derived from it, reproducible
# after a kernel restart.
df_randomised = df_fin_phrase.sample(frac=1, random_state=42)

In [15]:
# Fraction of the shuffled rows used for training; the remaining rows become the validation set.
train_rate = 0.90

In [16]:
# Rows before the cut-off position of the shuffled frame form the training set.
n_train_rows = int(len(df_randomised) * train_rate)
df_train = df_randomised.iloc[:n_train_rows]
df_train

Unnamed: 0,sentence,label,clean_text,sentiment
2445,"Empresa de Desarrollo Urbano de Quito , INNOVA...",1,empresa de desarrollo urbano de quito innovar ...,Neutral
778,"Ragutis , which is based in Lithuania 's secon...",2,ragutis based lithuania second largest city ka...,Positive
868,Nokia controls more than 50 percent of phone s...,2,nokia control percent phone sale india africa ...,Positive
1108,The service also enables users to watch e-mail...,1,service also enables user watch e mail html fo...,Neutral
4167,"Alpina Sports is a Lebanon , New Hampshire USA...",1,alpina sport lebanon new hampshire usa based d...,Neutral
...,...,...,...,...
612,"OUTOTEC OYJ PRESS RELEASE DECEMBER 4 , 2009 10...",2,outotec oyj press release december outotec est...,Positive
4793,Finnish Suominen Corporation that makes wipes ...,0,finnish suominen corporation make wipe nonwove...,Negative
2013,"Fiskars , a 360-year-old global business best ...",2,fiskars year old global business best known or...,Positive
1606,The Annual General Meeting approved a dividend...,1,annual general meeting approved dividend eur p...,Neutral


In [17]:
# Rows from the cut-off position onwards form the validation set.
val_start_row = int(len(df_randomised) * train_rate)
df_val = df_randomised.iloc[val_start_row:]
df_val

Unnamed: 0,sentence,label,clean_text,sentiment
481,Calls to the switchboard and directory service...,1,call switchboard directory service decreased s...,Neutral
2263,`` Several growth initiatives in the chosen ge...,2,several growth initiative chosen geographic ar...,Positive
2491,It moved into the No. 2 spot in 2000 when it m...,1,moved spot merged steel operation avesta sheff...,Neutral
1792,We are very pleased with the fine co-operation...,2,pleased fine co operation two country recent t...,Positive
3382,The train is expected to cross the Russian ter...,1,train expected cross russian territory day rea...,Neutral
...,...,...,...,...
4832,Comparable operating profit totaled EUR 4.7 mn...,0,comparable operating profit totaled eur mn eur...,Negative
181,Diluted earnings per share ( EPS ) rose to EUR...,2,diluted earnings per share eps rose eur versus...,Positive
4721,The Group 's order portfolio decreased from EU...,0,group order portfolio decreased eur mn eur mn,Negative
2757,Swedish engineering consultant firm Etteplan i...,1,swedish engineering consultant firm etteplan e...,Neutral


In [18]:
# Create the TF-IDF vectorizer (default settings); it is fitted on training text in the next cell.
vectorizer = TfidfVectorizer()

In [19]:
# Fit the vectorizer on the training text only, then reuse the fitted vectorizer on the
# validation text so both matrices share the same feature columns.
# `test_vectors` holds the features of the validation rows (df_val).
train_vectors = vectorizer.fit_transform(df_train['clean_text'])
test_vectors = vectorizer.transform(df_val['clean_text'])

In [20]:
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, df_train['sentiment'].values)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
# Elapsed seconds for fitting and for predicting on the validation features.
time_linear_train = t1 - t0
time_linear_predict = t2 - t1

In [21]:
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# Pass the true labels as a 1-D array (single-bracket column selection) rather than an
# (n, 1) column vector, matching the 1-D shape of the predictions.
# output_dict=True returns the per-class metrics as a nested dict keyed by label.
report = classification_report(
    df_val['sentiment'].values,
    prediction_linear,
    output_dict=True,
)

Training time: 1.149691s; Prediction time: 0.095421s


In [22]:
# Print the metrics dict of each sentiment class, one class per line.
for caption, class_key in (
    ('neutral: ', 'Neutral'),
    ('negative: ', 'Negative'),
    ('positive: ', 'Positive'),
):
    print(caption, report[class_key])

neutral:  {'precision': 0.7692307692307693, 'recall': 0.9154929577464789, 'f1-score': 0.8360128617363345, 'support': 284}
negative:  {'precision': 0.85, 'recall': 0.5396825396825397, 'f1-score': 0.6601941747572816, 'support': 63}
positive:  {'precision': 0.7289719626168224, 'recall': 0.5652173913043478, 'f1-score': 0.636734693877551, 'support': 138}


In [23]:
# Hand-written example sentences used as a quick sanity check of the trained model.
sentences = [
    'growth is strong and we have plenty of liquidity.',
    'there is a shortage of capital, and we need extra financing.',
    'formulation patents might protect Vasotec to a limited extent.',
]
for sen in sentences:
    # The vectorizer expects a sequence of documents, hence the one-element list.
    print(classifier_linear.predict(vectorizer.transform([sen])))

['Positive']
['Neutral']
['Neutral']


## Training with Stratified Sample

In [24]:
# Hold out 10% of the rows for validation, stratified on `sentiment` so both parts keep
# the class proportions of the full frame. The label column is passed as a 1-D Series,
# and the fixed seed makes the split reproducible after a kernel restart.
df_stratified_train, df_stratified_val = train_test_split(
    df_fin_phrase,
    test_size=0.10,
    stratify=df_fin_phrase['sentiment'],
    random_state=42,
)

In [25]:
# Display the stratified training partition.
df_stratified_train

Unnamed: 0,sentence,label,clean_text,sentiment
1943,Business boomed after Ostrom helped plant a sm...,2,business boomed ostrom helped plant small stor...,Positive
3095,Proline Plus is available in both adjustable s...,1,proline plus available adjustable single multi...,Neutral
3121,Simultaneously with this merger plan another m...,1,simultaneously merger plan another merger plan...,Neutral
786,The estimated synergy benefits are at least EU...,2,estimated synergy benefit least eur annually,Positive
206,"However , net sales in 2010 are seen to have g...",2,however net sale seen grown eur eur,Positive
...,...,...,...,...
326,Earnings per share EPS rose to EUR 0.11 from E...,2,earnings per share eps rose eur eur,Positive
3844,The expanded company will continue to be calle...,1,expanded company continue called newpage,Neutral
4413,Okmetic expects its net sales for the first ha...,0,okmetic expects net sale first half le,Negative
1278,Linde acts responsibly towards its shareholder...,1,linde act responsibly towards shareholder busi...,Neutral


In [26]:
# Display the stratified validation partition.
df_stratified_val

Unnamed: 0,sentence,label,clean_text,sentiment
347,Net sales surged by 30 % to EUR 36 million .,2,net sale surged eur million,Positive
2503,Latvia 's Stockmann shopping mall is a subsidi...,1,latvia stockmann shopping mall subsidiary finl...,Neutral
4039,"Operating loss totalled EUR 5.2 mn , compared ...",0,operating loss totalled eur mn compared loss e...,Negative
4495,"`` After a long , unprofitable period the Food...",2,long unprofitable period food division posted ...,Positive
31,The company 's net profit rose 11.4 % on the y...,2,company net profit rose year million euro sale...,Positive
...,...,...,...,...
2686,The wireless industry is bracing itself for th...,1,wireless industry bracing iphone launch june,Neutral
3181,The companies aim to close the deal before the...,1,company aim close deal end year,Neutral
1138,A quick `` one-stop-shop '' to understand the ...,1,quick one stop shop understand company,Neutral
1099,The issuer is solely responsible for the conte...,1,issuer solely responsible content announcement,Neutral


In [27]:
# Refit the same `vectorizer` object on the stratified training text (this replaces what
# it learned from the random-sample split), then transform the stratified validation text.
# `train_vectors` / `test_vectors` are overwritten with the stratified-split features.
train_vectors = vectorizer.fit_transform(df_stratified_train['clean_text'])
test_vectors = vectorizer.transform(df_stratified_val['clean_text'])

In [28]:
# Perform classification with SVM, kernel=linear
# A fresh classifier is created here, so `classifier_linear` now refers to the model
# trained on the stratified split.
classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, df_stratified_train['sentiment'].values)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
# Elapsed seconds for fitting and for predicting on the validation features.
time_linear_train = t1 - t0
time_linear_predict = t2 - t1

In [29]:
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# Pass the true labels as a 1-D array (single-bracket column selection) rather than an
# (n, 1) column vector, matching the 1-D shape of the predictions.
# output_dict=True returns the per-class metrics as a nested dict keyed by label.
report = classification_report(
    df_stratified_val['sentiment'].values,
    prediction_linear,
    output_dict=True,
)

Training time: 1.163846s; Prediction time: 0.088196s


In [30]:
# Print the metrics dict of each sentiment class, one class per line.
for caption, class_key in (
    ('neutral: ', 'Neutral'),
    ('negative: ', 'Negative'),
    ('positive: ', 'Positive'),
):
    print(caption, report[class_key])

neutral:  {'precision': 0.7450980392156863, 'recall': 0.9236111111111112, 'f1-score': 0.8248062015503876, 'support': 288}
negative:  {'precision': 0.7142857142857143, 'recall': 0.4918032786885246, 'f1-score': 0.5825242718446602, 'support': 61}
positive:  {'precision': 0.7441860465116279, 'recall': 0.47058823529411764, 'f1-score': 0.5765765765765766, 'support': 136}


In [33]:
def inference(sen):
    """Predict the sentiment label of a single piece of text.

    Args:
        sen: One text string; it is vectorized with the notebook-level `vectorizer`.

    Returns:
        The label predicted by the notebook-level `classifier_linear` for that text.
    """
    sentences_vector = vectorizer.transform([sen])  # the vectorizer expects a sequence of documents
    # Exactly one document goes in, so exactly one prediction comes out. Take it by index:
    # reducing the array with a product only happens to work for a one-element object
    # array and raises for fixed-width string arrays.
    return classifier_linear.predict(sentences_vector)[0]
    

In [34]:
# Hand-written example sentences used as a quick sanity check of `inference`.
sentences = [
    'growth is strong and we have plenty of liquidity.',
    'there is a shortage of capital, and we need extra financing.',
    'formulation patents might protect Vasotec to a limited extent.',
]
# Print one predicted label per example sentence.
for sen in sentences:
    predicted_label = inference(sen)
    print(predicted_label)

Positive
Neutral
Neutral


### Run inference on Twitter validation data

In [32]:
# Load the cleaned tweet validation set; the path is relative to this notebook's folder.
df_prediction = pd.read_csv('../../Data/Prepared/CleanDatasets/Tweet_valid_clean.csv')

In [35]:
# Predict a sentiment for every tweet's `clean_text` (one `inference` call per row) and
# store the result in `label`.
# NOTE(review): the preview in the next cell shows `label` ahead of `clean_text`, which
# suggests the column already exists in the CSV and is overwritten here — confirm this is intended.
df_prediction['label'] = df_prediction['clean_text'].apply(inference)

In [36]:
# Preview the first five tweets together with their predicted labels.
df_prediction.head(5)

Unnamed: 0,text,label,clean_text,topic
0,Analyst call of the day for @CNBCPro subscribe...,Neutral,analyst call day cnbcpro subscriber goldman sa...,Analyst Update
1,"Loop upgrades CSX to buy, says it's a good pla...",Positive,loop upgrade csx buy say good place park money...,Analyst Update
2,BofA believes we're already in a recession — a...,Neutral,bofa belief already recession say stock take b...,Analyst Update
3,JPMorgan sees these derivative plays as best w...,Neutral,jpmorgan see derivative play best way bet elec...,Analyst Update
4,Morgan Stanley's Huberty sees Apple earnings m...,Neutral,morgan stanley huberty see apple earnings miss...,Analyst Update


In [38]:
# Persist the fitted vectorizer and the classifier that was trained on its features.
# The context managers ensure each file is flushed and closed, even if pickling raises;
# a bare open() passed to pickle.dump leaves the handle open.

# pickling the vectorizer
with open('../../Data/Models/svc_vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
# pickling the model
with open('../../Data/Models/svc_classifier.sav', 'wb') as classifier_file:
    pickle.dump(classifier_linear, classifier_file)

In [40]:
# Write the tweets together with their predicted labels to disk.
# NOTE(review): no `index=` argument is passed, so pandas' default applies and the row
# index is presumably written as an extra leading column — confirm downstream readers expect that.
df_prediction.to_csv("../../Data/Prepared/Svc/prediction_tweet_validation.csv")