In [None]:
# Sentiment Classifier


def strip_tweet_markup(text):
    """Return ``text`` with tweet-style markup removed.

    The substitutions are applied in this order:
      1. ``@`` followed by ASCII letters/digits (mentions) is removed.
      2. Every ``#`` character is removed (the hashtag word itself is kept).
      3. ``RT`` followed by whitespace (retweet marker) is removed.
      4. ``http://`` / ``https://`` URLs up to the next whitespace are removed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Surrounding whitespace is not collapsed.
    """
    # Each re.sub returns a new string, so the result must be carried forward;
    # calling re.sub without assigning the result has no effect.
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#', '', text)
    # NOTE(review): no word boundary here, so "RT " inside a longer token also matches.
    text = re.sub(r'RT[\s]+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)
    return text

In [65]:
import pickle
import re
import time

import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [8]:
# Load the cleaned phrase-bank CSV into a DataFrame
fin_phrase_csv = "../Data/Prepared/CleanDatasets/fin_phrase_bank_clean.csv"
df_fin_phrase = pd.read_csv(fin_phrase_csv)

In [9]:
# Preview the first five rows (head() defaults to n=5)
df_fin_phrase.head()

Unnamed: 0,sentence,label,clean_text,sentiment
0,"According to Gran , the company has no plans t...",1,according gran company plan move production ru...,Neutral
1,Technopolis plans to develop in stages an area...,1,technopolis plan develop stage area le square ...,Neutral
2,The international electronic industry company ...,0,international electronic industry company elco...,Negative
3,With the new production plant the company woul...,2,new production plant company would increase ca...,Positive
4,According to the company 's updated strategy f...,2,according company updated strategy year baswar...,Positive


In [151]:
# Class balance: number of rows per sentiment label
sentiment_column = df_fin_phrase['sentiment']
sentiment_column.value_counts()

sentiment
Neutral     2879
Positive    1363
Negative     604
Name: count, dtype: int64

## Simple Random Sample

In [156]:
# Shuffle every row (frac=1). A fixed random_state makes the shuffle, and therefore
# the train/validation split sliced from it, identical across kernel restarts.
df_randomised = df_fin_phrase.sample(frac=1, random_state=42)

In [175]:
# Fraction of the shuffled rows assigned to the training split
train_rate = 0.9

In [176]:
# Training split: the leading train_rate share of the shuffled rows
n_train_rows = int(len(df_randomised) * train_rate)
df_train = df_randomised.iloc[:n_train_rows]
df_train

Unnamed: 0,sentence,label,clean_text,sentiment
4696,Net sales fell by 5 % from the previous accoun...,0,net sale fell previous accounting period,Negative
4303,After Chuck Smith was laid off on May 30 from ...,0,chuck smith laid may housing consultant job wi...,Negative
129,The Department Store Division reported an incr...,2,department store division reported increase sa...,Positive
3867,The net sales of Healthcare Trade business in ...,1,net sale healthcare trade business eur million...,Neutral
2417,Basware Einvoices Oy will be merged into the p...,1,basware einvoices oy merged parent company pre...,Neutral
...,...,...,...,...
3187,The company does not disclose the sum it appli...,1,company disclose sum applied,Neutral
1736,"With the measures , Suominen Corporation aims ...",2,measure suominen corporation aim ensure compet...,Positive
2113,"However , he expects banks to provide alternat...",2,however expects bank provide alternative finan...,Positive
506,The acquisition is part of Suomen Helasto 's s...,1,acquisition part suomen helasto strategy expan...,Neutral


In [177]:
# Validation split: every shuffled row after the training cut-off
val_start_row = int(len(df_randomised) * train_rate)
df_val = df_randomised.iloc[val_start_row:]
df_val

Unnamed: 0,sentence,label,clean_text,sentiment
988,"The sale , which will result in a gain of some...",2,sale result gain eur million second quarter or...,Positive
647,"For the first nine months of 2010 , Talvivaara...",2,first nine month talvivaara net loss narrowed ...,Positive
4503,The previously concluded adaptation measures c...,2,previously concluded adaptation measure concer...,Positive
3248,The Financial Statements and Interim Reports w...,1,financial statement interim report released ar...,Neutral
4695,Net sales fell by 33 % from the third quarter ...,0,net sale fell third quarter eur mn,Negative
...,...,...,...,...
137,`` The purchase of the operations is part of Y...,2,purchase operation part yit strategy expand of...,Positive
3083,Pohjola and cooperative banks have continued t...,1,pohjola cooperative bank continued combine bra...,Neutral
4217,"( ADPnews ) - Dec 30 , 2009 - Finnish investme...",1,adpnews dec finnish investment group neomarkka...,Neutral
2228,The Group 's consolidated net sales for 2009 t...,1,group consolidated net sale totaled billion eu...,Neutral


In [178]:

# Create the TF-IDF vectorizer (default settings) used to turn clean_text into feature vectors.
# It is only constructed here; fitting happens in a later cell.
vectorizer = TfidfVectorizer()


In [179]:
# Fit the TF-IDF vocabulary on the training text only, then apply that same
# fitted transform to the validation text.
train_corpus = df_train['clean_text']
val_corpus = df_val['clean_text']
train_vectors = vectorizer.fit_transform(train_corpus)
test_vectors = vectorizer.transform(val_corpus)

In [180]:
# Perform classification with SVM, kernel=linear (random-sample split)
classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if it is adjusted.
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, df_train['sentiment'].values)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
# Elapsed seconds for fitting and for predicting on the validation vectors
time_linear_train = t1 - t0
time_linear_predict = t2 - t1

In [181]:
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# Pass the true labels as a 1-D array (single-bracket selection); the previous
# double-bracket selection produced an (n, 1) column instead.
# output_dict=True returns the per-class metrics as a nested dict.
report = classification_report(df_val['sentiment'].values, prediction_linear, output_dict=True)

Training time: 1.167610s; Prediction time: 0.091194s


In [182]:
# Per-class metrics from the report dict, printed in the order neutral / negative / positive
for prefix, class_name in [('neutral: ', 'Neutral'), ('negative: ', 'Negative'), ('positive: ', 'Positive')]:
    print(prefix, report[class_name])

neutral:  {'precision': 0.7551020408163265, 'recall': 0.9119718309859155, 'f1-score': 0.8261562998405104, 'support': 284}
negative:  {'precision': 0.6938775510204082, 'recall': 0.5230769230769231, 'f1-score': 0.5964912280701755, 'support': 65}
positive:  {'precision': 0.7634408602150538, 'recall': 0.5220588235294118, 'f1-score': 0.6200873362445414, 'support': 136}


In [183]:
# Spot-check the classifier on three hand-written sentences
sentences = [
    'growth is strong and we have plenty of liquidity.',
    'there is a shortage of capital, and we need extra financing.',
    'formulation patents might protect Vasotec to a limited extent.',
]
for sentence_text in sentences:
    # transform() expects an iterable of documents, hence the one-element list
    sentences_vector = vectorizer.transform([sentence_text])
    predicted_labels = classifier_linear.predict(sentences_vector)
    print(predicted_labels)

['Positive']
['Neutral']
['Neutral']


## Stratified Sample

In [167]:
# Hold out 10% of the rows for validation, stratifying on the sentiment label.
# random_state pins the shuffle so the split (and the metrics computed from it)
# is reproducible; the stratify labels are passed as a 1-D column.
df_stratified_train, df_stratified_val = train_test_split(
    df_fin_phrase,
    test_size=0.10,
    stratify=df_fin_phrase['sentiment'],
    random_state=42,
)

In [168]:
# Inspect the stratified training split
df_stratified_train

Unnamed: 0,sentence,label,clean_text,sentiment
1370,The cooperation will involve Arena Partners bu...,1,cooperation involve arena partner buying share...,Neutral
3435,"Uncertainties still exist , however .",1,uncertainty still exist however,Neutral
1654,The service is intended to allow the people of...,2,service intended allow people thirteen mediter...,Positive
2565,SRV will also build an aqua park with wellness...,1,srv also build aqua park wellness area restaur...,Neutral
622,CDP was established on the initiative of insti...,1,cdp established initiative institutional inves...,Neutral
...,...,...,...,...
1385,The Group owns and operates a fleet of more th...,1,group owns operates fleet dwt container capaci...,Neutral
2682,The site will cover over six hectares .,1,site cover six hectare,Neutral
2262,`` NTC has a geographical presence that comple...,2,ntc geographical presence complement ramirent ...,Positive
2294,"Outokumpu of Finland , stainless steel manufac...",2,outokumpu finland stainless steel manufacturer...,Positive


In [169]:
# Inspect the stratified validation split
df_stratified_val

Unnamed: 0,sentence,label,clean_text,sentiment
3965,"Under the transaction agreement , Metsaliitto ...",1,transaction agreement metsaliitto purchase met...,Neutral
241,Operating profit of Kauppalehti group rose to ...,2,operating profit kauppalehti group rose eur mn...,Positive
1428,The StoneGate product family was designed to p...,1,stonegate product family designed provide full...,Neutral
2889,Financial details were not disclosed .,1,financial detail disclosed,Neutral
3984,"With the new arrangement , customer responsibi...",1,new arrangement customer responsibility become...,Neutral
...,...,...,...,...
3337,The proposal that the Board of Directors will ...,1,proposal board director make annual general me...,Neutral
1427,The solutions will be used for upgrading the n...,2,solution used upgrading network russian mso di...,Positive
2352,"H1 '08 H1 '07 Q2 '08 Q2 '07 in mln euro , unle...",1,h h q q mln euro unless otherwise stated net s...,Neutral
4173,"Antti Orkola , president of Kemira GrowHow 's ...",0,antti orkola president kemira growhow industri...,Negative


In [170]:
# Re-fit the same vectorizer object on the stratified training text (this replaces
# whatever vocabulary it had learned before), then transform the validation text.
stratified_train_corpus = df_stratified_train['clean_text']
stratified_val_corpus = df_stratified_val['clean_text']
train_vectors = vectorizer.fit_transform(stratified_train_corpus)
test_vectors = vectorizer.transform(stratified_val_corpus)

In [171]:
# Perform classification with SVM, kernel=linear (stratified split)
classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if it is adjusted.
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, df_stratified_train['sentiment'].values)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
# Elapsed seconds for fitting and for predicting on the validation vectors
time_linear_train = t1 - t0
time_linear_predict = t2 - t1

In [172]:
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# Pass the true labels as a 1-D array (single-bracket selection); the previous
# double-bracket selection produced an (n, 1) column instead.
# output_dict=True returns the per-class metrics as a nested dict.
report = classification_report(df_stratified_val['sentiment'].values, prediction_linear, output_dict=True)

Training time: 1.151410s; Prediction time: 0.095558s


In [173]:
# Per-class metrics from the report dict, printed in the order neutral / negative / positive
for prefix, class_name in [('neutral: ', 'Neutral'), ('negative: ', 'Negative'), ('positive: ', 'Positive')]:
    print(prefix, report[class_name])

neutral:  {'precision': 0.7867867867867868, 'recall': 0.9097222222222222, 'f1-score': 0.8438003220611917, 'support': 288}
negative:  {'precision': 0.6458333333333334, 'recall': 0.5081967213114754, 'f1-score': 0.5688073394495413, 'support': 61}
positive:  {'precision': 0.7596153846153846, 'recall': 0.5808823529411765, 'f1-score': 0.6583333333333333, 'support': 136}


In [194]:
# Spot-check the classifier on three hand-written sentences
sentences = ['growth is strong and we have plenty of liquidity.', 
               'there is a shortage of capital, and we need extra financing.', 
              'formulation patents might protect Vasotec to a limited extent.']
for sen in sentences:
    sentences_vector = vectorizer.transform([sen]) # vectorizing
    # predict() returns a one-element array for a single sentence; index it to get
    # the label. (.prod() only happened to work because there was exactly one element —
    # it would fail on string labels if more than one row were predicted.)
    print(classifier_linear.predict(sentences_vector)[0])

Positive
Neutral
Neutral


In [184]:
# Keep the prediction for the most recently vectorized sentence so it can be inspected
result = classifier_linear.predict(sentences_vector)

In [195]:
# Exploratory: display the prediction array (view() with no arguments returns
# a new array object over the same data)
result.view()

array(['Neutral'], dtype=object)

In [186]:
# Exploratory: list the attributes and methods available on the prediction array
dir(result)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_finalize__',
 '__array_function__',
 '__array_interface__',
 '__array_prepare__',
 '__array_priority__',
 '__array_struct__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__class_getitem__',
 '__complex__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__divmod__',
 '__dlpack__',
 '__dlpack_device__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__ilshift__',
 '__imatmul__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',

# Persist the fitted vectorizer and classifier so they can be reloaded together.
# NOTE(review): both objects are whichever versions were fitted last in the kernel
# (the names are reassigned in more than one cell) — confirm that is the intended pair.
# The `with` blocks guarantee each file is flushed and closed even if dump() raises.

# pickling the vectorizer
with open('vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
# pickling the model
with open('classifier.sav', 'wb') as classifier_file:
    pickle.dump(classifier_linear, classifier_file)