In [12]:
# Sentiment Classifier


re.sub(r'@[A-Za-z0-9]+', '', text)
re.sub(r'#', '', text)
re.sub(r'RT[\s]+', '', text)
re.sub(r'https?:\/\/\S+', '', text)

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import time
from sklearn import svm
from sklearn.metrics import classification_report
import pickle

In [14]:
df_fin_phrase = pd.read_csv("../../Data/Prepared/CleanDatasets/fin_phrase_bank_clean.csv")

In [15]:
df_fin_phrase.head(5)

Unnamed: 0,sentence,label,clean_text,sentiment
0,"According to Gran , the company has no plans t...",1,according gran company plan move production ru...,Neutral
1,Technopolis plans to develop in stages an area...,1,technopolis plan develop stage area le square ...,Neutral
2,The international electronic industry company ...,0,international electronic industry company elco...,Negative
3,With the new production plant the company woul...,2,new production plant company would increase ca...,Positive
4,According to the company 's updated strategy f...,2,according company updated strategy year baswar...,Positive


In [16]:
df_fin_phrase['sentiment'].value_counts()

sentiment
Neutral     2879
Positive    1363
Negative     604
Name: count, dtype: int64

## Simple Random Sample

In [17]:
df_randomised = df_fin_phrase.sample(frac=1)

In [18]:
train_rate = 0.90

In [19]:
df_train = df_randomised[:int(len(df_randomised) * train_rate)]
df_train

Unnamed: 0,sentence,label,clean_text,sentiment
1715,Aspocomp has repaid its interest bearing liabi...,2,aspocomp repaid interest bearing liability sta...,Positive
2363,A meeting of Glisten shareholders to vote on t...,1,meeting glisten shareholder vote bid held march,Neutral
2623,The deal will have no significant effect on th...,1,deal significant effect acquiring company equi...,Neutral
385,The total delivery volume of paper businesses ...,2,total delivery volume paper business tonne tonne,Positive
3045,New Novator products are supposed to be export...,1,new novator product supposed exported,Neutral
...,...,...,...,...
4791,"In the building and home improvement trade , n...",0,building home improvement trade net sale total...,Negative
1151,"According to Barclays , the F-Secure software ...",1,according barclays f secure software protect u...,Neutral
4454,YIT lodged counter claims against Neste Oil to...,0,yit lodged counter claim neste oil totaling eu...,Negative
3991,"Meanwhile , minority shareholders , expecting ...",1,meanwhile minority shareholder expecting telia...,Neutral


In [20]:
df_val = df_randomised[int(len(df_randomised) * train_rate):]
df_val

Unnamed: 0,sentence,label,clean_text,sentiment
3203,The company says the measures are no longer ne...,1,company say measure longer needed,Neutral
3850,The figure does not include food exports from ...,1,figure include food export finland,Neutral
102,Clothing chain Sepp+ñl+ñ 's net sales increase...,2,clothing chain sepp l net sale increased eur mn,Positive
2058,"Vaahto , which has entered the deal via its su...",2,vaahto entered deal via subsidiary vaahto pulp...,Positive
2581,Teleste was set up in 1954 and is divided into...,1,teleste set divided broadband cable network vi...,Neutral
...,...,...,...,...
2066,`` They 've already allocated money for the in...,1,already allocated money inventory,Neutral
795,`` Our customers now have the chance to make b...,2,customer chance make booking service want one ...,Positive
3432,Two of these contracts are for turntable anode...,1,two contract turntable anode vibrocompactors d...,Neutral
1417,The Russian government will contribute 20 % of...,1,russian government contribute necessary fund said,Neutral


In [21]:

# Create feature vectors
vectorizer = TfidfVectorizer()


In [22]:
train_vectors = vectorizer.fit_transform(df_train['clean_text'])
test_vectors = vectorizer.transform(df_val['clean_text'])

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, df_train['sentiment'].values)
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

In [None]:
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(df_val[['sentiment']].values, prediction_linear, output_dict=True)

Training time: 1.167610s; Prediction time: 0.091194s


In [None]:
print('neutral: ', report['Neutral'])
print('negative: ', report['Negative'])
print('positive: ', report['Positive'])

neutral:  {'precision': 0.7551020408163265, 'recall': 0.9119718309859155, 'f1-score': 0.8261562998405104, 'support': 284}
negative:  {'precision': 0.6938775510204082, 'recall': 0.5230769230769231, 'f1-score': 0.5964912280701755, 'support': 65}
positive:  {'precision': 0.7634408602150538, 'recall': 0.5220588235294118, 'f1-score': 0.6200873362445414, 'support': 136}


In [None]:
sentences = ['growth is strong and we have plenty of liquidity.', 
               'there is a shortage of capital, and we need extra financing.', 
              'formulation patents might protect Vasotec to a limited extent.']
for sen in sentences:
    sentences_vector = vectorizer.transform([sen]) # vectorizing
    print(classifier_linear.predict(sentences_vector))

['Positive']
['Neutral']
['Neutral']


# Stratified Sample

In [None]:
df_stratified_train, df_stratified_val = train_test_split(df_fin_phrase, test_size=0.10, stratify=df_fin_phrase[['sentiment']].values)

In [None]:
df_stratified_train

Unnamed: 0,sentence,label,clean_text,sentiment
1370,The cooperation will involve Arena Partners bu...,1,cooperation involve arena partner buying share...,Neutral
3435,"Uncertainties still exist , however .",1,uncertainty still exist however,Neutral
1654,The service is intended to allow the people of...,2,service intended allow people thirteen mediter...,Positive
2565,SRV will also build an aqua park with wellness...,1,srv also build aqua park wellness area restaur...,Neutral
622,CDP was established on the initiative of insti...,1,cdp established initiative institutional inves...,Neutral
...,...,...,...,...
1385,The Group owns and operates a fleet of more th...,1,group owns operates fleet dwt container capaci...,Neutral
2682,The site will cover over six hectares .,1,site cover six hectare,Neutral
2262,`` NTC has a geographical presence that comple...,2,ntc geographical presence complement ramirent ...,Positive
2294,"Outokumpu of Finland , stainless steel manufac...",2,outokumpu finland stainless steel manufacturer...,Positive


In [None]:
df_stratified_val

Unnamed: 0,sentence,label,clean_text,sentiment
3965,"Under the transaction agreement , Metsaliitto ...",1,transaction agreement metsaliitto purchase met...,Neutral
241,Operating profit of Kauppalehti group rose to ...,2,operating profit kauppalehti group rose eur mn...,Positive
1428,The StoneGate product family was designed to p...,1,stonegate product family designed provide full...,Neutral
2889,Financial details were not disclosed .,1,financial detail disclosed,Neutral
3984,"With the new arrangement , customer responsibi...",1,new arrangement customer responsibility become...,Neutral
...,...,...,...,...
3337,The proposal that the Board of Directors will ...,1,proposal board director make annual general me...,Neutral
1427,The solutions will be used for upgrading the n...,2,solution used upgrading network russian mso di...,Positive
2352,"H1 '08 H1 '07 Q2 '08 Q2 '07 in mln euro , unle...",1,h h q q mln euro unless otherwise stated net s...,Neutral
4173,"Antti Orkola , president of Kemira GrowHow 's ...",0,antti orkola president kemira growhow industri...,Negative


In [None]:
train_vectors = vectorizer.fit_transform(df_stratified_train['clean_text'])
test_vectors = vectorizer.transform(df_stratified_val['clean_text'])

In [None]:
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, df_stratified_train['sentiment'].values)
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

In [None]:
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(df_stratified_val[['sentiment']].values, prediction_linear, output_dict=True)

Training time: 1.151410s; Prediction time: 0.095558s


In [None]:
print('neutral: ', report['Neutral'])
print('negative: ', report['Negative'])
print('positive: ', report['Positive'])

neutral:  {'precision': 0.7867867867867868, 'recall': 0.9097222222222222, 'f1-score': 0.8438003220611917, 'support': 288}
negative:  {'precision': 0.6458333333333334, 'recall': 0.5081967213114754, 'f1-score': 0.5688073394495413, 'support': 61}
positive:  {'precision': 0.7596153846153846, 'recall': 0.5808823529411765, 'f1-score': 0.6583333333333333, 'support': 136}


In [None]:
sentences = ['growth is strong and we have plenty of liquidity.', 
               'there is a shortage of capital, and we need extra financing.', 
              'formulation patents might protect Vasotec to a limited extent.']
for sen in sentences:
    sentences_vector = vectorizer.transform([sen]) # vectorizing
    print(classifier_linear.predict(sentences_vector).prod())

Positive
Neutral
Neutral


In [None]:
result=classifier_linear.predict(sentences_vector)

In [None]:
result.view()

array(['Neutral'], dtype=object)

In [None]:
dir(result)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_finalize__',
 '__array_function__',
 '__array_interface__',
 '__array_prepare__',
 '__array_priority__',
 '__array_struct__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__class_getitem__',
 '__complex__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__divmod__',
 '__dlpack__',
 '__dlpack_device__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__ilshift__',
 '__imatmul__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',

# pickling the vectorizer
pickle.dump(vectorizer, open('vectorizer.sav', 'wb'))
# pickling the model
pickle.dump(classifier_linear, open('classifier.sav', 'wb')