In [121]:
import spacy
import nltk

# Initialize spacy ‘en’ model, keeping only component needed for lemmatization and creating an engine
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [122]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt

data = pd.read_csv('./sentiment analysis_train.csv',encoding='latin-1')

In [123]:
data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [124]:
text = data.iloc[0]['Sentence']
text

"The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model ."

### Using Tokenization on a sample text

In [125]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

print(word_tokenize(text))

['The', 'GeoSolutions', 'technology', 'will', 'leverage', 'Benefon', "'s", 'GPS', 'solutions', 'by', 'providing', 'Location', 'Based', 'Search', 'Technology', ',', 'a', 'Communities', 'Platform', ',', 'location', 'relevant', 'multimedia', 'content', 'and', 'a', 'new', 'and', 'powerful', 'commercial', 'model', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [126]:
# another way to do same
print([str(token) for token in nlp(text) if not token.is_punct])

['The', 'GeoSolutions', 'technology', 'will', 'leverage', 'Benefon', "'s", 'GPS', 'solutions', 'by', 'providing', 'Location', 'Based', 'Search', 'Technology', 'a', 'Communities', 'Platform', 'location', 'relevant', 'multimedia', 'content', 'and', 'a', 'new', 'and', 'powerful', 'commercial', 'model']


In [127]:
# removing useless info

import re

words = [str(token) for token in nlp(text) if not token.is_punct]
words = [re.sub(r"[^A-Za-z@]", "", word) for word in words]
words = [re.sub(r'\S+com', '', word) for word in words]
words = [re.sub(r'\S+@\S+', '', word) for word in words]
words = [word for word in words if word!=' ']
print(words)

['The', 'GeoSolutions', 'technology', 'will', 'leverage', 'Benefon', 's', 'GPS', 'solutions', 'by', 'providing', 'Location', 'Based', 'Search', 'Technology', 'a', 'Communities', 'Platform', 'location', 'relevant', 'multimedia', 'content', 'and', 'a', 'new', 'and', 'powerful', 'commercial', 'model']


### Removing Stopwords from a sample text

In [128]:
#import nltk stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

#change all stopwords into lowercase
stopwords_lower = [s.lower() for s in stopwords]

words=[word.lower() for word in words if word.lower() not in stopwords_lower]
print(words)

['geosolutions', 'technology', 'leverage', 'benefon', 'gps', 'solutions', 'providing', 'location', 'based', 'search', 'technology', 'communities', 'platform', 'location', 'relevant', 'multimedia', 'content', 'new', 'powerful', 'commercial', 'model']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Using Lemmatization on sample text

In [129]:
words1 = [token.lemma_ for token in nlp(" ".join(words)) if not token.is_punct]
print(words1)

['geosolution', 'technology', 'leverage', 'benefon', 'gps', 'solution', 'provide', 'location', 'base', 'search', 'technology', 'community', 'platform', 'location', 'relevant', 'multimedia', 'content', 'new', 'powerful', 'commercial', 'model']


## Function with all 3 pre-processing tasks
1. Tokenization
2. Stop words
3. Lemmatization

In [130]:
def text_preprocessing(str_input): 
     #tokenization, remove punctuation, lemmatization
     words=[token.lemma_ for token in nlp(str_input) if not token.is_punct]

     # remove symbols, websites, email addresses 
     words = [re.sub(r"[^A-Za-z@]", "", word) for word in words] 
     words = [re.sub(r"\S+com", "", word) for word in words]
     words = [re.sub(r"\S+@\S+", "", word) for word in words] 
     words = [word for word in words if word!=' ']
     words = [word for word in words if len(word)!=0] 
 
     #remove stopwords     
     words=[word.lower() for word in words if word.lower() not in stopwords_lower]

     #combine a list into one string   
     cleaned_str = " ".join(words)

     return cleaned_str

## Applying text pre-processing on data provided

In [131]:
data['Sentence_cleaned'] = data['Sentence'].apply(text_preprocessing)

In [132]:
data.head()

Unnamed: 0,Sentence,Sentiment,Sentence_cleaned
0,The GeoSolutions technology will leverage Bene...,positive,geosolutions technology leverage benefon gps s...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,esi low bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",positive,last quarter componenta net sale double eur eu...
3,According to the Finnish-Russian Chamber of Co...,neutral,accord finnish russian chamber commerce major ...
4,The Swedish buyout firm has sold its remaining...,neutral,swedish buyout firm sell remain percent stake ...


### Text vectorization on a sample text

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

text = ['I am Atishay Jain','Image Captioning using deep learning']

vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(text)
vector = vectorizer.transform(text)

print(vectorizer.vocabulary_)
print(vector.shape)
print(vector.toarray())

{'atishay': 0, 'jain': 4, 'image': 3, 'captioning': 1, 'using': 6, 'deep': 2, 'learning': 5}
(2, 7)
[[0.70710678 0.         0.         0.         0.70710678 0.
  0.        ]
 [0.         0.4472136  0.4472136  0.4472136  0.         0.4472136
  0.4472136 ]]


## Training and Predicting on splitted data

In [134]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#train and test split
X_train, X_test, y_train, y_test = train_test_split(data['Sentence_cleaned'],data['Sentiment'],random_state=0)
#build a machine learning pipeline
est = Pipeline([('vectorizer', TfidfVectorizer(lowercase=False)),('classifier', LogisticRegression(solver='liblinear'))])

#GridSearchCV with a transformer and a estimator
parameters = {'vectorizer__max_df': (0.8,0.9), 'vectorizer__min_df': [20,50,0.1],"classifier__C":numpy.logspace(-3,3,7), "classifier__penalty":["l1","l2"]}

gs=GridSearchCV(est,param_grid=parameters)
#fit the training data
gs.fit(X_train, y_train)
#Evaluate the model
predictions = gs.predict(X_test)

In [135]:
data_predicted = pd.DataFrame({
    'Statement': X_test,
    'true sentiment': y_test,
    'predicted sentiment': predictions
})
data_predicted.head(10)

Unnamed: 0,Statement,true sentiment,predicted sentiment
1891,speak drink today spokesperson olvi say perfor...,positive,neutral
3885,talvivaara also maintain assumption turn cash ...,positive,positive
4554,agricultural newspaper maaseudun tulevaisuus r...,neutral,neutral
4379,hk jan sink ship,negative,neutral
1654,quarterly dilute eps continue operation come e...,positive,positive
4457,grid show dia cap long term range rally beyond...,negative,neutral
1564,bank leasing arm nordea liising end year profi...,neutral,neutral
3294,one headboxe equip modern consistency control ...,neutral,neutral
1757,ruukki delivery include steel structure includ...,neutral,neutral
728,depend market situation project sell year comp...,neutral,neutral


### Accuracy

In [136]:
def accuracy(x,y):
  acc = 0
  for i in range(len(y)):
    if x[i] == y[i]:
      acc += 1
  return (100*(acc/len(y_test)))

### Accuracy on splitted test data

In [137]:
print("Accuracy = ", accuracy(y_test.tolist(),predictions))

Accuracy =  67.95398520953164


### F1 score on splitted test data

In [138]:
from sklearn.metrics import f1_score

print("F1 score = ",f1_score(y_test.tolist(),predictions,average=None))

F1 score =  [0.19298246 0.77735369 0.61198738]


# Applying model on Final Test Data

In [139]:
test_data = pd.read_csv('./sentiment analysis_test.csv', encoding='latin-1')
test_data.head()

Unnamed: 0,Sentence
0,Operating loss totaled EUR 25mn compared to a ...
1,Renewed AB InBev Bid for SABMiller Ups Stake i...
2,Rautaruukki Corporation Stock exchange release...
3,Etteplan targets to employ at least 20 people ...
4,Thanks to its extensive industry and operation...


In [140]:
test_data['processed_sentence'] = test_data['Sentence'].apply(text_preprocessing)
X = test_data['processed_sentence']
test_data.head()

Unnamed: 0,Sentence,processed_sentence
0,Operating loss totaled EUR 25mn compared to a ...,operate loss total eur mn compare profit eur m...
1,Renewed AB InBev Bid for SABMiller Ups Stake i...,renewed ab inbev bid sabmiller ups stake beer ...
2,Rautaruukki Corporation Stock exchange release...,rautaruukki corporation stock exchange release...
3,Etteplan targets to employ at least 20 people ...,etteplan target employ least people borlnge
4,Thanks to its extensive industry and operation...,thank extensive industry operation experience ...


In [141]:
final_predictions = gs.predict(X)
test_data['predicted_sentiment'] = final_predictions
test_data.head(10)

Unnamed: 0,Sentence,processed_sentence,predicted_sentiment
0,Operating loss totaled EUR 25mn compared to a ...,operate loss total eur mn compare profit eur m...,neutral
1,Renewed AB InBev Bid for SABMiller Ups Stake i...,renewed ab inbev bid sabmiller ups stake beer ...,neutral
2,Rautaruukki Corporation Stock exchange release...,rautaruukki corporation stock exchange release...,positive
3,Etteplan targets to employ at least 20 people ...,etteplan target employ least people borlnge,neutral
4,Thanks to its extensive industry and operation...,thank extensive industry operation experience ...,neutral
5,$FB rejecting HIGHS shortable...at 109,fb reject highs shortable,positive
6,Why AstraZeneca plc & Dixons Carphone PLC Are ...,astrazeneca plc dixons carphone plc red hot gr...,positive
7,Finnish automation solutions developer Cencorp...,finnish automation solution developer cencorp ...,neutral
8,Secure your files online Like filling out a ta...,secure file online like fill tax return make b...,neutral
9,$COST short finally making gains. I will take ...,cost short finally make gain take half gain,negative
