# Sentiment Analysis

 - TF-IDF
 - LSTM
 - BERT

### TF-IDF and Classifier

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

import pickle

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
train = pd.read_csv(r'Export_loop-sentiment-pos-neg-train_05112020000000.csv')
test = pd.read_csv(r'sentiment-eval.csv')
train.head(2)

Unnamed: 0,label,text
0,Negative,No one cares about marketing slides - a techni...
1,Positive,Are all three hosts providing storage/capacity...


In [3]:
train['label'] = train['label'].replace(['Negative'], 1)
train['label'] = train['label'].replace(['Positive'], 0)
train.head(2)

Unnamed: 0,label,text
0,1,No one cares about marketing slides - a techni...
1,0,Are all three hosts providing storage/capacity...


#### Adding smile feature in train / test data

In [4]:
neg_sub =':('
train["neg_smile"] = train['text'].str.find(neg_sub)
train.loc[train["neg_smile"]<0, 'neg_smile'] = 0
train.loc[train["neg_smile"]>0, 'neg_smile'] = -1
pos_sub =':)'
train["pos_smile"] = train['text'].str.find(pos_sub)
train.loc[train["pos_smile"]<0, 'pos_smile'] = 0
train.loc[train["pos_smile"]>0, 'pos_smile'] = 1

train['feedback'] = train["neg_smile"] + train["pos_smile"]
train.drop(columns= ['neg_smile', 'pos_smile'], inplace=True)

In [5]:
neg_sub =':('
test["neg_smile"] = test['text'].str.find(neg_sub)
test.loc[test["neg_smile"]<0, 'neg_smile'] = 0
test.loc[test["neg_smile"]>0, 'neg_smile'] = -1
pos_sub =':)'
test["pos_smile"] = test['text'].str.find(pos_sub)
test.loc[test["pos_smile"]<0, 'pos_smile'] = 0
test.loc[test["pos_smile"]>0, 'pos_smile'] = 1

test['feedback'] = test["neg_smile"] + test["pos_smile"]
test.drop(columns= ['neg_smile', 'pos_smile'], inplace=True)

#### Train Pre-processing

In [6]:
train['Clean_Text'] = train['text'].str.replace("[^a-zA-Z#]", " ")
train['Clean_Text'] = train['Clean_Text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))
tokenized_text = train['Clean_Text'].apply(lambda x: x.split())

In [7]:
from nltk import PorterStemmer
ps = PorterStemmer()
tokenized_text = tokenized_text.apply(lambda x: [ps.stem(i) for i in x])
tokenized_text.head(2)

0    [No, one, care, about, market, slide, technic,...
1    [are, all, three, host, provid, storag, capac,...
Name: Clean_Text, dtype: object

In [8]:
for i in range(len(tokenized_text)):
    tokenized_text[i] = ' '.join(tokenized_text[i])
train['Clean_Text']= tokenized_text
train.head()

Unnamed: 0,label,text,feedback,Clean_Text
0,1,No one cares about marketing slides - a techni...,0,No one care about market slide technic how TO ...
1,0,Are all three hosts providing storage/capacity...,0,are all three host provid storag capac Or is o...
2,1,would loved to had managed to get down to the ...,0,would love to had manag to get down to the cam...
3,1,Vending machine at work is out of Dasani water...,0,vend machin at work is out of dasani water boo
4,0,"RT @VMwareEdu: Paul Maritz, CEO and President ...",0,RT vmwareedu paul maritz ceo and presid of vmw...


#### Test Pre-processing

In [9]:
test['Clean_Text'] = test['text'].str.replace("[^a-zA-Z#]", " ")
test['Clean_Text'] = test['Clean_Text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))
tokenized_text = test['Clean_Text'].apply(lambda x: x.split())
tokenized_text.head(2)

0    [it, not, like, haven, been, talking, about, i...
1    [With, modern, data, center, you, can, cut, co...
Name: Clean_Text, dtype: object

In [10]:
from nltk import PorterStemmer
ps = PorterStemmer()
tokenized_text = tokenized_text.apply(lambda x: [ps.stem(i) for i in x])
tokenized_text.head(2)

0    [it, not, like, haven, been, talk, about, it, ...
1    [with, modern, data, center, you, can, cut, co...
Name: Clean_Text, dtype: object

In [11]:
for i in range(len(tokenized_text)):
    tokenized_text[i] = ' '.join(tokenized_text[i])
test['Clean_Text']= tokenized_text
test.head()

Unnamed: 0,id,text,feedback,Clean_Text
0,0,it's not like I haven't been talking about it ...,0,it not like haven been talk about it for year ...
1,1,"With a modern data center, you can cut costs a...",0,with modern data center you can cut cost and i...
2,2,"In today's storage landscape, there is a myria...",0,In today storag landscap there is myriad of ch...
3,3,I hoped that it could be done via GUI. I didn...,0,hope that it could be done via gui didn even k...
4,4,math final tomorrow...im getting an ulcer over it,0,math final tomorrow im get an ulcer over it


#### Train - TF-IDF

In [12]:
tfidf=TfidfVectorizer(max_df=0.90, min_df=2,max_features=400,stop_words='english')
tfidf_matrix=tfidf.fit_transform(train['Clean_Text'])
df_tfidf = pd.DataFrame(tfidf_matrix.todense())
df_tfidf['feedback'] = train.feedback
df_tfidf.shape

(1900, 401)

#### Test - TF-IDF

In [13]:
tfidf=TfidfVectorizer(max_df=0.90, min_df=2,max_features=400,stop_words='english')
test_tfidf_matrix=tfidf.fit_transform(test['Clean_Text'])
test_tfidf = pd.DataFrame(test_tfidf_matrix.todense())
test_tfidf['feedback'] = test.feedback
test_tfidf.shape

(211, 401)

### Train and Validation Set

In [14]:
x_train_tfidf, x_valid_tfidf, y_train_tfidf, y_valid_tfidf = train_test_split(df_tfidf,train['label'],test_size=0.3,random_state=17)

### Model - LGBM

In [471]:
### LGB : 72%

lg = lgb.LGBMClassifier(silent=False)
param_dist = {"max_depth": [25,50, 75],
              "learning_rate" : [0.01,0.05,0.1],
              "num_leaves": [300,900,1200],
              "n_estimators": [200]
             }
grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="roc_auc", verbose=False)
%time grid_search.fit(x_train_tfidf,y_train_tfidf)
grid_search.best_estimator_

In [15]:
d_train = lgb.Dataset(x_train_tfidf,y_train_tfidf)
params = {"max_depth" : 50, 
          "learning_rate" : 0.1, 
          "num_leaves": 300,
          "n_estimators": 200}

# Without Categorical Features
lgbmodel = lgb.train(params, d_train)



In [17]:
test_acc = metrics.roc_auc_score(y_valid_tfidf,lgbmodel.predict(x_valid_tfidf))
print("Test Accuracy:" f'{test_acc: .2%}')

Test Accuracy: 73.76%


## LSTM

In [677]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [678]:
train = pd.read_csv(r'Export_loop-sentiment-pos-neg-train_05112020000000.csv')
test = pd.read_csv(r'sentiment-eval.csv')
print(train.shape)

(1900, 2)


In [679]:
train.head(2)

Unnamed: 0,label,text
0,Negative,No one cares about marketing slides - a techni...
1,Positive,Are all three hosts providing storage/capacity...


In [659]:
sample = pd.concat([train['text'],test['text']], axis=0)
sample = sample.apply(lambda x: x.lower())
sample = sample.apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
sample.shape

max_fatures = 1000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(sample.values)
X = tokenizer.texts_to_sequences(sample.values)
X = pad_sequences(X)
X.shape

(2111, 332)

In [660]:
train_clean = X[0:1900]
test_clean = X[1900:]
print(train_clean.shape, test_clean.shape)

(1900, 332) (211, 332)


In [680]:
print(train[train['label'] == 'Positive'].size)
print(train[train['label'] == 'Negative'].size)
Y = pd.get_dummies(train['label']).values
Y.shape

2026
1774


(1900, 2)

In [685]:
Y[0:5]

array([[1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1]], dtype=uint8)

In [686]:
X_train, X_val, Y_train, Y_val = train_test_split(train_clean,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)

(1273, 332) (1273, 2)
(627, 332) (627, 2)


In [689]:
embed_dim = 128
lstm_out = 196
model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 332, 128)          128000    
_________________________________________________________________
spatial_dropout1d_12 (Spatia (None, 332, 128)          0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 196)               254800    
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 394       
Total params: 383,194
Trainable params: 383,194
Non-trainable params: 0
_________________________________________________________________
None


In [690]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 1)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f75ab47a4a8>

In [691]:
score,acc = model.evaluate(X_val, Y_val, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.82
acc: 0.71


In [692]:
label_score = model.predict(test_clean,batch_size=1,verbose = 2)

## BERT

In [3]:
# !pip install bert-serving-server  # server
# !pip install bert-serving-client  # client, independent of `bert-serving-server`

In [13]:
# !wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip && unzip uncased_L-12_H-768_A-12.zip

In [1]:
import re
import pandas as pd
from bert_serving.client import BertClient
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv(r'Export_loop-sentiment-pos-neg-train_05112020000000.csv')
test = pd.read_csv(r'sentiment-eval.csv')

In [5]:
# clean text from noise
def clean_text(text):
    # filter to allow only alphabets
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    
    # remove Unicode characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    # convert to lowercase to maintain consistency
    text = text.lower()
       
    return text

In [6]:
train['clean_text'] = train.text.apply(clean_text)
test['clean_text'] = test.text.apply(clean_text)

In [7]:
train.head()

Unnamed: 0,label,text,clean_text
0,Negative,No one cares about marketing slides - a techni...,no one cares about marketing slides a techni...
1,Positive,Are all three hosts providing storage/capacity...,are all three hosts providing storage capacity...
2,Negative,would loved to had managed to get down to the ...,would loved to had managed to get down to the ...
3,Negative,Vending machine at work is out of Dasani water...,vending machine at work is out of dasani water...
4,Positive,"RT @VMwareEdu: Paul Maritz, CEO and President ...",rt vmwareedu paul maritz ceo and president ...


In [8]:
# split into training and validation sets
X_tr, X_val, y_tr, y_val = train_test_split(train.clean_text, train.label, test_size=0.25, random_state=42)

print('X_tr shape:',X_tr.shape)

X_tr shape: (1425,)


### Run BERT model

#### Run Bert serving later in sperate session or different notebook

In [23]:
# !bert-serving-start -model_dir uncased/ -num_worker=2 -max_seq_len 50

In [11]:
# make a connection with the BERT server using it's ip address
bc = BertClient()
# get the embedding for train and val sets
X_tr_bert = bc.encode(X_tr.tolist())
X_val_bert = bc.encode(X_val.tolist())
X_tes_bert = bc.encode(test.clean_text.tolist())

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


In [15]:
from sklearn.linear_model import LogisticRegression
# LR model
model_bert = LogisticRegression()
# train
model_bert = model_bert.fit(X_tr_bert, y_tr)
# predict
pred_bert = model_bert.predict(X_val_bert)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_val, pred_bert))

0.8147368421052632


In [18]:
test['label'] = model_bert.predict(X_tes_bert)

In [20]:
test.head()

Unnamed: 0,id,text,clean_text,label
0,0,it's not like I haven't been talking about it ...,it's not like i haven't been talking about it ...,Positive
1,1,"With a modern data center, you can cut costs a...",with a modern data center you can cut costs a...,Positive
2,2,"In today's storage landscape, there is a myria...",in today's storage landscape there is a myria...,Negative
3,3,I hoped that it could be done via GUI. I didn...,i hoped that it could be done via gui i didn...,Negative
4,4,math final tomorrow...im getting an ulcer over it,math final tomorrow im getting an ulcer over it,Negative


In [21]:
test[['id','label']].to_csv('sentiment_submission3_BERT.csv')

In [22]:
test.label.value_counts()

Positive    109
Negative    102
Name: label, dtype: int64