In [18]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import re
import warnings
warnings.filterwarnings("ignore")

In [19]:
#read the data
data = pd.read_csv('Sentiment.csv')
data.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,name,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,I_Am_Kenzi,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,PeacefulQuest,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,PussssyCroook,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,MattFromTexas31,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,sharonDay5,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [20]:
#lets take the relevant information
data = data [['text','sentiment']]
print("Unique value of target",data.sentiment.nunique())
print("value counts of target",data.sentiment.value_counts())

### Preprocessing ####
# Removing the neutral tweets and resetting the index
data = data[data.sentiment!="Neutral"]
data.reset_index(inplace=True)
data.drop(['index'],axis=1,inplace=True)
#make the data lower case
#remove characters which are not alphabets
data['text'] = data['text'].apply(lambda x : x.lower())
data['text']=data['text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
print(data[data.sentiment=='Positive'].shape)
print(data[data.sentiment=='Negative'].shape)
#lets remove the rt from the statements
for idx, row in data.iterrows():
    row[0] = row[0].replace("rt"," ")
print(data.head())

Unique value of target 3
value counts of target Negative    8493
Neutral     3142
Positive    2236
Name: sentiment, dtype: int64
(2236, 2)
(8493, 2)
                                                text sentiment
0    scottwalker didnt catch the full gopdebate l...  Positive
1    robgeorge that carly fiorina is trending  ho...  Positive
2    danscavino gopdebate w realdonaldtrump deliv...  Positive
3    gregabbott_tx tedcruz on my first day i will...  Positive
4    warriorwoman91 i liked her and was happy whe...  Negative


In [33]:
max_features = 2000
tokenizer = Tokenizer(num_words=max_features,split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

- Here I will use LSTM network,  the embedding dimension, lstm_out , batch_size, dropout all these values are hyper parameters so we need to play around with these values for efficient algorithm.
- we will use softmax and categorical cross entropy

In [51]:
embed_dim = 128
lstm_out = 400
model = Sequential()
model.add(Embedding(max_features,embed_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 28, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 400)               846400    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 802       
Total params: 1,103,202
Trainable params: 1,103,202
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
Y = pd.get_dummies(data['sentiment']).values
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=2)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)


(7510, 28) (7510, 2)
(3219, 28) (3219, 2)


In [52]:

model.fit(X_train,y_train,epochs=15,batch_size=32,verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fca2c240400>

In [53]:
model.evaluate(X_test,y_test)



[0.7893904447555542, 0.8251009583473206]

In [54]:
test_preds = model.predict_classes(X_test)

In [55]:
Y_hat = pd.get_dummies(test_preds).values

In [56]:
from sklearn.metrics import classification_report
print(classification_report(y_test,Y_hat))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      2522
           1       0.61      0.55      0.58       697

   micro avg       0.83      0.83      0.83      3219
   macro avg       0.74      0.73      0.73      3219
weighted avg       0.82      0.83      0.82      3219
 samples avg       0.83      0.83      0.83      3219

