In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re

In [2]:
data = pd.read_csv('Sentiment-training-test.data.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

In [3]:
data.head()

Unnamed: 0,text,sentiment
0,Register now for @scacompanies Introduction to...,negative
1,RT @CaptMarkKelly: Arizona needs an independen...,neutral
2,RT @CaptMarkKelly: Arizona needs an independen...,neutral
3,RT @arthur_affect: The bullshit \image enhance...,neutral
4,RT @acampbell99: I’m hearing that an announcem...,positive


In [4]:
data = data[data.sentiment != "neutral"]
data['text'] = data['text'].apply(lambda x: x.lower())
# removing special chars
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
#
data.head()

Unnamed: 0,text,sentiment
0,register now for scacompanies introduction to ...,negative
4,rt acampbell99 im hearing that an announcement...,positive
6,onaturalia lindseygrahamsc there is 1 h1b for ...,positive
8,you can see the last half century as focused o...,positive
9,data_science amp business_analytics course100 ...,positive


In [5]:
print(data[ data['sentiment'] == 'positive'].size)
print(data[ data['sentiment'] == 'negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt','')
data.head()

62
16


Unnamed: 0,text,sentiment
0,register now for scacompanies introduction to ...,negative
4,acampbell99 im hearing that an announcement o...,positive
6,onaturalia lindseygrahamsc there is 1 h1b for ...,positive
8,you can see the last half century as focused o...,positive
9,data_science amp business_analytics course100 ...,positive


In [6]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)
X[:2]

array([[  0,   0,   0,   0,   0,  50, 121,   2, 122,  51,   6,  52,   1,
         10,  13, 123,   5, 124, 125, 126,  13, 127, 128, 129],
       [  0,   0,   0,  53,  14,  54,  16,  19,  55,  12,   9,  34,  56,
         57,  58,  59,  60,  40,  35,  61,  62,  63,  64,  65]],
      dtype=int32)

In [7]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 24, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 24, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(31, 24) (31, 2)
(8, 24) (8, 2)


In [9]:
batch_size = 128
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f62cc2d5a58>

In [10]:
# Separate majority and minority classes
data_majority = data[data['sentiment'] == 'negative']
data_minority = data[data['sentiment'] == 'positive']

bias = data_minority.shape[0]/data_majority.shape[0]
# lets split train/test data first then 
train = pd.concat([data_majority.sample(frac=0.8,random_state=200),
         data_minority.sample(frac=0.8,random_state=200)])
test = pd.concat([data_majority.drop(data_majority.sample(frac=0.8,random_state=200).index),
        data_minority.drop(data_minority.sample(frac=0.8,random_state=200).index)])

train = shuffle(train)
test = shuffle(test)

In [11]:
print('positive data in training:',(train.sentiment == 'positive').sum())
print('negative data in training:',(train.sentiment == 'negative').sum())
print('positive data in test:',(test.sentiment == 'positive').sum())
print('negative data in test:',(test.sentiment == 'negative').sum())

positive data in training: 25
negative data in training: 6
positive data in test: 6
negative data in test: 2


In [12]:
# Separate majority and minority classes in training data for upsampling 
data_majority = train[train['sentiment'] == 'negative']
data_minority = train[train['sentiment'] == 'positive']

print("majority class before upsample:",data_majority.shape)
print("minority class before upsample:",data_minority.shape)

# Upsample minority class
data_minority_upsampled = resample(data_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples= data_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])
 
# Display new class counts
print("After upsampling\n",data_upsampled.sentiment.value_counts(),sep = "")

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values) # training with whole data

X_train = tokenizer.texts_to_sequences(data_upsampled['text'].values)
X_train = pad_sequences(X_train,maxlen=29)
Y_train = pd.get_dummies(data_upsampled['sentiment']).values
print('x_train shape:',X_train.shape)

X_test = tokenizer.texts_to_sequences(test['text'].values)
X_test = pad_sequences(X_test,maxlen=29)
Y_test = pd.get_dummies(test['sentiment']).values
print("x_test shape", X_test.shape)

majority class before upsample: (6, 2)
minority class before upsample: (25, 2)
After upsampling
positive    6
negative    6
Name: sentiment, dtype: int64
x_train shape: (12, 29)
x_test shape (8, 29)


In [13]:
# model
embed_dim = 128
lstm_out = 192

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.4, recurrent_dropout=0.4))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 29, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 29, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 192)               246528    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 386       
Total params: 502,914
Trainable params: 502,914
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
batch_size = 128
# also adding weights
class_weights = {0: 1 ,
                1: 1.6/bias }
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 1,
          class_weight=class_weights)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f62c6f68908>

In [15]:
twt = ['I love this movie and music was best of all']
#vectorizing the sentence by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the sentence to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=29, dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0 312 110   3 158  11 291]]
1/1 - 0s
positive
