In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

In [2]:
data = pd.read_csv('twitter.csv',encoding='utf-8')
data.head()

Unnamed: 0,depression,tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [3]:
data.shape

(14984, 2)

In [4]:
data.isna().sum()

depression    0
tweet         0
dtype: int64

In [5]:
data['text'] = data['tweet'].str.replace("@", "") 
# Removing links
data['text'] = data['text'].str.replace(r"http\S+", "") 
# Removing Punctuations, Numbers, and Special Characters
data['text'] = data['text'].str.replace("[^a-zA-Z]", " ") 
# Remove stop words
import nltk
stopwords=nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    clean_text=' '.join([word for word in text.split() if word not in stopwords])
    return clean_text
data['text'] = data['text'].apply(lambda text : remove_stopwords(text.lower()))
data.head()

Unnamed: 0,depression,tweet,text
0,0,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
1,0,@Kenichan I dived many times for the ball. Man...,kenichan dived many times ball managed save re...
2,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
3,0,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see
4,0,@Kwesidei not the whole crew,kwesidei whole crew


In [6]:
del data['tweet']

In [7]:
data.head()

Unnamed: 0,depression,text
0,0,upset update facebook texting might cry result...
1,0,kenichan dived many times ball managed save re...
2,0,whole body feels itchy like fire
3,0,nationwideclass behaving mad see
4,0,kwesidei whole crew


In [8]:
data['sentiment'] = np.where(data['depression'] > 3, 1, 0)
data.head()

Unnamed: 0,depression,text,sentiment
0,0,upset update facebook texting might cry result...,0
1,0,kenichan dived many times ball managed save re...,0
2,0,whole body feels itchy like fire,0
3,0,nationwideclass behaving mad see,0
4,0,kwesidei whole crew,0


In [9]:
del data['depression']

In [10]:
data['text'] = data['text'].apply(lambda x: x.lower())
# removing special chars
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
#
data.head()

Unnamed: 0,text,sentiment
0,upset update facebook texting might cry result...,0
1,kenichan dived many times ball managed save re...,0
2,whole body feels itchy like fire,0
3,nationwideclass behaving mad see,0
4,kwesidei whole crew,0


In [11]:


for idx,row in data.iterrows():
    row[0] = row[0].replace('rt','')
data.head()

Unnamed: 0,text,sentiment
0,upset update facebook texting might cry result...,0
1,kenichan dived many times ball managed save re...,0
2,whole body feels itchy like fire,0
3,nationwideclass behaving mad see,0
4,kwesidei whole crew,0


In [12]:
data.shape

(14984, 2)

In [13]:
df = data[data['sentiment']==1]
df.shape

(4684, 2)

In [14]:
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)
X[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,  506,  451,  452,  206,  338, 1741,   55,    4,
         171,  743],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,  191,  225, 1461,  996, 1055,
         268,    5]])

In [15]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 24, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 24, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(11987, 24) (11987, 2)
(2997, 24) (2997, 2)


In [17]:
batch_size = 128
model.fit(X_train, Y_train, epochs = 60, batch_size=batch_size, verbose = 1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2217315d208>

In [18]:
Y_pred = model.predict_classes(X_test,batch_size = batch_size)



In [19]:
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print("confusion matrix",confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

confusion matrix [[1752  369]
 [ 379  497]]
              precision    recall  f1-score   support

           0       0.82      0.83      0.82      2121
           1       0.57      0.57      0.57       876

    accuracy                           0.75      2997
   macro avg       0.70      0.70      0.70      2997
weighted avg       0.75      0.75      0.75      2997



In [21]:
twt = ['I am not happy']
#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the tweet to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=29, dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0 64]]
1/1 - 0s
positive


In [22]:
twt = ['that is so funny']
#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the tweet to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=29, dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0 376]]
1/1 - 0s
positive
