In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing
from sklearn.model_selection import train_test_split


import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

pd.options.plotting.backend = "plotly"

OptionError: 'You can only set the value of existing options'

In [2]:
# Load Tweet dataset
df = pd.read_csv('./data/Twitter_Data.csv')
# Output first five rows
df=df.iloc[0:100000]
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [3]:
# dimensionality of the data
df.shape

(100000, 2)

In [4]:
# drop missing rows
df.dropna(axis=0, inplace=True)

In [5]:
# Map tweet categories
df['category'] = df['category'].map({-1.0:'Negative', 0.0:'Neutral', 1.0:'Positive'})
# Output first five rows
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive


In [6]:
import re    # RegEx for removing non-letter characters

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *


def tweet_to_words(tweet):
    ''' Convert tweet text into a sequence of words '''
    
    # convert to lowercase
    text = tweet.lower()
    # remove non letters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # tokenize
    words = text.split()
    # remove stopwords
    words = [w for w in words if w not in stopwords.words("english")]
    # apply stemming
    words = [PorterStemmer().stem(w) for w in words]
    # return list
    return words

print("\nOriginal tweet ->", df['clean_text'][0])
print("\nProcessed tweet ->", tweet_to_words(df['clean_text'][0]))


Original tweet -> when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples

Processed tweet -> ['modi', 'promis', 'minimum', 'govern', 'maximum', 'govern', 'expect', 'begin', 'difficult', 'job', 'reform', 'state', 'take', 'year', 'get', 'justic', 'state', 'busi', 'exit', 'psu', 'templ']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mingze/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Apply data processing to each tweet
X = list(map(tweet_to_words, df['clean_text']))

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 5000
max_len=50

def tokenize_pad_sequences(text):
    '''
    This function tokenize the input text into sequnences of intergers and then
    pad each sequence to the same length
    '''
    # Text tokenization
    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    tokenizer.fit_on_texts(text)
    # Transforms text to a sequence of integers
    X = tokenizer.texts_to_sequences(text)
    # Pad sequences to the same length
    X = pad_sequences(X, padding='post', maxlen=max_len)
    # return sequences
    return X, tokenizer

print('Before Tokenization & Padding \n', df['clean_text'][0])
X, tokenizer = tokenize_pad_sequences(df['clean_text'])
print('After Tokenization & Padding \n', X[0])

Using TensorFlow backend.


Before Tokenization & Padding 
 when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples
After Tokenization & Padding 
 [  41    1  261   73 1717 1091   42 2454    2 1140  223    2  254   33
  157   97   57   71 1105  254   52    3    8  547    3   52 3853    3
 3224    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [9]:
# Convert categorical variable into dummy/indicator variables.
y = pd.get_dummies(df['category'])
# Train and Test split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
# Extracting validation set from the train set
valid_size=1000
X_valid, y_valid = X_train[-valid_size:], y_train[-valid_size:]
X_test, y_test = X_train[:-valid_size], y_train[:-valid_size]

print('Train Set ->', X_train.shape, y_train.shape)
print('Validation Set ->', X_valid.shape, y_valid.shape)
print('Test Set ->', X_test.shape, y_test.shape)

Train Set -> (69999, 50) (69999, 3)
Validation Set -> (1000, 50) (1000, 3)
Test Set -> (68999, 50) (68999, 3)


In [10]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [11]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.metrics import Precision, Recall

vocab_size = 5000
embedding_size = 32

# Build model
model3 = Sequential()
model3.add(Embedding(vocab_size, embedding_size, input_length=max_len))
model3.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Bidirectional(LSTM(32)))
model3.add(Dropout(0.4))
model3.add(Dense(3, activation='softmax'))

print(model3.summary())

# Compile model
model3.compile(loss='categorical_crossentropy', optimizer='adam', 
               metrics=['accuracy', Precision(), Recall()])

# Train model
num_epochs = 1
batch_size = 32
history3 = model3.fit(X_train, y_train,
                      validation_data=(X_valid, y_valid),
                      batch_size=batch_size, epochs=num_epochs)

# Evaluate model on the test set
loss, accuracy, precision, recall = model3.evaluate(X_test, y_test, verbose=0)
# Print metrics
print('')
print('CNN + LSTM Accuracy  : {:.4f}'.format(accuracy))
print('CNN + LSTM Precision : {:.4f}'.format(precision))
print('CNN + LSTM Recall    : {:.4f}'.format(recall))
print('CNN + LSTM F1 Score  : {:.4f}'.format(f1_score(precision, recall)))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 32)            160000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 50, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 25, 32)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                16640     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 179,939
Trainable params: 179,939
Non-trainable params: 0
________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 69999 samples, validate on 1000 samples
Epoch 1/1

CNN + LSTM Accuracy  : 0.9521
CNN + LSTM Precision : 0.9536
CNN + LSTM Recall    : 0.9509
CNN + LSTM F1 Score  : 0.9522


In [12]:
# # Save the model architecture & the weights
# model1.save('best_model.h5')
# print('Best model saved')

In [13]:
# from keras.models import load_model

# # Load model
# model = load_model('best_model.h5')

def predict_class(text):
    '''Function to predict sentiment class of the passed text'''
    
    sentiment_classes = ['Negative', 'Neutral', 'Positive']
    max_len=50
    
    # Transforms text to a sequence of integers using a tokenizer object
    xt = tokenizer.texts_to_sequences(text)
    # Pad sequences to the same length
    xt = pad_sequences(xt, padding='post', maxlen=max_len)
    # Do the prediction using the loaded model
    yt = model3.predict(xt).argmax(axis=1)
    # Print the predicted sentiment
#     print('The predicted sentiment is', sentiment_classes[yt[0]])  
    return sentiment_classes[yt[0]]

In [14]:
predict_class(['Just bought a sofa set and mattress from Raye. She was very detailed and patient in giving recommendations.'])

'Positive'

# Google Reviews 

In [94]:
google_review=pd.read_csv('./data/google_reviews.csv')

sent=[]

for index,row in google_review.iterrows():
    r=predict_class([row['comments']])
    sent.append(r)

google_review['category']=sent


# The distribution of sentiments
google_review.groupby('category').count().plot(kind='bar')

In [95]:
n=google_review[google_review['category']=='Negative']

for index,row in n.iterrows():
    print(row['comments'])

Bought and paid full items (16 Sept) as they promise to deliver by 28 Sept. They came in at night time with all items broken while still inside packaging. Wth! Today is Oct 4 still undelivered!
Very bad costumer service! They promise again …
Walked in shop at 8:37pm on a thursday night and was greeted with "excuse me, as we are closing shortly at 9pm, pls tell me how can i help you?!
Scanteak Fishing stool broke in June 2020 after 1 week of usage. I fell when the stool gave way suddenly. They sent replacement and did not ask how serious was my fall.

Replacement broke this week. I fell again because the stool gave way …
Very disappointed about my recent purchase. I purchased the so-called designer range Mono chair 2 weeks ago, after careful usage for for less than 2 weeks, found the chair top surface got scratched badly by the zip head of the original …
Recently purchase the cabinet and very disappointed.  The cabinet delivery two times still chip-off.  No quality control.  Don't ever 

# Castlery (forum)

In [120]:
castlery=pd.read_csv('./data/clean_consolidated_castlery (forum).csv')

# drop missing rows
# castlery.dropna(axis=0, inplace=True)

castlery=castlery.drop(['Unnamed: 0','Author','Reply to','Message Replying to','Permalink',
              'Score','Subpage/Subreddit','TimeStamp','Year','Source'],axis=1)

sent=[]

for index,row in castlery.iterrows():
    r=predict_class([str(row['Body'])])
    sent.append(r)

castlery['category']=sent

# The distribution of sentiments
castlery.groupby('category').count().plot(kind='bar')



In [125]:
# n=castlery[castlery['category']=='Negative']

# for index,row in n.iterrows():
#     print(row['Body'])
#     print('----------------------')

# Furniture (forum)

In [117]:
furniture=pd.read_csv('./data/clean_consolidated_furniture (forum).csv')

# drop missing rows
# furniture.dropna(axis=0, inplace=True)

furniture=furniture.drop(['Unnamed: 0','Author','Reply to','Message Replying to','Permalink',
              'Score','TimeStamp','Year','Source'],axis=1)

sent=[]

for index,row in furniture.iterrows():
    r=predict_class([str(row['Body'])])
    sent.append(r)

furniture['category']=sent

# The distribution of sentiments
furniture.groupby('category').count().plot(kind='bar')



In [119]:
# n=furniture[furniture['category']=='Negative']

# for index,row in n.iterrows():
#     print(row['Body'])
#     print('---------------------------')

# Grey Sanders (forum)

In [126]:
Grey=pd.read_csv('./data/clean_consolidated_greysanders (forum).csv')

# # drop missing rows
# Grey.dropna(axis=0, inplace=True)

Grey=Grey.drop(['Unnamed: 0','Author','Reply to','Message Replying to','Permalink',
              'Score','TimeStamp','Year','Source'],axis=1)

sent=[]

for index,row in Grey.iterrows():
    r=predict_class([str(row['Body'])])
    sent.append(r)

Grey['category']=sent

# The distribution of sentiments
Grey.groupby('category').count().plot(kind='bar')

# Scanteak

In [132]:
Scanteak=pd.read_csv('./data/clean_consolidated_scanteak (forum).csv')

# # drop missing rows

Scanteak=Scanteak.drop(['Unnamed: 0','Author','Reply to','Message Replying to','Permalink',
              'Score','Subpage/Subreddit','TimeStamp','Year','Source','Search Term/Query'],axis=1)

sent=[]

for index,row in Scanteak.iterrows():
    r=predict_class([str(row['Body'])])
    sent.append(r)

Scanteak['category']=sent

# The distribution of sentiments
Scanteak.groupby('category').count().plot(kind='bar')

# Starliving

In [135]:
starliving=pd.read_csv('./data/clean_consolidated_starliving (forum).csv')

# # drop missing rows

starliving=starliving.drop(['Unnamed: 0','Author','Reply to','Message Replying to','Permalink',
              'Score','TimeStamp','Year','Source'],axis=1)

sent=[]

for index,row in starliving.iterrows():
    r=predict_class([str(row['Body'])])
    sent.append(r)

starliving['category']=sent

# The distribution of sentiments
starliving.groupby('category').count().plot(kind='bar')

# Wihardja

In [137]:
Wihardja=pd.read_csv('./data/clean_consolidated_wihardja (forum).csv')

# # drop  rows

Wihardja=Wihardja.drop(['Unnamed: 0','Author','Reply to','Message Replying to','Permalink',
              'Score','TimeStamp','Year','Source'],axis=1)

sent=[]

for index,row in Wihardja.iterrows():
    r=predict_class([str(row['Body'])])
    sent.append(r)

Wihardja['category']=sent

# The distribution of sentiments
Wihardja.groupby('category').count().plot(kind='bar')