## Importing necessary Libs

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import pad_sequences, to_categorical


In [None]:
nltk.download(['all-corpora'])

## Loading xlsx file to a dataframe

In [2]:
df = pd.read_excel("LabeledText.xlsx")
df.head()

Unnamed: 0,File Name,Caption,LABEL
0,1.txt,How I feel today #legday #jelly #aching #gym,negative
1,10.txt,@ArrivaTW absolute disgrace two carriages from...,negative
2,100.txt,This is my Valentine's from 1 of my nephews. I...,positive
3,1000.txt,betterfeelingfilms: RT via Instagram: First da...,neutral
4,1001.txt,Zoe's first love #Rattled @JohnnyHarper15,positive


In [3]:
df.shape

(4869, 3)

#### See label distribuition of dataset

In [4]:
df['LABEL'].value_counts(normalize=True) * 100

neutral     36.372972
positive    33.805710
negative    29.821319
Name: LABEL, dtype: float64

#### Check if exists null data

In [5]:
df.isnull().sum()

File Name    0
Caption      0
LABEL        0
dtype: int64

## Prepare captions, remove stop words and mentions

In [6]:
def remove_stopwords(input):
    # this lib suport portuguese, but my dataset is in english 
    list_stp = stopwords.words('english')
    whitelist = ["n't", 'not', 'no']
    
    # split phrase by word
    words  = input.split()
    clean = [word for word in words if word not in list_stp or word in whitelist and len(words) > 1]
    return ' '.join(clean)

remove_mention = lambda text: re.sub(r'@\w+', '', text)
remove_hashtag = lambda text: re.sub('#', '', text)
remove_urls = lambda text: re.sub('https?:\/\/\S+', '', text)


def clean_text(input_text):
    clean = remove_stopwords(input_text)
    clean = remove_mention(clean)
    clean = remove_hashtag(clean)
    clean = remove_urls(clean)
    
    
    return clean
 

In [7]:
sample = df['Caption'][1]

clear_text = remove_stopwords(sample)
print(f'remove stop_words: {clear_text}')

clear_text = remove_mention(sample)
print(f'remove mentions: {clear_text}')

clear_text = remove_hashtag(sample)
print(f'remove hashtags: {clear_text}')

clear_text = clean_text(sample)
print(f'text clean: {clear_text}')

remove stop_words: @ArrivaTW absolute disgrace two carriages Bangor half way standing room #disgraced
remove mentions:  absolute disgrace two carriages from Bangor half way there standing room only #disgraced 
remove hashtags: @ArrivaTW absolute disgrace two carriages from Bangor half way there standing room only disgraced 
text clean:  absolute disgrace two carriages Bangor half way standing room disgraced


### Apply clean into text

In [8]:
df['cleaned_text'] = df['Caption'].apply(clean_text)
df.head()

Unnamed: 0,File Name,Caption,LABEL,cleaned_text
0,1.txt,How I feel today #legday #jelly #aching #gym,negative,How I feel today legday jelly aching gym
1,10.txt,@ArrivaTW absolute disgrace two carriages from...,negative,absolute disgrace two carriages Bangor half w...
2,100.txt,This is my Valentine's from 1 of my nephews. I...,positive,This Valentine's 1 nephews. I elated; sometime...
3,1000.txt,betterfeelingfilms: RT via Instagram: First da...,neutral,betterfeelingfilms: RT via Instagram: First da...
4,1001.txt,Zoe's first love #Rattled @JohnnyHarper15,positive,Zoe's first love Rattled


In [9]:
np.quantile(df['cleaned_text'].apply(lambda x: len(x.split())),.5), np.mean(df['cleaned_text'].apply(lambda x: len(x.split())))

(10.0, 9.874512220168413)

In [10]:
unique_chars = set(''.join(df['cleaned_text'].str.lower()))
unique_words = set(np.hstack(df['cleaned_text'].str.lower().str.split()))
avg_words = round(np.mean(df['cleaned_text'].apply(lambda x: len(x.split()))))
max_number_of_words = max(df['cleaned_text'].apply(lambda x: len(x.split())))


print(f'number of unique characters in the cleaned text column: {len(unique_chars)}')
print(f'number of unique words in the cleaned text column: {len(unique_words)}')
print(f'average number of words in the cleaned text column: {avg_words}')
print(f'max number of words in the cleaned text column: {max_number_of_words}')

number of unique characters in the cleaned text column: 160
number of unique words in the cleaned text column: 17125
average number of words in the cleaned text column: 10
max number of words in the cleaned text column: 28


## Splitting the dataset into train and test


In [11]:
x_train, x_test, y_train, y_test = train_test_split(df['cleaned_text'], df['LABEL'], test_size=0.2, stratify=df['LABEL'])

## Tokenizer words

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['cleaned_text'])

x_train_s = tokenizer.texts_to_sequences(x_train)
x_test_s = tokenizer.texts_to_sequences(x_test)

In [13]:
x_train[:5]

1400    Intrepid 2016 Elite goes 9-2-1 summer season. ...
2749    . Dogs R often stunned electrocution take 3 mi...
363      new garden year delighted thankyouforvisiting...
4063    RT : The new Spectre trailer intense, action-p...
132     plastic workshop Loughborough uni pink sad art...
Name: cleaned_text, dtype: object

In [14]:
x_train_s[:5]

[[7345, 1380, 7346, 705, 282, 20, 25, 229, 918, 159, 3927, 1430, 80, 7347],
 [209, 237, 1196, 365, 10012, 16, 54, 2403, 456, 1618, 10013, 606],
 [8, 1177, 96, 91, 5543, 5544],
 [1, 4, 8, 12363, 1240, 137, 1307, 4805, 462, 166],
 [1354, 5091, 5092, 2980, 767, 254, 127, 5093, 1155, 66]]

In [15]:
tokenizer.sequences_to_texts(x_train_s)[:5]

['intrepid 2016 elite goes 9 2 1 summer season way rep orange blue intrepidfamily',
 'dogs r often stunned electrocution take 3 mins others die dragging stopboknal2015',
 'new garden year delighted thankyouforvisiting longtailedtit',
 'rt the new spectre trailer intense action packed finally here',
 'plastic workshop loughborough uni pink sad art juxtaposition bright vibrant']

### padding sequences to the same length

In [16]:
max_length = max_number_of_words

padded_x_train = pad_sequences(x_train_s, maxlen=max_length, padding='post', truncating='post')
padded_x_test = pad_sequences(x_test_s, maxlen=max_length, padding='post', truncating='post')

### One hot encoding the target variable

In [17]:
mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

y_train_mapped = y_train.map(mapping)
y_test_mapped = y_test.map(mapping)

In [18]:
print('target before  mapping')
print(y_train[:5])

print('target after mapping')
print(y_train_mapped[:5])

target before  mapping
1400    positive
2749    negative
363     positive
4063    negative
132     positive
Name: LABEL, dtype: object
target after mapping
1400    2
2749    0
363     2
4063    0
132     2
Name: LABEL, dtype: int64


In [19]:
y_train_encoded = to_categorical(y_train_mapped)
y_test_encoded = to_categorical(y_test_mapped)

## Building a model to classify the captions

In [20]:
vocab_length = len(tokenizer.word_index) + 1
print(f'Length: {max_length}')
print(f'Vocabulary size: {vocab_length}')

Length: 28
Vocabulary size: 13826


In [21]:
from keras.models import Sequential
from keras.layers.convolutional import Conv1D
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import MaxPooling1D
from keras.layers import  GlobalMaxPooling1D, MaxPooling1D

In [22]:
model = Sequential(name='sentiment_classification')

model = Sequential()
model.add(Embedding(input_dim=vocab_length, output_dim=300, input_length=max_length))
model.add(Conv1D(activation="relu", padding="valid", filters=300, kernel_size=7))
model.add(GlobalMaxPooling1D())
model.add(Dense(600, activation='relu'))
model.add(Dense(len(set(y_train)), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28, 300)           4147800   
                                                                 
 conv1d (Conv1D)             (None, 22, 300)           630300    
                                                                 
 global_max_pooling1d (Globa  (None, 300)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 600)               180600    
                                                                 
 dense_1 (Dense)             (None, 3)                 1803      
                                                                 
Total params: 4,960,503
Trainable params: 4,960,503
Non-trainable params: 0
______________________________________________

In [50]:
model.fit(padded_x_train, np.array(y_train_encoded), epochs=10, verbose=2, validation_split=0.1)

Epoch 1/10
110/110 - 8s - loss: 0.0019 - accuracy: 0.9994 - val_loss: 1.3892 - val_accuracy: 0.6923 - 8s/epoch - 74ms/step
Epoch 2/10
110/110 - 8s - loss: 0.0020 - accuracy: 0.9994 - val_loss: 1.3605 - val_accuracy: 0.6923 - 8s/epoch - 74ms/step
Epoch 3/10
110/110 - 8s - loss: 0.0013 - accuracy: 0.9994 - val_loss: 1.4019 - val_accuracy: 0.6974 - 8s/epoch - 75ms/step
Epoch 4/10
110/110 - 8s - loss: 0.0012 - accuracy: 0.9994 - val_loss: 1.4271 - val_accuracy: 0.6846 - 8s/epoch - 74ms/step
Epoch 5/10
110/110 - 8s - loss: 8.9479e-04 - accuracy: 0.9997 - val_loss: 1.5352 - val_accuracy: 0.6872 - 8s/epoch - 75ms/step
Epoch 6/10
110/110 - 8s - loss: 8.3639e-04 - accuracy: 0.9997 - val_loss: 1.4776 - val_accuracy: 0.6974 - 8s/epoch - 74ms/step
Epoch 7/10
110/110 - 8s - loss: 0.0014 - accuracy: 0.9994 - val_loss: 1.4843 - val_accuracy: 0.6821 - 8s/epoch - 74ms/step
Epoch 8/10
110/110 - 8s - loss: 0.0011 - accuracy: 0.9997 - val_loss: 1.5635 - val_accuracy: 0.6872 - 8s/epoch - 74ms/step
Epoch 9/

<keras.callbacks.History at 0x1858a6a27c0>

In [51]:
loss, acc = model.evaluate(padded_x_test, y_test_encoded)
print(f'Loss: {loss}')
print(f'Accuracy: {round(acc*100,2)}%')

Loss: 1.5056393146514893
Accuracy: 69.1%


## Make predictions

In [70]:
sample = "Happy to see you in the morning"
data = pad_sequences(tokenizer.texts_to_sequences([sample]), maxlen=max_length, padding='post')

In [71]:
prediction =  model.predict(data)
print(f'prediction: {prediction.shape}')

prediction: (1, 3)


In [78]:
result = list(mapping.keys())[prediction.argmax(1)[0]]
print(f"Sentiment: {result}")

Sentiment: positive
