In [1]:
import nltk
nltk.download('vader_lexicon')

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers
from keras import optimizers

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Keras NN Multiple Classification

In [2]:
df = pd.read_csv('Tweet.csv')
df_up = pd.read_csv('Upsampled.csv')

In [3]:
df = df.drop(columns='Unnamed: 0')

In [4]:
df.head(5) # normal 

Unnamed: 0,Tweet,Platform,Emotion,Positive_Bin
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1


In [5]:
df_up = df_up.drop(columns='Unnamed: 0')

In [6]:
df_up.head(5) # upsampled for increased number of negative tweets

Unnamed: 0,Tweet,Platform,Emotion,Positive_Bin
0,At #sxsw #tapworthy iPad Design Headaches - av...,iPad,Negative emotion,0
1,RT @mention Part of Journalsim is the support ...,,Negative emotion,0
2,Fuck the iphone! RT @mention New #UberSocial f...,iPhone,Negative emotion,0
3,#SXSW 2011: Novelty of iPad news apps fades fa...,iPad,Negative emotion,0
4,New #SXSW rule: no more ooing and ahing over y...,iPad,Negative emotion,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3548 entries, 0 to 3547
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Tweet         3548 non-null   object
 1   Platform      3191 non-null   object
 2   Emotion       3548 non-null   object
 3   Positive_Bin  3548 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 111.0+ KB


In [8]:
df_up.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Tweet         3500 non-null   object
 1   Platform      3171 non-null   object
 2   Emotion       3500 non-null   object
 3   Positive_Bin  3500 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 109.5+ KB


In [9]:
df_up['Positive_Bin'].value_counts()

1    2500
0    1000
Name: Positive_Bin, dtype: int64

In [10]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [11]:
sid = SentimentIntensityAnalyzer()

In [12]:
df_up['scores'] = df_up['Tweet'].apply(lambda review:sid.polarity_scores(review))

In [13]:
df_up['compound'] = df_up['scores'].apply(lambda d:d['compound'])

In [14]:
df_up['comp_score'] = df_up['compound'].apply(lambda score: 1 if score >= 0 else 0)

In [15]:
df_up.head()

Unnamed: 0,Tweet,Platform,Emotion,Positive_Bin,scores,compound,comp_score
0,At #sxsw #tapworthy iPad Design Headaches - av...,iPad,Negative emotion,0,"{'neg': 0.153, 'neu': 0.764, 'pos': 0.083, 'co...",-0.2732,0
1,RT @mention Part of Journalsim is the support ...,,Negative emotion,0,"{'neg': 0.0, 'neu': 0.63, 'pos': 0.37, 'compou...",0.8796,1
2,Fuck the iphone! RT @mention New #UberSocial f...,iPhone,Negative emotion,0,"{'neg': 0.166, 'neu': 0.834, 'pos': 0.0, 'comp...",-0.5848,0
3,#SXSW 2011: Novelty of iPad news apps fades fa...,iPad,Negative emotion,0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1
4,New #SXSW rule: no more ooing and ahing over y...,iPad,Negative emotion,0,"{'neg': 0.083, 'neu': 0.83, 'pos': 0.087, 'com...",0.0258,1


In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix

In [17]:
accuracy_score(df_up['Positive_Bin'],df_up['comp_score'])

0.7537142857142857

In [18]:
print(classification_report(df_up['Positive_Bin'],df_up['comp_score']))

              precision    recall  f1-score   support

           0       0.61      0.39      0.47      1000
           1       0.79      0.90      0.84      2500

    accuracy                           0.75      3500
   macro avg       0.70      0.64      0.66      3500
weighted avg       0.74      0.75      0.73      3500



In [19]:
confusion_matrix(df_up['Positive_Bin'],df_up['comp_score'])

array([[ 389,  611],
       [ 251, 2249]], dtype=int64)

In [20]:
full_df = pd.read_csv('Full_DF')

In [21]:
full_df.head()

Unnamed: 0.1,Unnamed: 0,Tweet,Platform,Emotion,Uncertain,Negative,No Emotion,Positive
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0,1,0,0
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,0,0,0,1
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,0,0,0,1
3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0,1,0,0
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,0,0,0,1


In [22]:
full_df = full_df.drop(columns='Unnamed: 0')

In [23]:
full_df.head(10)
full_df = full_df.dropna()

In [24]:
tweets = full_df['Tweet']
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)
print('sequences type: ' , type(sequences))

sequences type:  <class 'list'>


In [25]:
one_hot_results = tokenizer.texts_to_matrix(tweets, mode='binary')
print('one_hot_results type:', type(one_hot_results))

one_hot_results type: <class 'numpy.ndarray'>


In [26]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index)) 

Found 5963 unique tokens.


In [27]:
# Our coded data
print('Dimensions of our coded results:', np.shape(one_hot_results)) 

Dimensions of our coded results: (3291, 5000)


In [28]:
print(y.shape)
print(one_hot_results.shape)

NameError: name 'y' is not defined

In [None]:
emotion = full_df['Emotion']

# Initialize
le = preprocessing.LabelEncoder() 
le.fit(emotion)
print('Original class labels:')
print(list(le.classes_))
print('\n')
emotion_cat = le.transform(emotion)  

# If you wish to retrieve the original descriptive labels post production
# list(le.inverse_transform([0, 1, 3, 3, 0, 6, 4])) 

print('New product labels:')
print(emotion_cat)
print('\n')

# Each row will be all zeros except for the category for that observation 
print('One hot labels; 4 binary columns, one for each of the categories.') 
product_onehot = to_categorical(emotion_cat)
print(product_onehot)
print('\n')

print('One hot labels shape:')
print(np.shape(product_onehot))

In [None]:
random.seed(123)
test_index = random.sample(range(1,3200), 1500)

test = one_hot_results[test_index]
train = np.delete(one_hot_results, test_index, 0)

label_test = product_onehot[test_index]
label_train = np.delete(product_onehot, test_index, 0)

print('Test label shape:', np.shape(label_test))
print('Train label shape:', np.shape(label_train))
print('Test shape:', np.shape(test))
print('Train shape:', np.shape(train))

In [None]:
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential

In [None]:
# embedding_size = 128
# model.add(Embedding(embedding_size))
# model.add(Dense(16,input_dim=2, activation='relu'))
# model.add(LSTM(8,input_dim=2, activation='relu'))
# model.add(Dense(2, activation='sigmoid'))
# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               metrics=['precision'])
# model.summary()

In [None]:
# Initialize a sequential model
model = models.Sequential()
# Two layers with relu activation
model.add(layers.Dense(50, activation='relu', input_shape=(5000,)))
model.add(layers.Dense(25, activation='relu'))
model.add(layers.Dense(4, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
history = model.fit(train,
                    label_train,
                    epochs=20,
                    batch_size=32)

In [None]:
history_dict = history.history

In [None]:
history_dict.keys()

In [None]:
history_dict = history.history
loss_values = history_dict['loss']

epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, loss_values, 'g', label='Training loss')

plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Plot the training accuracy vs the number of epochs

acc_values = history_dict['acc'] 

plt.plot(epochs, acc_values, 'r', label='Training acc')
plt.title('Training accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Output (probability) predictions for the test set 
y_hat_test = model.predict(test) 

In [None]:
# Print the loss and accuracy for the training set 
results_train = model.evaluate(train, label_train)
results_train

In [None]:
results_test = model.evaluate(test, label_test)
results_test # model predicts on the test data with 81% accuracy. 