In [51]:
import numpy as np
import pandas as pd

import keras
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

import xgboost as xgb

from gensim.utils import simple_preprocess

import re

# `plt` is the conventional alias for the pyplot interface; aliasing the
# top-level package instead would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Colab-only: mount Google Drive under /content/gdrive/ so files stored there
# can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [5]:
# Load the tweets CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv(r'/content/gdrive/My Drive/Datasets/Tweets.csv')

In [None]:
# (rows, columns) of the loaded frame
data.shape

(14640, 15)

In [None]:
# Peek at the first two rows to see what the columns hold
data.head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)


In [None]:
# Per-column dtype and non-null count
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [52]:
# Shared Porter stemmer instance; preprocess_text reads this module-level name.
stemmer = PorterStemmer()
# NLTK's English stop-word list (needs the "stopwords" corpus to be downloaded).
stop_words = stopwords.words('english')

def preprocess_text(doc, remove_handles=False, remove_stopwords=False, stop_words=list(), stem=False):
  """Normalize one tweet for downstream tokenization.

  Steps, in order: optionally strip @handles, replace every run of digits
  with the literal token ``num``, tokenize with gensim's ``simple_preprocess``
  (``deacc=True``), then optionally drop stop words and/or stem each
  remaining token with the module-level ``stemmer``.

  Args:
    doc: Raw tweet text.
    remove_handles: Strip ``@handle`` mentions before tokenizing.
    remove_stopwords: Drop tokens that appear in ``stop_words``.
    stop_words: Tokens to drop; only consulted when ``remove_stopwords`` is
      true. The default is never mutated.
    stem: Stem every kept token.

  Returns:
    A space-joined string when ``remove_stopwords`` or ``stem`` is set
    (the single token ``"stopword"`` if filtering left nothing). When neither
    flag is set, the token list from ``simple_preprocess`` is returned as-is;
    this is kept for backward compatibility with existing callers.
  """
  if remove_handles:
    # '@' followed by one or more word characters (letters, digits, underscore)
    doc = re.sub(r"@\w+", "", doc)
  # format numbers so they all match. To avoid being confused as hashtags, I use 'num' instead of #
  doc = re.sub(r"\d+", "num", doc)
  # use gensim's simple_preprocess to clean up and tokenize the text
  tokens = simple_preprocess(doc=doc, deacc=True)

  if not (remove_stopwords or stem):
    return tokens

  if remove_stopwords:
    # Filtering is independent of stemming: kept tokens survive even when
    # stem=False (previously they were all discarded in that case).
    blocked = set(stop_words)
    tokens = [token for token in tokens if token not in blocked]
  if stem:
    tokens = [stemmer.stem(token) for token in tokens]

  # if the entire tweet happened to be stop words, replace with "stopword" token
  if not tokens:
    tokens = ["stopword"]
  return ' '.join(tokens)

In [53]:
# clean up/preprocess tweet text
# Normalize every tweet: strip @handles and stem the tokens. Stop words are
# kept (the list is forwarded but unused while remove_stopwords=False).
texts = data["text"].apply(
  preprocess_text,
  remove_handles=True,
  remove_stopwords=False,
  stop_words=stop_words,
  stem=True,
)

In [54]:
# Spot-check the first five cleaned tweets
texts.head(5)

0                                            what said
1           plu you ve ad commerci to the experi tacki
2         didn today must mean need to take anoth trip
3    it realli aggress to blast obnoxi entertain in...
4                 and it realli big bad thing about it
Name: text, dtype: object

In [55]:
# Set max vocab size, oov-token, and fit tokenizer
# Vocabulary cap and fixed sequence width; tweets are short, so 60 tokens is enough.
max_vocab_size = 10000
max_seq_len = 60

# Word-level tokenizer capped at max_vocab_size, with 'OOV' as the
# out-of-vocabulary token; fitted on the cleaned tweets.
tokenizer = Tokenizer(
  num_words=max_vocab_size,
  filters='!"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
  oov_token='OOV',
)
tokenizer.fit_on_texts(texts)

# Integer-encode each tweet, then pad at the end ('post') to max_seq_len.
X = pad_sequences(
  tokenizer.texts_to_sequences(texts),
  maxlen=max_seq_len,
  padding='post',
)

# Encode the sentiment labels as integers; `le` is kept so the mapping can be inverted.
le = LabelEncoder()
y = le.fit_transform(data["airline_sentiment"])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=1
)

**Use grid search to find best number of filters**

In [None]:
def create_model_filters(filters=256):
  """Build and compile the text CNN with a configurable Conv1D filter count.

  Reads the module-level ``tokenizer`` to size the embedding table.

  Args:
    filters: Number of Conv1D filters (kernel size is fixed at 5).

  Returns:
    A compiled Sequential model ending in a 3-unit softmax layer.
  """
  # One embedding row per entry in the tokenizer's word index, plus one.
  vocab_rows = len(tokenizer.word_index) + 1

  # NOTE(review): GlobalMaxPooling1D should already return a 2-D tensor, which
  # would make the Flatten layer a no-op -- confirm before removing it.
  stack = [
    Embedding(input_dim=vocab_rows, output_dim=128),
    Conv1D(filters, 5),
    GlobalMaxPooling1D(),
    Flatten(),
    Dropout(.4),
    Dense(64, activation='relu'),
    Dropout(.4),
    Dense(8, activation='relu'),
    Dropout(.4),
    Dense(3, activation='softmax'),
  ]
  model = Sequential(stack)
  model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
  )
  return model

In [None]:
# Wrap the Keras builder as a scikit-learn estimator and search over the
# number of Conv1D filters, scored by micro-averaged F1.
model = KerasClassifier(build_fn=create_model_filters, epochs=5)
param_grid = dict(filters=[256, 512])
grid = GridSearchCV(
  estimator=model,
  param_grid=param_grid,
  scoring='f1_micro',
  n_jobs=-1,
)

In [None]:
# Tune on the training split only. X_test/y_test are used for the final score
# at the end of the notebook, so searching over the full X/y would leak the
# hold-out rows into hyperparameter selection.
# NOTE: GridSearchCV.fit returns the fitted search object itself, so `history`
# is the same object as `grid`, not a Keras History.
history = grid.fit(X=X_train, y=y_train)
print(grid.best_params_)
print(history)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
{'filters': 256}
GridSearchCV(cv=None, error_score=nan,
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f6901bfe0b8>,
             iid='deprecated', n_jobs=-1, param_grid={'filters': [256, 512]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)


**Use grid search to find best kernel size**

In [None]:
def create_model_kernel_size(kernel_size=5):
  """Build and compile the text CNN with a configurable Conv1D kernel size.

  Reads the module-level ``tokenizer`` to size the embedding table.

  Args:
    kernel_size: Width of the Conv1D kernel (filter count is fixed at 256).

  Returns:
    A compiled Sequential model ending in a 3-unit softmax layer.
  """
  # One embedding row per entry in the tokenizer's word index, plus one.
  vocab_rows = len(tokenizer.word_index) + 1

  # NOTE(review): GlobalMaxPooling1D should already return a 2-D tensor, which
  # would make the Flatten layer a no-op -- confirm before removing it.
  stack = [
    Embedding(input_dim=vocab_rows, output_dim=128),
    Conv1D(256, kernel_size),
    GlobalMaxPooling1D(),
    Flatten(),
    Dropout(.4),
    Dense(64, activation='relu'),
    Dropout(.4),
    Dense(8, activation='relu'),
    Dropout(.4),
    Dense(3, activation='softmax'),
  ]
  model = Sequential(stack)
  model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
  )
  return model

In [None]:
# scikit-learn-compatible wrapper around the kernel-size model builder (5 epochs per fit).
model = KerasClassifier(build_fn=create_model_kernel_size, epochs=5)

In [None]:
# Candidate Conv1D kernel widths, searched with micro-averaged F1 as the score.
param_grid = dict(kernel_size=[4, 5, 6])
grid = GridSearchCV(
  estimator=model,
  param_grid=param_grid,
  scoring='f1_micro',
  n_jobs=-1,
)

In [None]:
# Tune on the training split only. X_test/y_test are used for the final score
# at the end of the notebook, so searching over the full X/y would leak the
# hold-out rows into hyperparameter selection.
# NOTE: GridSearchCV.fit returns the fitted search object itself, so `history`
# is the same object as `grid`, not a Keras History.
history = grid.fit(X=X_train, y=y_train)
print(grid.best_params_)
print(history)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
{'kernel_size': 5}
GridSearchCV(cv=None, error_score=nan,
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7feada5f28d0>,
             iid='deprecated', n_jobs=-1, param_grid={'kernel_size': [4, 5, 6]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)


**Use grid search to find best dropout**


In [None]:
def create_model_dropout(rate=.2):
  """Build and compile the text CNN with a configurable dropout rate.

  Reads the module-level ``tokenizer`` to size the embedding table. The same
  ``rate`` is applied to all three Dropout layers.

  Args:
    rate: Dropout rate shared by every Dropout layer in the network.

  Returns:
    A compiled Sequential model ending in a 3-unit softmax layer.
  """
  # One embedding row per entry in the tokenizer's word index, plus one.
  vocab_rows = len(tokenizer.word_index) + 1

  # NOTE(review): GlobalMaxPooling1D should already return a 2-D tensor, which
  # would make the Flatten layer a no-op -- confirm before removing it.
  stack = [
    Embedding(input_dim=vocab_rows, output_dim=128),
    Conv1D(256, 5),
    GlobalMaxPooling1D(),
    Flatten(),
    Dropout(rate),
    Dense(64, activation='relu'),
    Dropout(rate),
    Dense(8, activation='relu'),
    Dropout(rate),
    Dense(3, activation='softmax'),
  ]
  model = Sequential(stack)
  model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
  )
  return model

In [None]:
# Wrap the dropout model builder as a scikit-learn estimator and search over
# the shared dropout rate, scored by micro-averaged F1.
model = KerasClassifier(build_fn=create_model_dropout, epochs=5)
param_grid = dict(rate=[0.2, 0.3, 0.4, 0.5])
grid = GridSearchCV(
  estimator=model,
  param_grid=param_grid,
  scoring='f1_micro',
  n_jobs=-1,
)

In [None]:
# Tune on the training split only. X_test/y_test are used for the final score
# at the end of the notebook, so searching over the full X/y would leak the
# hold-out rows into hyperparameter selection.
# NOTE: GridSearchCV.fit returns the fitted search object itself, so `history`
# is the same object as `grid`, not a Keras History.
history = grid.fit(X_train, y_train)
print(grid.best_params_)
print(history)

In [None]:
# Re-print the dropout search result (same two statements that end the previous cell).
print(grid.best_params_)
print(history)

{'rate': 0.5}
GridSearchCV(cv=None, error_score=nan,
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f672fc13ef0>,
             iid='deprecated', n_jobs=-1,
             param_grid={'rate': [0.2, 0.3, 0.4, 0.5]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring='f1_micro',
             verbose=0)


**Final Model**

In [56]:
# Final network: 256 filters, kernel size 5, dropout 0.5.
# One embedding row per entry in the tokenizer's word index, plus one.
input_dim = len(tokenizer.word_index) + 1

# NOTE(review): GlobalMaxPooling1D should already return a 2-D tensor, which
# would make the Flatten layer a no-op -- confirm before removing it.
model = Sequential([
  Embedding(input_dim=input_dim, output_dim=128),
  Conv1D(256, 5),
  GlobalMaxPooling1D(),
  Flatten(),
  Dropout(.5),
  Dense(64, activation='relu'),
  Dropout(.5),
  Dense(8, activation='relu'),
  Dropout(.5),
  Dense(3, activation='softmax'),
])

model.compile(
  loss='sparse_categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)

In [57]:
# Train on the training split, then report micro-averaged F1 on the held-out split.
history = model.fit(X_train, y_train, epochs=15, batch_size=64)
# Predicted class = index of the largest softmax output per row.
y_pred = model.predict(X_test).argmax(axis=-1)
print(f1_score(y_test, y_pred, average='micro'))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
0.7503415300546448
