In [1]:
import numpy as np
import pandas as pd

import keras
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from gensim.utils import simple_preprocess

import re

# `plt` must alias the pyplot sub-module; the bare `matplotlib` package
# exposes none of the plotting functions (plot, subplots, ...).
import matplotlib.pyplot as plt

# Fetch the NLTK stop-word corpus read by `stopwords.words(...)`.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Colab-only: mount Google Drive at /content/gdrive/ so files stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
# Load the tweets CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv(r'/content/gdrive/My Drive/Datasets/Tweets.csv')

In [4]:
# (rows, columns) of the raw frame.
data.shape

(14640, 15)

In [5]:
# Preview the first two rows.
data.head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)


In [6]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [7]:
stop_words = stopwords.words("english")
stemmer = PorterStemmer()

def preprocess_text(doc, remove_handles=True, remove_stopwords=False, stop_words=None, stem=True):
  """Normalise one raw tweet.

  Pipeline: optionally strip @handles, collapse every run of digits to the
  literal "num", tokenise with gensim's `simple_preprocess` (deacc=True),
  then optionally drop stop words and/or Porter-stem each token.

  Args:
    doc: Raw tweet text (str).
    remove_handles: If True, delete "@name" mentions before tokenising.
    remove_stopwords: If True, drop every token found in `stop_words`.
    stop_words: Iterable of tokens to drop; None is treated as empty.
    stem: If True, stem each surviving token with the module-level `stemmer`.

  Returns:
    When `remove_stopwords` or `stem` is set: a single space-joined string,
    or the placeholder "stopword" if no token survived.
    When both are False: the token list from `simple_preprocess`, unchanged
    (kept as-is so existing callers see the same type).
  """
  if remove_handles:
    # A handle is "@" followed by word characters (letters, digits, underscore).
    doc = re.sub(r"@\w+", "", doc)
  # Format numbers so they all match; 'num' is used rather than '#'
  # so numbers cannot be confused with hashtags.
  doc = re.sub(r"\d+", "num", doc)
  tokens = simple_preprocess(doc=doc, deacc=True)

  if not (remove_stopwords or stem):
    return tokens

  if remove_stopwords:
    # Set membership is O(1); the NLTK list would be scanned once per token.
    blocked = frozenset(stop_words if stop_words is not None else ())
    tokens = [tok for tok in tokens if tok not in blocked]
  if stem:
    tokens = [stemmer.stem(tok) for tok in tokens]

  # If nothing is left (e.g. the tweet was only stop words or handles),
  # emit a placeholder so downstream vectorisers never see an empty document.
  if not tokens:
    tokens = ["stopword"]
  return ' '.join(tokens)

In [8]:
# Run every tweet through preprocess_text: handles stripped, tokens stemmed,
# stop words kept. Series.apply forwards the keyword arguments to the function.
texts = data["text"].apply(
    preprocess_text,
    remove_handles=True,
    remove_stopwords=False,
    stop_words=stop_words,
    stem=True,
)

In [9]:
# Spot-check the first five cleaned tweets.
texts[:5]

0                                            what said
1           plu you ve ad commerci to the experi tacki
2         didn today must mean need to take anoth trip
3    it realli aggress to blast obnoxi entertain in...
4                 and it realli big bad thing about it
Name: text, dtype: object

In [12]:
# Integer-encode the sentiment labels to obtain the target vector
le = LabelEncoder()
y = le.fit_transform(data.airline_sentiment)

# Hold out 20% of the cleaned tweets for testing
X_train, X_test, y_train, y_test = train_test_split(texts, y, test_size=.2, random_state=1)

# Vocabulary cap and padded sequence length (tweets are short, so 50 tokens is plenty)
max_vocab_size = 10000
max_seq_len = 50

# Fit the tokenizer on the training tweets only, with an explicit out-of-vocabulary token
tokenizer = Tokenizer(
    num_words=max_vocab_size,
    filters='!"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    oov_token='OOV',
)
tokenizer.fit_on_texts(X_train)

def _to_padded(docs):
  """Map texts to integer sequences and post-pad them to max_seq_len."""
  return pad_sequences(tokenizer.texts_to_sequences(docs), maxlen=max_seq_len, padding='post')

# Training and test tweets as padded integer sequences
X_train = _to_padded(X_train)
X_test = _to_padded(X_test)

# All tweets as sequences, to use for grid search hyperparameter tuning
X = _to_padded(texts)

**Use grid search to tune hyperparameters**

In [13]:
def create_model(filters=256, kernel_size=5, rate=.2):
  """Build and compile the 1-D CNN sentiment classifier.

  Architecture: Embedding(128) -> Conv1D -> GlobalMaxPooling1D ->
  [Dropout -> Dense(64, relu)] -> [Dropout -> Dense(8, relu)] ->
  Dropout -> Dense(3, softmax). Reads the module-level `tokenizer` to size
  the embedding table.

  Args:
    filters: Number of Conv1D output filters.
    kernel_size: Width of the Conv1D window, in tokens.
    rate: Dropout rate applied before each Dense layer.

  Returns:
    A compiled `Sequential` model (adam, sparse categorical cross-entropy,
    accuracy metric) expecting integer class labels.
  """
  # Index 0 is reserved for padding, hence the +1. When the tokenizer was
  # built with `num_words`, it never emits an index >= num_words, so rows
  # beyond that would be allocated but never looked up.
  input_dim = len(tokenizer.word_index) + 1
  if tokenizer.num_words:
    input_dim = min(input_dim, tokenizer.num_words)

  model = Sequential([
      Embedding(input_dim=input_dim, output_dim=128),
      Conv1D(filters, kernel_size),
      # Global pooling already yields a (batch, filters) tensor, so no
      # Flatten layer is needed before the dense head.
      GlobalMaxPooling1D(),
      Dropout(rate),
      Dense(64, activation='relu'),
      Dropout(rate),
      Dense(8, activation='relu'),
      Dropout(rate),
      Dense(3, activation='softmax'),
  ])

  model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

In [15]:
# Wrap the Keras builder so scikit-learn can cross-validate it (5 epochs per fit)
model = KerasClassifier(build_fn=create_model, epochs=5)

# Search space: 2 filter counts x 4 kernel widths x 4 dropout rates
param_grid = {
    'filters': [256, 512],
    'kernel_size': [3, 4, 5, 6],
    'rate': [.2, .3, .4, .5],
}

# Rank candidates by micro-averaged F1, using every available core
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_micro',
    n_jobs=-1,
)

In [16]:
# Tune on the training split only. `X`/`y` also contain the held-out test
# tweets; searching over them would let the test set influence the chosen
# hyperparameters and inflate the score reported on X_test afterwards.
# (fit() returns the GridSearchCV object itself, not a Keras History, so the
# result is not bound to a `history` name.)
grid.fit(X=X_train, y=y_train)
print(grid.best_params_)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
{'filters': 256, 'kernel_size': 3, 'rate': 0.2}


**Final Model**

In [14]:
# Build the final CNN with the hyperparameters chosen by the grid search
# ({'filters': 256, 'kernel_size': 3, 'rate': 0.2}). create_model() defines
# and compiles this architecture, so reuse it rather than keeping a second
# hand-copied layer stack that can drift out of sync.
model = create_model(filters=256, kernel_size=3, rate=.2)

In [15]:
# Train the final CNN on the training split
history = model.fit(X_train, y_train, batch_size=64, epochs=15)

# Predicted class = index of the largest softmax output for each test tweet
class_probs = model.predict(X_test)
y_pred = class_probs.argmax(axis=-1)

# Micro-averaged F1 on the held-out test split
print(f1_score(y_test, y_pred, average='micro'))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
0.7520491803278688


**OneVsRest Classifier**

In [16]:


# Tune on the preprocessed *training* tweets only. The final OneVsRest model
# is trained on `texts` (not raw `data.text`) and scored on the held-out 20%,
# so the search must see the same kind of input and must not see test rows.
# Same split arguments as elsewhere in the notebook, hence the same partition.
X_tune, _, y_tune, _ = train_test_split(texts, y, test_size=.2, random_state=1)

# Create pipeline for preprocessing and classifier
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(strip_accents='unicode')),
    ('classifier', OneVsRestClassifier(LogisticRegression(max_iter=250, random_state=0))),
])

# Use grid search to optimize best hyperparameters.
# LogisticRegression's default lbfgs solver rejects penalty='l1', so pairing
# them makes every l1 candidate fail and score NaN. Keep the default solver
# for l2 and switch to liblinear (which supports l1) only for the l1 candidates.
c_values = [2, 3, 4, 5, 6]
param_grid = [
    {
        'classifier__estimator__C': c_values,
        'classifier__estimator__penalty': ['l2'],
    },
    {
        'classifier__estimator__C': c_values,
        'classifier__estimator__penalty': ['l1'],
        'classifier__estimator__solver': ['liblinear'],
    },
]
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1_micro', n_jobs=-1)
grid.fit(X_tune, y_tune)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [17]:
# Show the hyperparameter combination that scored best in the grid search
print(grid.best_params_)

{'classifier__estimator__C': 4, 'classifier__estimator__penalty': 'l2'}


In [19]:
# Re-split the cleaned tweets: X_train/X_test were overwritten with padded
# integer sequences for the CNN, whereas TfidfVectorizer needs the text itself.
X_train, X_test, y_train, y_test = train_test_split(texts, y, test_size=.2, random_state=1)

# Assemble the pipeline with the best hyperparameters found (C=4) and train it
tfidf_step = ('tfidf', TfidfVectorizer(strip_accents='unicode'))
classifier_step = (
    'classifier',
    OneVsRestClassifier(LogisticRegression(C=4, max_iter=250, random_state=0)),
)
model = Pipeline(steps=[tfidf_step, classifier_step])
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents='unicode',
                                 sublinear_tf=False,
                                 token_patt...
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 OneVsRestClassifier(estimator=LogisticRegression(C=4,
                                                                  class_weight=None,


In [20]:
# Micro-averaged F1 of the pipeline on the held-out test split
y_pred = model.predict(X_test)
f1_score(y_test, y_pred, average='micro')

0.7995218579234973