# 1. Import the libraries, load dataset, print shape of data, data description

In [2]:
import re, string, unicodedata                          # Import Regex, string and unicodedata.
from bs4 import BeautifulSoup                           # Import BeautifulSoup.

import numpy as np                                      # Import numpy.
import pandas as pd                                     # Import pandas.
import nltk                                             # Import Natural Language Tool-Kit.

from nltk.tokenize import word_tokenize                 # Import Tokenizer.
from nltk.stem.wordnet import WordNetLemmatizer         # Import Lemmatizer.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

#!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

  import pandas.util.testing as tm


In [55]:
tweets = pd.read_csv('tweets.csv')

In [25]:
tweets.shape

(14640, 15)

In [5]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [7]:
pd.set_option('display.max_colwidth', None) # Show column contents in full (non-truncated text column).
tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


# 2. Understanding of data-columns

## a. Drop all other columns except “text” and “airline_sentiment”

In [56]:
# Keep only the tweet body and its sentiment label.
# The explicit .copy() makes `tweets` an independent frame: the pre-processing
# cells below assign back into tweets['text'], and doing that on a column
# subset of another frame can raise pandas' SettingWithCopyWarning.
tweets = tweets[['text', 'airline_sentiment']].copy()

## b. Check the shape of data

In [17]:
tweets.shape

(14640, 2)

## c. Print first 5 rows of data

In [57]:
tweets.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials to the experience... tacky.,positive
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,neutral
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",negative
4,@VirginAmerica and it's a really big bad thing about it,negative


# 3. Text pre-processing: Data preparation

## a. Html tag removal

In [59]:
def strip_html(text):
    """Return the plain-text content of `text`.

    The string is parsed with BeautifulSoup's built-in "html.parser" and
    only the text nodes are kept, so any markup is discarded.
    """
    return BeautifulSoup(text, "html.parser").get_text()

# Run every tweet through strip_html; the function is passed directly, no wrapper lambda needed.
tweets['text'] = tweets['text'].apply(strip_html)

## b. Tokenization

In [60]:
# Split each tweet into a list of word/punctuation tokens.
# word_tokenize is applied to the 'text' column itself: a row-wise
# DataFrame.apply(axis=1) would hand every row over as a Series just to read
# one field from it, which is needless overhead for a single-column transform.
tweets['text'] = tweets['text'].apply(word_tokenize)
tweets.head()

Unnamed: 0,text,airline_sentiment
0,"[@, VirginAmerica, What, @, dhepburn, said, .]",neutral
1,"[@, VirginAmerica, plus, you, 've, added, commercials, to, the, experience, ..., tacky, .]",positive
2,"[@, VirginAmerica, I, did, n't, today, ..., Must, mean, I, need, to, take, another, trip, !]",neutral
3,"[@, VirginAmerica, it, 's, really, aggressive, to, blast, obnoxious, ``, entertainment, '', in, your, guests, ', faces, &, they, have, little, recourse]",negative
4,"[@, VirginAmerica, and, it, 's, a, really, big, bad, thing, about, it]",negative


## c. Remove the numbers

In [61]:
def remove_numbers(words):
    """Remove numbers from list of tokenized words.

    Every run of digits is deleted from each token. Tokens that consisted of
    digits only end up empty and are dropped from the result, so the returned
    list never contains '' placeholders (same convention as
    remove_punctuation).

    Args:
        words: iterable of string tokens.

    Returns:
        A new list of tokens with digits removed and empty tokens omitted.
    """
    new_words = []
    for word in words:
        new_word = re.sub(r'\d+', '', word)
        # A purely numeric token ('42') collapses to ''; skip it instead of
        # keeping a meaningless empty token in the list.
        if new_word != '':
            new_words.append(new_word)
    return new_words

# Strip digits from every token list; the function is passed directly.
tweets['text'] = tweets['text'].apply(remove_numbers)

## d. Removal of Special Characters and Punctuations

In [62]:
def remove_special_chars(words):
    """Reduce every token in `words` to its ASCII-only form.

    Each token is NFKD-normalized and then encoded to ASCII with errors
    ignored, so any character that has no ASCII representation after
    normalization is dropped. The result has one entry per input token; a
    token made up solely of such characters comes back as ''.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and discard tokens left empty.

    Any character that is neither a word character (\\w, which includes the
    underscore) nor whitespace is deleted. Tokens that become '' as a result
    are omitted from the returned list.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']


# First reduce tokens to ASCII, then drop punctuation (and any tokens left empty).
tweets['text'] = (
    tweets['text']
    .apply(remove_special_chars)
    .apply(remove_punctuation)
)

## e. Conversion to lowercase

In [63]:
def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in `words`."""
    return [token.lower() for token in words]

# Lowercase every token list; the function is passed directly.
tweets['text'] = tweets['text'].apply(to_lowercase)

## f. Lemmatize

In [44]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kumarashok/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [64]:
def lemmatize_list(words):
    """Lemmatize every token in `words` with NLTK's WordNetLemmatizer.

    A fresh lemmatizer is created on each call. No part-of-speech argument is
    passed to `lemmatize`, so the lemmatizer's default applies to every token.
    Returns a new list with one lemma per input token.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(token) for token in words]

# Lemmatize every token list; the function is passed directly.
tweets['text'] = tweets['text'].apply(lemmatize_list)

## g. Join the words in the list

In [65]:
def join_words(words):
    """Glue the tokens in `words` back into one string, separated by single spaces."""
    separator = ' '
    sentence = separator.join(words)
    return sentence

# Turn each token list back into a single space-separated string for the vectorizers.
tweets['text'] = tweets['text'].apply(join_words)

## h. Print first 5 rows of data after pre-processing

In [66]:
tweets.head()

Unnamed: 0,text,airline_sentiment
0,virginamerica what dhepburn said,neutral
1,virginamerica plus you ve added commercial to the experience tacky,positive
2,virginamerica i did nt today must mean i need to take another trip,neutral
3,virginamerica it s really aggressive to blast obnoxious entertainment in your guest face they have little recourse,negative
4,virginamerica and it s a really big bad thing about it,negative


# 4. Vectorization

## a. Use CountVectorizer

In [99]:
# Build bag-of-words count features; max_features=5000 caps the vocabulary size.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split in section 5, so the vocabulary is also learned from rows
# that later land in the test set — confirm this is acceptable.
count_vectorizer = CountVectorizer(max_features=5000)
tweets_cv_features = count_vectorizer.fit_transform(tweets['text'])
# Convert the fit_transform result to a dense array (the shape cell below reports 14640 x 5000).
tweets_cv_features = tweets_cv_features.toarray()

In [83]:
count_vectorizer.vocabulary_

{'virginamerica': 934,
 'what': 957,
 'said': 729,
 'plus': 652,
 'you': 996,
 've': 927,
 'to': 869,
 'the': 847,
 'experience': 282,
 'did': 231,
 'nt': 590,
 'today': 870,
 'must': 562,
 'mean': 526,
 'need': 569,
 'take': 829,
 'another': 38,
 'trip': 888,
 'it': 447,
 'really': 684,
 'entertainment': 265,
 'in': 437,
 'your': 997,
 'they': 853,
 'have': 391,
 'little': 493,
 'and': 36,
 'big': 96,
 'bad': 80,
 'thing': 854,
 'about': 2,
 'seriously': 753,
 'would': 986,
 'pay': 634,
 'flight': 317,
 'for': 336,
 'seat': 740,
 'that': 846,
 'this': 856,
 'only': 608,
 'flying': 330,
 'yes': 993,
 'every': 272,
 'time': 865,
 'fly': 328,
 'go': 365,
 'away': 75,
 'missed': 546,
 'without': 973,
 'there': 851,
 'http': 423,
 'well': 954,
 'now': 589,
 'do': 238,
 'wa': 937,
 'amazing': 31,
 'arrived': 57,
 'an': 35,
 'hour': 418,
 'early': 256,
 're': 679,
 'too': 875,
 'good': 370,
 'me': 524,
 'know': 463,
 'is': 445,
 'second': 742,
 'cause': 138,
 'of': 594,
 'pretty': 662,
 'so'

In [100]:
tweets_cv_features.shape

(14640, 5000)

## b. Use TfidfVectorizer

In [101]:
# Build TF-IDF features; max_features=5000 caps the vocabulary size.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split in section 5, so the vocabulary and IDF weights are also
# learned from rows that later land in the test set — confirm this is acceptable.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tweets_tfidf_features = tfidf_vectorizer.fit_transform(tweets['text'])
# Convert the fit_transform result to a dense array (the shape cell below reports 14640 x 5000).
tweets_tfidf_features = tweets_tfidf_features.toarray()

In [86]:
tfidf_vectorizer.vocabulary_

{'virginamerica': 934,
 'what': 957,
 'said': 729,
 'plus': 652,
 'you': 996,
 've': 927,
 'to': 869,
 'the': 847,
 'experience': 282,
 'did': 231,
 'nt': 590,
 'today': 870,
 'must': 562,
 'mean': 526,
 'need': 569,
 'take': 829,
 'another': 38,
 'trip': 888,
 'it': 447,
 'really': 684,
 'entertainment': 265,
 'in': 437,
 'your': 997,
 'they': 853,
 'have': 391,
 'little': 493,
 'and': 36,
 'big': 96,
 'bad': 80,
 'thing': 854,
 'about': 2,
 'seriously': 753,
 'would': 986,
 'pay': 634,
 'flight': 317,
 'for': 336,
 'seat': 740,
 'that': 846,
 'this': 856,
 'only': 608,
 'flying': 330,
 'yes': 993,
 'every': 272,
 'time': 865,
 'fly': 328,
 'go': 365,
 'away': 75,
 'missed': 546,
 'without': 973,
 'there': 851,
 'http': 423,
 'well': 954,
 'now': 589,
 'do': 238,
 'wa': 937,
 'amazing': 31,
 'arrived': 57,
 'an': 35,
 'hour': 418,
 'early': 256,
 're': 679,
 'too': 875,
 'good': 370,
 'me': 524,
 'know': 463,
 'is': 445,
 'second': 742,
 'cause': 138,
 'of': 594,
 'pretty': 662,
 'so'

In [102]:
tweets_tfidf_features.shape

(14640, 5000)

# 5. Fit and evaluate model 

## a. Using CountVectorizer

In [89]:
labels = tweets['airline_sentiment']

In [103]:
# Split data into training and testing set.
# test_size=0.3 holds out 30% of the rows; random_state=42 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(tweets_cv_features, labels, test_size=0.3, random_state=42)

In [104]:
# Using Random Forest to build model
# Also calculating the cross validation score.
# random_state pins the forest's random sampling so the score printed here and
# the confusion matrix further down are repeatable across runs (same seed as
# the train/test split).
# NOTE: cross_val_score clones the estimator and refits it on each of the 5
# folds of the full feature matrix; it neither uses nor alters the fit on
# X_train performed on the line above.
forest = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=42)
forest = forest.fit(X_train, y_train)
print(np.mean(cross_val_score(forest, tweets_cv_features, labels, cv=5)))

0.7061475409836067


In [105]:
# Predict the result for test data using the model built above.
result = forest.predict(X_test)

In [109]:
# Print Confusion matrix
# The true labels are passed first and the model's predictions second.
# NOTE(review): no `labels=` argument is given, so the class order of the
# rows/columns is chosen by sklearn — presumably sorted label names
# (negative, neutral, positive); confirm before interpreting individual cells.
conf_mat = confusion_matrix(y_test, result)
print(conf_mat)

[[2656  129   29]
 [ 443  381   60]
 [ 260  113  321]]


## b. Using TfidfVectorizer

In [110]:
# Split data into training and testing set.
# test_size=0.3 holds out 30% of the rows; random_state=42 fixes the shuffle so
# the same split is produced on every run.
# Note: this rebinds X_train/X_test/y_train/y_test from the CountVectorizer section.
X_train, X_test, y_train, y_test = train_test_split(tweets_tfidf_features, labels, test_size=0.3, random_state=42)

In [112]:
# Using Random Forest to build model
# Also calculating the cross validation score.
# random_state pins the forest's random sampling so the score printed here and
# the confusion matrix further down are repeatable across runs (same seed as
# the train/test split).
# NOTE: cross_val_score clones the estimator and refits it on each of the 5
# folds of the full feature matrix; it neither uses nor alters the fit on
# X_train performed on the line above.
forest = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=42)
forest = forest.fit(X_train, y_train)
print(np.mean(cross_val_score(forest, tweets_tfidf_features, labels, cv=5)))

0.7011612021857923


In [114]:
# Predict the result for test data using the model built above.
result = forest.predict(X_test)

In [115]:
# Print Confusion matrix
# The true labels are passed first and the model's predictions second.
# NOTE(review): no `labels=` argument is given, so the class order of the
# rows/columns is chosen by sklearn — presumably sorted label names
# (negative, neutral, positive); confirm before interpreting individual cells.
conf_mat = confusion_matrix(y_test, result)
print(conf_mat)

[[2665  113   36]
 [ 482  349   53]
 [ 286  101  307]]


# 6. Summary

Pre-processing:

We performed various preprocessing steps like:
- html tag removal: to remove the html tags that carry no information
- tokenization: to split sentences into individual words to enable processing on each word
- numbers, special characters and punctuation removal: removed these as they don't have any significance in the context of sentiment analysis
- conversion to lowercase: to ensure the vocabulary doesn't have duplicate words with different casing
- Lemmatize: to reduce multiple word variants to their root form
- joined back all the words to form the sentence to do vectorization

Count Vectorization:
- We used count vectorization with max_features=5000 to generate the 5000 features/vocabulary of the most frequent words in the corpus

Tfidf Vectorization:
- We used tfidf vectorization with max_features=5000 to generate the 5000 features/vocabulary of the most frequent words in the corpus. But unlike count vectorization, the features are floats rather than integers, and they take into account the commonality of words across documents.

Performance:

The performance of the random forest model is similar, at about 70% cross-validated accuracy, with both vectorization methods, which suggests that for this sentiment classification task the tfidf vectorizer doesn't provide any additional value over the count vectorizer.