Some exploratory ideas on this include

when tweets are negative, what topics do travelers tend to discuss?
when tweets are positive, what are travelers happy with?
outside of sentiment, what other systematic variation exists in the tweets about different airlines?
what types of tweets on airlines tend to be retweeted?

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt  # graphic Library
import seaborn as sns


%matplotlib inline 

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize #sentence
from nltk.tokenize import TweetTokenizer #tweeter
from nltk.probability import FreqDist
from nltk.corpus import stopwords # Ex: a, the, and
from nltk.stem import WordNetLemmatizer # break down words in to dictionary
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#this is sample data
#from nltk.corpus import names  

from string import punctuation

#if the next cell does not work
#remove number symbol on following lines and re-run this cell
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('names')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /Users/may/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/may/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package names to /Users/may/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/may/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/may/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
df = pd.read_csv("Tweets.csv")  #read EXCEL.FILE
#1st column needs [encoding] to read pound/hashtag(#) symbol

df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
len(df)

14640

In [5]:
df.shape

(14640, 15)

In [6]:
df.dtypes

tweet_id                          int64
airline_sentiment                object
airline_sentiment_confidence    float64
negativereason                   object
negativereason_confidence       float64
airline                          object
airline_sentiment_gold           object
name                             object
negativereason_gold              object
retweet_count                     int64
text                             object
tweet_coord                      object
tweet_created                    object
tweet_location                   object
user_timezone                    object
dtype: object

In [7]:
#count number of Not-NA Values
df.count()

tweet_id                        14640
airline_sentiment               14640
airline_sentiment_confidence    14640
negativereason                   9178
negativereason_confidence       10522
airline                         14640
airline_sentiment_gold             40
name                            14640
negativereason_gold                32
retweet_count                   14640
text                            14640
tweet_coord                      1019
tweet_created                   14640
tweet_location                   9907
user_timezone                    9820
dtype: int64

In [8]:
df.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [9]:
df2 = df[['airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'airline', 'text', 'tweet_created', 'user_timezone']]

In [10]:
df2.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,airline,text,tweet_created,user_timezone
0,neutral,1.0,,Virgin America,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,Eastern Time (US & Canada)
1,positive,0.3486,,Virgin America,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,Pacific Time (US & Canada)
2,neutral,0.6837,,Virgin America,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,Central Time (US & Canada)
3,negative,1.0,Bad Flight,Virgin America,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36 -0800,Pacific Time (US & Canada)
4,negative,1.0,Can't Tell,Virgin America,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45 -0800,Pacific Time (US & Canada)


In [11]:
df1_review = df[['text']]

In [12]:
df1_review.head()

Unnamed: 0,text
0,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials t...
2,@VirginAmerica I didn't today... Must mean I n...
3,@VirginAmerica it's really aggressive to blast...
4,@VirginAmerica and it's a really big bad thing...


In [13]:
#create a [function] = "reviewSentiment" to clean up each review
#then it will analyze and assign a [sentiment polarity]
def reviewSentiment(review):
    
    #make text lowercase
    #review = review.lower()
    
    #tokenize the review
    df1_review = word_tokenize(review)
    
    #remove puntuation
    for token in df1_review:
        if token in punctuation:
            df1_review.remove(token)
    
    clean_tokens = []
    #remove filler words
    for token in df1_review:
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    #put sentence back together with remaining clean words
    clean_review = ' '.join(clean_tokens)
    #clean_review = ' '.join(tknz_review)
    
    #turn into textblob using "sid.polarity_scores"
    sid_rev = sid.polarity_scores(clean_review)
    
    #get sentiment polarity
    r_comp = sid_rev['compound']
    
    return r_comp

In [14]:
from textblob import TextBlob

In [15]:
# import STOPWORD library
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/may/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
#lets find out what is a list of stopwords in English
eng_stopwords = stopwords.words('english') # function "stopwords"
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [17]:
sid = SentimentIntensityAnalyzer() # this is function from NLTK Library

In [18]:
#create a new column to hold sentiment value from function
df2['review_sentiment'] = df['text'].apply(reviewSentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [19]:
df2.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,airline,text,tweet_created,user_timezone,review_sentiment
0,neutral,1.0,,Virgin America,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,Eastern Time (US & Canada),0.0
1,positive,0.3486,,Virgin America,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,Pacific Time (US & Canada),0.0
2,neutral,0.6837,,Virgin America,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,Central Time (US & Canada),0.0
3,negative,1.0,Bad Flight,Virgin America,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36 -0800,Pacific Time (US & Canada),-0.3306
4,negative,1.0,Can't Tell,Virgin America,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45 -0800,Pacific Time (US & Canada),-0.5829


In [20]:
#create a function to assign a [polarity category] to the sentiment
def sentimentCategory(sent_num):
    if sent_num >= 0.2:
        return "positive"
    if sent_num <= -0.2:
        return "negative"
    else:
        return "neutral"

In [21]:
#create a new column to hold sentiment category
df2['sentiment_category'] = df2['review_sentiment'].apply(sentimentCategory)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
df2.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,airline,text,tweet_created,user_timezone,review_sentiment,sentiment_category
0,neutral,1.0,,Virgin America,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,Eastern Time (US & Canada),0.0,neutral
1,positive,0.3486,,Virgin America,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,Pacific Time (US & Canada),0.0,neutral
2,neutral,0.6837,,Virgin America,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,Central Time (US & Canada),0.0,neutral
3,negative,1.0,Bad Flight,Virgin America,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36 -0800,Pacific Time (US & Canada),-0.3306,negative
4,negative,1.0,Can't Tell,Virgin America,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45 -0800,Pacific Time (US & Canada),-0.5829,negative


In [23]:
df2.to_csv('airlinesentiment.csv')