
## Analyze customer sentiment from twitter data

In [103]:
import pandas as pd
import numpy as np
import re
import string
from collections import Counter

from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import scipy 
import scipy.stats as stats

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
from IPython.display import Image # displaying images files in jupyter
from IPython.display import IFrame # displaying pdf file in jupyter

In [104]:
# Input and clean up data by removing columns not useful for analysis
tweets = pd.read_csv('Tweets.csv')
del tweets['tweet_id']
del tweets['retweet_count']
del tweets['tweet_coord']
del tweets['name']
del tweets['airline_sentiment_gold']
del tweets['tweet_created']
del tweets['user_timezone']
tweets.head() 

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,negativereason_gold,text,tweet_location
0,neutral,1.0,,,Virgin America,,@VirginAmerica What @dhepburn said.,
1,positive,0.3486,,0.0,Virgin America,,@VirginAmerica plus you've added commercials t...,
2,neutral,0.6837,,,Virgin America,,@VirginAmerica I didn't today... Must mean I n...,Lets Play
3,negative,1.0,Bad Flight,0.7033,Virgin America,,@VirginAmerica it's really aggressive to blast...,
4,negative,1.0,Can't Tell,1.0,Virgin America,,@VirginAmerica and it's a really big bad thing...,


In [105]:
# scanning data
print ("Airlines: ", tweets['airline'].dropna().unique())
print (" ")
print ("Sentiment: ", tweets['airline_sentiment'].dropna().unique())
print (" ")
print ("Negative comments: ", tweets['negativereason'].dropna().unique())
print (" ")
print ("Locations: ", tweets['tweet_location'].dropna().unique())

Airlines:  ['Virgin America' 'United' 'Southwest' 'Delta' 'US Airways' 'American']
 
Sentiment:  ['neutral' 'positive' 'negative']
 
Negative comments:  ['Bad Flight' "Can't Tell" 'Late Flight' 'Customer Service Issue'
 'Flight Booking Problems' 'Lost Luggage' 'Flight Attendant Complaints'
 'Cancelled Flight' 'Damaged Luggage' 'longlines']
 
Locations:  ['Lets Play' 'San Francisco CA' 'Los Angeles' ..., 'Columbus, OH, USA'
 'Milwaukee County, Wisconsin' 'Nigeria,lagos']


In [106]:
# function to clean up Twitter text
def clean_tweet(str):

    str = str.lower() 
    tokens = nltk.word_tokenize(str)                            # tokenize
    tokens = [i for i in tokens if i not in string.punctuation] # remove punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [i for i in tokens if i not in stop_words]  # remove stop words
    tokens = [i for i in tokens if   not i.isdigit()]    # remove numbers
    tokens = [i for i in tokens if i.isalnum()]          # remove alpha numeric characters

    tokens = list(set(tokens)) # Remove duplicates.
    #print("Length: ", len(tokens))
    
    clean_text = ' '.join(token for token in tokens)
    
    return clean_text

In [107]:
# Add clean tweets field.
#print (tweets_train['text'][0])
#print (clean_tweet(tweets_train['text'][0]))
tweets['clean_tweet']= tweets['text'].apply(lambda s: clean_tweet(s))
tweets['sentiment'] =  tweets['airline_sentiment'].apply(lambda x: 1 if x=='positive' else 0)
tweets.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,negativereason_gold,text,tweet_location,clean_tweet,sentiment
0,neutral,1.0,,,Virgin America,,@VirginAmerica What @dhepburn said.,,dhepburn said virginamerica,0
1,positive,0.3486,,0.0,Virgin America,,@VirginAmerica plus you've added commercials t...,,plus tacky added experience commercials virgin...,1
2,neutral,0.6837,,,Virgin America,,@VirginAmerica I didn't today... Must mean I n...,Lets Play,mean need another today trip take must virgina...,0
3,negative,1.0,Bad Flight,0.7033,Virgin America,,@VirginAmerica it's really aggressive to blast...,,blast amp guests recourse obnoxious faces real...,0
4,negative,1.0,Can't Tell,1.0,Virgin America,,@VirginAmerica and it's a really big bad thing...,,big really bad virginamerica thing,0


In [108]:
# Predictive Modeling, predicted_sentiment based on twitter text.

tweets_train, tweets_test = train_test_split(tweets, test_size=0.10, random_state=42)
print ("Training: ", tweets_train.shape)
print ("Test: ", tweets_test.shape)

# train and test clean tweets.
train_clean_tweet=[]
for t in tweets_train['clean_tweet']:
    train_clean_tweet.append(t)
test_clean_tweet=[]
for t in tweets_test['clean_tweet']:
    test_clean_tweet.append(t)



Training:  (13176, 10)
Test:  (1464, 10)


In [109]:
# Predict Sentiment of the tweet (Positive (1) or Negative/Neutral) 

# Reference: http://scikit-learn.org/stable/modules/feature_extraction.html
# Build training and test features matrix from relevant clean tweet text
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer(analyzer = "word")
train_features = v.fit_transform(train_clean_tweet)
test_features  = v.transform(test_clean_tweet)

In [110]:
# Support Vector Machine Classifier

clf = svm.SVC(kernel="rbf", C=0.025, probability=True)
clf.fit(train_features,tweets_train['sentiment'])

print("SVM:")
print("Training data correct classification: ", clf.score(train_features,tweets_train['sentiment']))

print(" ")
print("Cross validation score:")
scores = cross_val_score(clf, test_features,tweets_test['sentiment'], cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std())) 

SVM:
Training data correct classification:  0.83872191864
 
Cross validation score:
[ 0.83673469  0.83617747  0.83617747  0.8390411   0.8390411 ]
Accuracy: 0.84 (+/- 0.00)


In [112]:
clf = tree.DecisionTreeClassifier()
clf.fit(train_features,tweets_train['sentiment'])

print("Tree Classifier:")
print("Training data correct classification: ", clf.score(train_features,tweets_train['sentiment']))

print(" ")
print("Cross validation score:")
scores = cross_val_score(clf, test_features,tweets_test['sentiment'], cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std())) 


Tree Classifier:
Training data correct classification:  0.996508803886
 
Cross validation score:
[ 0.83333333  0.81569966  0.79863481  0.80136986  0.81164384]
Accuracy: 0.81 (+/- 0.01)


In [113]:
clf = KNeighborsClassifier(3)
clf.fit(train_features,tweets_train['sentiment'])

print("K Neighbors:")
print("Training data correct classification: ", clf.score(train_features,tweets_train['sentiment']))

print(" ")
print("Cross validation score:")
scores = cross_val_score(clf, test_features,tweets_test['sentiment'], cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std())) 




K Neighbors:
Training data correct classification:  0.908166363084
 
Cross validation score:
[ 0.78231293  0.75767918  0.76109215  0.79109589  0.80479452]
Accuracy: 0.78 (+/- 0.02)


## References:

(1) Dataset: https://www.kaggle.com/crowdflower/twitter-airline-sentiment

(2) www.nltk.org

(3) http://scikit-learn.org/stable/modules/feature_extraction.html