Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
themeparkanalytics/tweet_parse.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
116 lines (103 sloc)
4.66 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from textblob import TextBlob | |
import re | |
# These functions come from https://github.com/adilmoujahid/Twitter_Analytics/blob/master/analyze_tweets.py and http://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python// | |
def extract_link(text): | |
""" | |
This function removes any links in the tweet - we'll put them back more cleanly later | |
""" | |
regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+' | |
match = re.search(regex, text) | |
if match: | |
return match.group() | |
return '' | |
def word_in_text(word, text): | |
""" | |
Use regex to figure out which park or ride they're talking about. | |
I might use this in future in combination with my wikipedia scraping script. | |
""" | |
word = word.lower() | |
text = text.lower() | |
match = re.search(word, text, re.I) | |
if match: | |
return True | |
return False | |
def clean_tweet(tweet): | |
''' | |
Utility function to clean tweet text by removing links, special characters | |
using simple regex statements. | |
''' | |
return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split()) | |
def get_tweet_sentiment(tweet): | |
''' | |
Utility function to classify sentiment of passed tweet | |
using textblob's sentiment method | |
''' | |
# create TextBlob object of passed tweet text | |
analysis = TextBlob(clean_tweet(tweet)) | |
# set sentiment | |
if analysis.sentiment.polarity > 0: | |
return 'positive' | |
elif analysis.sentiment.polarity == 0: | |
return 'neutral' | |
else: | |
return 'negative' | |
# Load up the file generated from the Twitter stream capture. | |
# I've assumed it's loaded in a folder called data which I won't upload because git. | |
tweets_data_path = '../data/twitter_themeparks.txt' | |
tweets_data = [] | |
tweets_file = open(tweets_data_path, "r") | |
for line in tweets_file: | |
tweet = json.loads(line) | |
tweets_data.append(tweet) | |
# Check you've created a list that actually has a length. Huzzah! | |
print len(tweets_data) | |
# Turn the tweets_data list into a Pandas DataFrame with a wide section of True/False for which park they talk about | |
# (Adaped from https://github.com/adilmoujahid/Twitter_Analytics/blob/master/analyze_tweets.py) | |
tweets = pd.DataFrame() | |
tweets['user_name'] = map(lambda tweet: tweet['user']['name'] if tweet['user'] != None else None, tweets_data) | |
tweets['followers'] = map(lambda tweet: tweet['user']['followers_count'] if tweet['user'] != None else None, tweets_data) | |
tweets['text'] = map(lambda tweet: tweet['text'], tweets_data) | |
tweets['retweets'] = map(lambda tweet: tweet['retweet_count'], tweets_data) | |
tweets['disney'] = tweets['text'].apply(lambda tweet: word_in_text(r'(disney|magickingdom|epcot|WDW|animalkingdom|hollywood)', tweet)) | |
tweets['universal'] = tweets['text'].apply(lambda tweet: word_in_text(r'(universal|potter)', tweet)) | |
tweets['efteling'] = tweets['text'].apply(lambda tweet: word_in_text('efteling', tweet)) | |
tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet)) | |
tweets['sentiment'] = tweets['text'].apply(lambda tweet: get_tweet_sentiment(tweet)) | |
# I want to add in a column called 'park' as well that will list which park is being talked about, and add an entry for 'unknown' | |
# I'm 100% sure there's a better way to do this... | |
park = [] | |
for index, tweet in tweets.iterrows(): | |
if tweet['disney']: | |
park.append('disney') | |
else: | |
if tweet['universal']: | |
park.append('universal') | |
else: | |
if tweet['efteling']: | |
park.append('efteling') | |
else: | |
park.append('unknown') | |
tweets['park'] = park | |
# Create a dataset that will be used in a graph of tweet count by park | |
parks = ['disney', 'universal', 'efteling'] | |
tweets_by_park = [tweets['disney'].value_counts()[True], tweets['universal'].value_counts()[True], tweets['efteling'].value_counts()[True]] | |
x_pos = list(range(len(parks))) | |
width = 0.8 | |
fig, ax = plt.subplots() | |
plt.bar(x_pos, tweets_by_park, width, alpha=1, color='g') | |
# Set axis labels and ticks | |
ax.set_ylabel('Number of tweets', fontsize=15) | |
ax.set_title('Tweet Frequency: disney vs. universal vs. efteling', fontsize=10, fontweight='bold') | |
ax.set_xticks([p + 0.4 * width for p in x_pos]) | |
ax.set_xticklabels(parks) | |
# You need to write this for the graph to actually appear. | |
plt.show() | |
# Create a graph of the proportion of postive, negative and neutral tweets for each park | |
# I have to do two groupby's here because I want proportion within each park, not global proportions. | |
sent_by_park = tweets.groupby(['park', 'sentiment']).size().groupby(level = 0).transform(lambda x: x/x.sum()).unstack() | |
sent_by_park.plot(kind = 'bar' ) | |
plt.title('Tweet Sentiment proportions by park') | |
plt.show() |