In [None]:
# Basic Libraries
import pandas as pd
import numpy as np
# Accessing Twitter
import tweepy as tw
import json
# Data Cleaning
import re
import unicodedata
import emoji
import nltk
# Request the NLTK 'words' corpus; this call is made unconditionally on every run.
nltk.download('words')
# Hold the corpus vocabulary in a set: cleaner() tests every token for membership in it.
words = set(nltk.corpus.words.words())
# Graphics
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image

# Creating a function to sanitize the tweets
def _strip_emoji(text):
    """Return *text* with emoji characters removed, whatever the emoji package version."""
    replace_emoji = getattr(emoji, "replace_emoji", None)
    if replace_emoji is not None:
        # Newer emoji releases provide replace_emoji(); UNICODE_EMOJI is gone as of 2.0.
        return replace_emoji(text, replace="")
    table = getattr(emoji, "UNICODE_EMOJI", {})
    # emoji 1.x nests the table per language ({'en': {...}, ...}), so testing a
    # character against the top-level dict never matches; older releases map the
    # emoji characters directly, in which case the table is used as-is.
    table = table.get("en", table)
    return "".join(ch for ch in text if ch not in table)


def cleaner(tweet):
    """Return a sanitised copy of the tweet text.

    Steps, in order:
      1. drop @mentions (underscores included, so "@user_name" leaves nothing behind);
      2. drop links starting with http://, https:// or www., plus any leftover "@..." run;
      3. collapse runs of whitespace into single spaces;
      4. remove emoji;
      5. remove "#" (keeping the hashtag text) and turn "_" into a space;
      6. keep only tokens that are in the NLTK `words` vocabulary or that are not
         purely alphabetic (numbers, punctuation).

    Parameters:
        tweet: the raw tweet text (str).

    Returns:
        The cleaned text, tokens joined by single spaces.
    """
    # \w covers the underscore; the previous [A-Za-z0-9] class stopped at "_" and left
    # the tail of the handle in the text, where it later counted as a normal word.
    tweet = re.sub(r"@\w+", "", tweet)
    # "https?" makes the "s" optional (the old "http?" made the "p" optional), and
    # "\bwww\." only matches an actual www. prefix instead of any "www" inside a word.
    tweet = re.sub(r"(?:@|https?://|\bwww\.)\S+", "", tweet)
    tweet = " ".join(tweet.split())
    tweet = _strip_emoji(tweet)
    tweet = tweet.replace("#", "").replace("_", " ")  # keep the hashtag text itself
    return " ".join(
        token for token in nltk.wordpunct_tokenize(tweet)
        if token.lower() in words or not token.isalpha()
    )

# The API credentials are kept in a local JSON file: parse it, then unpack the four values needed to log in
with open('twitter_auth.json') as auth_file:
    credentials = json.load(auth_file)
consumer_key = credentials['consumer_key']
consumer_secret = credentials['consumer_key_secret']
access_token = credentials['access_key']
access_token_secret = credentials['access_key_secret']
    
# Logging into Twitter through the tweepy package
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth)
# Define the search term and the date_since date as variables
search_words = input("Select a Search Topic: ") # airlines
date_since = "2000-01-01"
# Exclude retweets from the results (append " filter:verified" to keep verified accounts only)
new_search = search_words + " -filter:retweets"
# Retrieve up to 1000 matching tweets.
# tweet_mode="extended" asks for the untruncated text: in the default compatibility
# mode long tweets come back cut short, which would feed partial words into the cloud.
# NOTE(review): `since` is forwarded to the search endpoint as-is — confirm it is still honoured.
tweets = tw.Cursor(api.search,
                   q=new_search,
                   lang="en",
                   since=date_since,
                   tweet_mode="extended").items(1000)
# One row per tweet: author handle, full text, like count, retweet count.
# In extended mode the text is exposed as `full_text` instead of `text`.
data = [[tweet.user.screen_name, tweet.full_text, tweet.favorite_count, tweet.retweet_count]
        for tweet in tweets]
tweet_text = pd.DataFrame(data=data,
                          columns=['user', 'text', 'likes', 'retweets'])
# Add the sanitised version of every tweet as its own column
tweet_text['tweet'] = tweet_text['text'].map(cleaner)
# Collect the cleaned tweets into one lowercase, space-separated string,
# keeping only words longer than 2 characters and skipping the word "spotted"
kept_words = []
for cleaned_tweet in tweet_text['tweet'].values:
    for word in cleaned_tweet.split():
        lowered = word.lower()
        if len(word) <= 2 or lowered == "spotted":
            continue
        kept_words.append(lowered + " ")
string = "".join(kept_words)
# Load the picture that serves as the mask giving the word cloud its shape
plane_mask = np.array(Image.open('test.jpg'))
# Configure the word cloud, then fill it from the collected words
cloud_settings = dict(
    background_color='#fdfdfd',
    colormap='gist_earth',
    width=1000,
    height=1000,
    mask=plane_mask,
)
wordcloud = WordCloud(**cloud_settings).generate(string)
# Draw the cloud on a large square figure without axes
plt.figure(figsize=(16, 16))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
#plt.title('Words Associated with Air Travel', loc='Center', fontsize=14)
# Overlay a very faint greyscale copy of the same picture on top of the cloud
greyscale_overlay = np.array(Image.open('test.jpg').convert('L'))
plt.imshow(greyscale_overlay, alpha=0.05)
plt.gray()
# Write the figure to disk before showing it
plt.savefig('aircraft_wordcloud.png', dpi=500)
plt.show()