# Author - Kevin Abraham

In [None]:
# Import all necessary Python libraries/packages
import re
#import tweepy
#from tweepy.streaming import StreamListener
#from tweepy import OAuthHandler
#from tweepy import Stream
from textblob import TextBlob
import csv 
import pandas as pd
import json
import datetime as dt

import os,sys
import time

In [None]:
def load_api():
    """Authorize with Twitter via OAuth and return a tweepy API client.

    Returns:
        tweepy.API: client built from the credentials defined below.
    """
    # Imported locally because the module-level tweepy imports are commented
    # out; without this the function fails with NameError on first use.
    import tweepy

    # NOTE(review): these are placeholder credentials. Real values should be
    # loaded from the environment or a config file, not committed to source.
    access_token = "1058460791xxxxxxxxxxxxxxx"
    access_token_secret = "1058460791xxxxxxxxxxxxxxx"
    consumer_key = "1058460791xxxxxxxxxxxxxxx"
    consumer_secret = "1058460791xxxxxxxxxxxxxxx"

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    # The previous version passed the undefined name `access_secret` here.
    auth.set_access_token(access_token, access_token_secret)
    # load the twitter API via tweepy
    return tweepy.API(auth)


In [None]:
def tweet_search(api, query, max_tweets, max_id, since_id, geocode):
    """Collect up to `max_tweets` results for `query`, paging backwards by ID.

    Each request asks for the tweets still missing, restricted to IDs above
    `since_id` and below the current `max_id`. After every non-empty batch,
    `max_id` moves to the ID of the last tweet received.

    Args:
        api: object exposing `search(q=..., count=..., since_id=..., max_id=...)`.
        query: search string.
        max_tweets: upper bound on the number of tweets gathered.
        max_id: ID to search below (the request uses `max_id - 1`).
        since_id: lowest ID of interest.
        geocode: accepted for interface compatibility; not sent to the API.

    Returns:
        tuple: (list of results gathered, last `max_id` reached).
    """
    collected = []
    while True:
        still_needed = max_tweets - len(collected)
        if still_needed <= 0:
            break
        try:
            batch = api.search(q=query, count=still_needed,
                               since_id=str(since_id), max_id=str(max_id - 1))
            print('found', len(batch), 'tweets')
            if not batch:
                print('no tweets found')
                break
            collected.extend(batch)
            # continue below the oldest tweet of this batch
            max_id = batch[-1].id
        except tweepy.TweepError:
            # back off for 15 minutes, then give up on this round
            print('exception raised, waiting 15 minutes')
            print('(until:', dt.datetime.now()+dt.timedelta(minutes=15), ')')
            time.sleep(15*60)
            break
    return collected, max_id

In [None]:
def get_tweet_id(api, date='', days_ago=9, query='a'):
    """Return the ID of a tweet to use as a boundary for later searches.

    Args:
        api: object exposing `search(q=..., count=..., until=...)`.
        date: optional date/datetime; when given, the search is bounded by the
            day after it. When falsy, `days_ago` is used instead.
        days_ago: how many days before now to place the bound (default 9).
        query: search term; a search needs one, so it defaults to the common
            word 'a'.

    Returns:
        The `id` of the first result returned by the search.

    Raises:
        IndexError: if the search returns no tweets.
    """
    if date:
        # bound the search by the day after the requested date
        cutoff = date + dt.timedelta(days=1)
        wanted = 1
    else:
        # bound the search by the day `days_ago` days before now
        cutoff = dt.datetime.now() - dt.timedelta(days=days_ago)
        wanted = 10

    tweet_date = '{0}-{1:0>2}-{2:0>2}'.format(cutoff.year, cutoff.month, cutoff.day)
    tweet = api.search(q=query, count=wanted, until=tweet_date)
    if not tweet:
        raise IndexError('no tweets found before ' + tweet_date)

    if not date:
        print('search limit (start/stop):', tweet[0].created_at)
    # Return the ID in BOTH branches; the previous version only returned in
    # the `days_ago` branch, so passing `date` always produced None.
    return tweet[0].id

In [None]:
def write_tweets(tweets, filename):
    """Append the raw JSON of each tweet to `filename`, one object per line.

    Args:
        tweets: iterable of objects exposing their payload as `_json`.
        filename: path of the file to append to (created if missing).
    """
    with open(filename, 'a') as out:
        for status in tweets:
            out.write(json.dumps(status._json))
            out.write('\n')

In [None]:
def main():
    """Collect tweets for each search phrase and append them to JSON files.

    For every phrase in `search_phrases` this:
      1. derives an output file `<name>/<name>_<day range>.json`, where
         `<name>` is the first whitespace-separated token of the phrase;
      2. picks the starting `max_id` -- the `id` on the last line of that file
         if it already exists, otherwise an ID from get_tweet_id (or -1 when
         `min_days_old` is 0);
      3. picks the lower bound `since_id` via get_tweet_id;
      4. repeatedly calls tweet_search and appends the results to the file
         until `time_limit` hours have passed or three consecutive searches
         returned nothing.

    The time limit is measured per search phrase. When the three-empty-results
    limit is hit on the last phrase, the process exits through sys.exit.
    """

    # search variables:
    search_phrases = ['#GreyCup','Grey Cup', 'GreyCup']
    time_limit = 1.5                           # runtime limit in hours
    max_tweets = 100                           # number of tweets per search (will be
                                               # iterated over) - maximum is 100
    min_days_old, max_days_old = 1, 9          # search limits e.g., from 1 to 9 days old
                                               # gives current weekday from last week,
                                               # min_days_old=0 will search from right now
    USA = '39.8,-95.583068847656,2500km'       # this geocode includes nearly all American
                                               # states (and a large portion of Canada)


    # loop over search items,
    # creating a new file for each
    for search_phrase in search_phrases:

        print('Search phrase =', search_phrase)

        # other variables
        # first token of the phrase names both the directory and the file
        # prefix (e.g. 'Grey Cup' -> 'Grey/Grey')
        name = search_phrase.split()[0]
        json_file_root = name + '/'  + name
        os.makedirs(os.path.dirname(json_file_root), exist_ok=True)
        read_IDs = False

        # build the output file name from the searched day (or day range)
        if max_days_old - min_days_old == 1:
            d = dt.datetime.now() - dt.timedelta(days=min_days_old)
            day = '{0}-{1:0>2}-{2:0>2}'.format(d.year, d.month, d.day)
        else:
            d1 = dt.datetime.now() - dt.timedelta(days=max_days_old-1)
            d2 = dt.datetime.now() - dt.timedelta(days=min_days_old)
            day = '{0}-{1:0>2}-{2:0>2}_to_{3}-{4:0>2}-{5:0>2}'.format(
                  d1.year, d1.month, d1.day, d2.year, d2.month, d2.day)
        json_file = json_file_root + '_' + day + '.json'
        # an existing file means an earlier run is being resumed
        if os.path.isfile(json_file):
            print('Appending tweets to file named: ',json_file)
            read_IDs = True

        # authorize and load the twitter API
        api = load_api()

        # set the 'starting point' ID for tweet collection
        if read_IDs:
            # open the json file and get the latest tweet ID
            # (the last line written is where the previous run stopped)
            with open(json_file, 'r') as f:
                lines = f.readlines()
                max_id = json.loads(lines[-1])['id']
                print('Searching from the bottom ID in file')
        else:
            # get the ID of a tweet that is min_days_old
            if min_days_old == 0:
                max_id = -1
            else:
                max_id = get_tweet_id(api, days_ago=(min_days_old-1))
        # set the smallest ID to search for
        since_id = get_tweet_id(api, days_ago=(max_days_old-1))
        print('max id (starting point) =', max_id)
        print('since id (ending point) =', since_id)



        # tweet gathering loop: runs until the per-phrase time limit expires
        start = dt.datetime.now()
        end = start + dt.timedelta(hours=time_limit)
        # count = searches issued; exitcount = consecutive empty searches
        count, exitcount = 0, 0
        while dt.datetime.now() < end:
            count += 1
            print('count =',count)
            # collect tweets and update max_id
            tweets, max_id = tweet_search(api, search_phrase, max_tweets,
                                          max_id=max_id, since_id=since_id,
                                          geocode=USA)
            # write tweets to file in JSON format
            if tweets:
                write_tweets(tweets, json_file)
                exitcount = 0
            else:
                exitcount += 1
                # three empty searches in a row: stop working on this phrase
                if exitcount == 3:
                    if search_phrase == search_phrases[-1]:
                        sys.exit('Maximum number of empty tweet strings reached - exiting')
                    else:
                        print('Maximum number of empty tweet strings reached - breaking')
                        break




In [None]:
# Run the tweet collection from main() (starting point).
# NOTE(review): when executed as a notebook cell this guard is presumably
# true, so main() starts collecting immediately -- confirm before re-running.
if __name__ == "__main__":
    main()

In [None]:
def populate_tweet_df(tweets):
    """Build a DataFrame with one row per tweet dict.

    Columns, in order: text, location (user profile location), lang,
    country_code (None when the tweet has no place), and long / latt
    (None when the tweet has no coordinates).

    Args:
        tweets: sequence of tweet dicts with the keys 'text', 'user', 'lang',
            'place' and 'coordinates'.

    Returns:
        pandas.DataFrame with the six columns listed above.
    """
    def _country(tweet):
        place = tweet['place']
        return place['country_code'] if place is not None else None

    def _coordinate(tweet, position):
        # the coordinate pair is indexed 0 for longitude, 1 for latitude
        point = tweet['coordinates']
        return point['coordinates'][position] if point is not None else None

    extractors = (
        ('text', lambda tweet: tweet['text']),
        ('location', lambda tweet: tweet['user']['location']),
        ('lang', lambda tweet: tweet['lang']),
        ('country_code', _country),
        ('long', lambda tweet: _coordinate(tweet, 0)),
        ('latt', lambda tweet: _coordinate(tweet, 1)),
    )

    frame = pd.DataFrame()
    for column, extract in extractors:
        frame[column] = [extract(tweet) for tweet in tweets]
    return frame

In [None]:
# Local paths of the JSON files collected for the '#GreyCup' and 'Grey Cup'
# searches (one tweet per line, as written by write_tweets).
# NOTE(review): these are absolute paths on one specific Windows machine;
# adjust them before running anywhere else.
local_path_1='C:/Users/Owner/Desktop/2018 - Mcmaster Data Analytics App/Courses/Fall 2018/BDA 102/Table of Contents/Lab Assignments/Project Files/Files From VM/GreyCup - Project Files/#GreyCup/#GreyCup_2018-11-21_to_2018-11-28.json'
local_path_2='C:/Users/Owner/Desktop/2018 - Mcmaster Data Analytics App/Courses/Fall 2018/BDA 102/Table of Contents/Lab Assignments/Project Files/Files From VM/GreyCup - Project Files/Grey/Grey_2018-11-21_to_2018-11-28.json'

In [None]:
# Read the JSON files (one tweet object per line) into the 'tweets' list

tweet_files = [local_path_1, local_path_2]
tweets = []
for path in tweet_files:
    with open(path, 'r') as handle:
        # each line of the file is one serialized tweet
        tweets.extend(json.loads(record) for record in handle)

In [None]:
greycup_tweets = populate_tweet_df(tweets)

In [None]:
len(greycup_tweets)

In [None]:
greycup_tweets.head()

In [None]:
greycup_tweets.dtypes

In [None]:
# Will require you to install pyproj and basemap
# Follow instructions here --> https://www.lfd.uci.edu/~gohlke/pythonlibs/
# Skip down to 'Basemap' title and install latest version 37/64bit OS pyproj + basemap

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

In [None]:
# plot the blank world map
my_map = Basemap(projection='merc', lat_0=50, lon_0=-100,
                     resolution = 'h', area_thresh = 5000.0,
                     llcrnrlon=-140, llcrnrlat=-55,
                     urcrnrlon=160, urcrnrlat=70)
# set resolution='h' for high quality

# draw elements onto the world map
my_map.drawcountries()
#my_map.drawstates()
my_map.drawcoastlines(antialiased=False, linewidth=0.005)

# add coordinates as red dots
# Keep only tweets that carry BOTH coordinates. The previous filter compared
# each column with `!= None`, which pandas evaluates as True for missing
# values too, so nothing was removed; filtering the columns separately could
# also have left the two lists misaligned.
located = greycup_tweets[['long', 'latt']].dropna()
longs = list(located['long'])
latts = list(located['latt'])
x, y = my_map(longs, latts)
my_map.plot(x, y, 'ro', markersize=6, alpha=0.5)

plt.show()

In [None]:
# Bind the shorter name 'tweets' to the greycup_tweets DataFrame for simplicity.
# This is an alias, not a copy: in-place changes made through 'tweets' also
# change greycup_tweets. (The name 'tweets' previously held the raw list.)
tweets = greycup_tweets

In [None]:
type(tweets)

In [None]:
tweets.tail(n=10)

In [None]:
import re

In [None]:
# Count top languages in which tweets were written
tweets_by_lang = tweets['lang'].value_counts()
tweets_by_lang[:3]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot the three most common languages in which the tweets were written
tweets_by_lang = tweets['lang'].value_counts()

fig, ax = plt.subplots()
# Draw the bars first: the pandas bar plot may set the x-axis label from the
# Series' index name, which would overwrite a label applied beforehand.
tweets_by_lang.iloc[:3].plot(ax=ax, kind='bar', color='red')
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Languages', fontsize=15)
ax.set_ylabel('Number of tweets' , fontsize=15)
ax.set_title('Top 3 languages', fontsize=15, fontweight='bold')

In [None]:
# Count top 3 user location from which tweets were sent
tweets_by_location = tweets['location'].value_counts()
tweets_by_location[2:5]

In [None]:
# Plot top user locations from which the tweets were sent
tweets_by_location = tweets['location'].value_counts()

fig, ax = plt.subplots()
# Draw the bars first: the pandas bar plot may set the x-axis label from the
# Series' index name, which would overwrite a label applied beforehand.
# NOTE(review): the slice [2:5] plots the 3rd-5th most frequent locations,
# presumably to skip two unwanted leading entries -- confirm against the data.
tweets_by_location.iloc[2:5].plot(ax=ax, kind='bar', color='orange')
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
# The axis shows free-text user locations, not countries.
ax.set_xlabel('User locations', fontsize=15)
ax.set_ylabel('Number of tweets' , fontsize=15)
ax.set_title('Top 3 User Locations', fontsize=15, fontweight='bold')

In [None]:
# Count top 2 countries from which tweets were sent
tweets_by_country = tweets['country_code'].value_counts()
tweets_by_country[0:2]

In [None]:
# Plot the two countries from which most tweets were sent
tweets_by_country = tweets['country_code'].value_counts()

fig, ax = plt.subplots()
# Draw the bars first: the pandas bar plot may set the x-axis label from the
# Series' index name, which would overwrite a label applied beforehand.
tweets_by_country.iloc[0:2].plot(ax=ax, kind='bar', color='green')
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Countries', fontsize=15)
ax.set_ylabel('Number of tweets' , fontsize=15)
ax.set_title('Top 2 Countries', fontsize=15, fontweight='bold')

In [None]:
# Create function to return boolean result if a word is found in the text
def word_in_text(word, text):
    """Return True when `word` occurs in `text`, ignoring case.

    The check is a plain substring test. The previous version passed the
    lower-cased word to re.search, so regex metacharacters in a search term
    (for example '+', '(' or '.') were treated as a pattern and could raise
    re.error or match unrelated text.

    Args:
        word: the term to look for.
        text: the text to search in.

    Returns:
        bool: True if the lower-cased word is contained in the lower-cased text.
    """
    return word.lower() in text.lower()

In [None]:
tweets.head()

In [None]:
# Add one boolean column per search term to the 'tweets' data frame.
# Each column tells whether the tweet text mentions that term; the list of
# terms CAN vary depending on the search 'texts'.
for term in ('#GreyCup', 'Grey Cup'):
    tweets[term] = tweets['text'].apply(
        lambda body, term=term: word_in_text(term, body))


In [None]:
tweets.head(n=10)

In [None]:
# Number of tweets that reference each topic.
# Summing the boolean column counts the True values and yields 0 when there
# are none; value_counts()[True] raised KeyError in that case.
print (int(tweets['#GreyCup'].sum()))
print (int(tweets['Grey Cup'].sum()))


In [None]:
# Common words before ANY text cleaning
# Top 5
freq = pd.Series(' '.join(tweets['text']).split()).value_counts()[:5]
freq

In [None]:
# Flag the relevant tweets, i.e. those mentioning at least one search term
tweets['Relevant'] = tweets['text'].apply(lambda tweet: word_in_text('#GreyCup', tweet) or word_in_text('Grey Cup', tweet))

# Print the count of relevant tweets.
# Summing the boolean column yields 0 when nothing is relevant, where
# value_counts()[True] raised KeyError.
print ('Relevant Tweets: ' + str(int(tweets['Relevant'].sum())))

In [None]:
tweets.head(n=6)

In [None]:
# Use function if required to clean text
def remove_emoji(string):
    """Return `string` with every run of characters in the ranges below removed.

    NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    covers many non-emoji characters -- confirm that this is intended.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    emoji_pattern = re.compile(character_class, flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [None]:
# Function to strip mentions, URLs, punctuation, RT/cc markers and extra whitespace
def clean_tweet(tweet):
    """Return `tweet` reduced to space-separated letters and digits.

    Replaced by a space, then collapsed:
      * @mentions (``@`` followed by letters/digits),
      * URLs (``scheme://...`` up to the next whitespace),
      * the stand-alone markers ``RT`` and ``cc``,
      * every character that is not an ASCII letter, digit, space or tab.
        This drops the ``#`` of a hashtag but keeps its word.

    Digits are kept on purpose: references such as "2018" are needed later.

    ``RT``/``cc`` are matched as whole words only. The previous pattern removed
    them anywhere, mangling ordinary words ("Soccer" -> "So er").

    Args:
        tweet: raw tweet text.

    Returns:
        str: the cleaned text with single spaces and no leading/trailing space.
    """
    noise = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|\b(RT|cc)\b"
    # split()/join collapses the runs of spaces left by the substitutions
    return ' '.join(re.sub(noise, " ", tweet).split())

In [None]:
# hold length of tweets dataframe
length = len(tweets)
length

In [None]:
# Clean and lower-case every 'text' record in a single pass.
# apply() works on each row in place of the manual counter with
# tweets.loc[i, 'text'], which depended on the index being exactly 0..n-1
# and carried a break condition (i == length) that could never be reached.
tweets['text'] = tweets['text'].apply(lambda text: clean_tweet(text).lower())


In [None]:
tweets.head(n=10)

In [None]:
# Generate heat map of tweets dataframe for any column(s) with NaNs/None/Nulls
import seaborn as sns
sns.heatmap(tweets.isnull(),yticklabels=False,cbar=False,cmap='gist_rainbow_r')

In [None]:
# Total # of NaNs/NULL value for each Columns/Records in data frame

null_columns=tweets.columns[tweets.isnull().any()]
tweets[null_columns].isnull().sum() 

In [None]:
# Drop column(s) if the majority of their values are None/Null/NaN.
# The geo columns (country_code, long, latt) are not required anymore, so
# they are removed from the DataFrame in place.
tweets.drop(['country_code','long','latt'], axis = 1, inplace = True)


In [None]:
tweets.head(n=6)

In [None]:
# Common words AFTER cleaning text (No STOPWORDS removed)
# Top 5
freq = pd.Series(' '.join(tweets['text']).split()).value_counts()[:5]
freq

In [None]:
# After the tweets have been cleaned of mentions, URLs, punctuation, RTs and
# extra whitespace, perform sentiment analysis

def get_tweet_sentiment(tweet):
    """Classify a tweet's sentiment from its TextBlob polarity.

    The text is passed through clean_tweet before being analysed.

    Returns:
        int: 4 for positive polarity, 2 for exactly zero (neutral),
        0 for negative.
    """
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity
    if polarity > 0:
        return 4  #positive
    if polarity == 0:
        return 2  #neutral
    return 0  #negative

In [None]:
# New column 'sentiment': the class attribute for sentiment analysis
# (4 = positive, 2 = neutral, 0 = negative)
tweets['sentiment'] = tweets['text'].apply(get_tweet_sentiment)

In [None]:
tweets.head()

In [None]:
print("\n")
sentiment_codes = tweets['sentiment']
tweet_total = len(tweets)

# share of positive tweets (code 4)
ptweets = sentiment_codes[sentiment_codes == 4].tolist()
print("Positive tweets percentage: {} %".format(100*len(ptweets)/tweet_total))

# share of negative tweets (code 0)
ntweets = sentiment_codes[sentiment_codes == 0].tolist()
print("Negative tweets percentage: {} %".format(100*len(ntweets)/tweet_total))

# share of neutral tweets (code 2)
neutweets = sentiment_codes[sentiment_codes == 2].tolist()
print("Neutral tweets percentage: {} %".format(100*len(neutweets)/tweet_total))

print("\n")

In [None]:
# Plot distribution of sentiments of the analysed tweets

prg_langs = ['Positive', 'Negative', 'Neutral']
tweets_by_prg_lang = [len(ptweets), len(ntweets), len(neutweets)]

x_pos = list(range(len(prg_langs)))
width = 0.8
fig, ax = plt.subplots()
# Centre every bar on its x position explicitly and place the ticks at the
# same positions, so each label sits under the middle of its bar. The old
# tick offset (p + 0.4 * width) only lined up with edge-aligned bars.
ax.bar(x_pos, tweets_by_prg_lang, width, alpha=1, color='y', align='center')

# Setting axis labels and ticks
ax.set_ylabel('Number of tweets', fontsize=15)
ax.set_title('Ranking: Positive vs Negative vs Neutral', fontsize=10, fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(prg_langs)
ax.grid()

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download these packages if required
#nltk.download('punkt')
#nltk.download('stopwords')

In [None]:
stop = stopwords.words('english')
stop

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer
# lst=LancasterStemmer() # more aggressive - so avoid using this stemming technique

In [None]:
def stem_words(words):
    """Return the Porter stem of every word in `words`, in the same order."""
    porter = PorterStemmer()  # less aggressive than the Lancaster stemmer
    return [porter.stem(token) for token in words]

In [None]:
# Tokenize and stem every tweet, then drop the stems that are stopwords.
# apply() works on each row in place of the manual counter with
# tweets.loc[i, 'text'], which depended on the index being exactly 0..n-1
# and carried a break condition (i == length) that could never be reached.
stop_set = set(stop)  # constant-time membership instead of a list scan
tweets['text'] = tweets['text'].apply(
    lambda text: " ".join(stem for stem in stem_words(word_tokenize(text))
                          if stem not in stop_set))

In [None]:
# Common words AFTER ALL text cleaning is complete (Stemmed text)
# Top 5
freq = pd.Series(' '.join(tweets['text']).split()).value_counts()[:4]
freq = freq.to_frame()
freq.columns = ['count']
freq

In [None]:
# Bar chart of the most frequent words held in 'freq'.
# Labels and heights both come from 'freq' so they always describe the same
# words; the labels were previously hard-coded and could disagree with the
# data. .iloc is used because freq is indexed by word, not by position.
top_words = freq['count'].iloc[:4]
prg_langs = [str(word) for word in top_words.index]
tweets_by_prg_lang = top_words.tolist()

x_pos = list(range(len(prg_langs)))
width = 0.8
fig, ax = plt.subplots()
# Centre every bar on its x position and put the ticks at the same positions
# so each label sits under the middle of its bar.
ax.bar(x_pos, tweets_by_prg_lang, width, alpha=1, color='r', align='center')

# Setting axis labels and ticks
ax.set_ylabel('Number of Occurences', fontsize=15)
ax.set_title('- Common Words Ranking -', fontsize=10, fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(prg_langs)
ax.grid()

In [None]:
# Install first if needed: run "pip install wordcloud" in a terminal window
from wordcloud import WordCloud

In [None]:
def wordcloud(tweets,col):
    """Draw a word cloud built from all the text in column `col` of `tweets`.

    The cloud uses a white background, the module-level stopword list `stop`
    and a fixed random_state (2016). It is drawn on a new 15x10 figure with
    the axes hidden.
    """
    corpus = " ".join(tweets[col])
    cloud = WordCloud(background_color="white", stopwords=stop,
                      random_state=2016).generate(corpus)
    plt.figure(figsize=(15, 10), facecolor='k')
    plt.imshow(cloud)
    plt.axis("off")
    plt.title("Good Morning Datascience+")
In [None]:
# WordCloud of most commonly occuring words
wordcloud(tweets,'text')

In [None]:
tweets.head()

In [None]:
len(tweets)

In [None]:
# Split up the data into a training and test set
from sklearn.model_selection import train_test_split

In [None]:
# Class attribute (sentiment code) used for sentiment analysis
# Display the first 12 values
tweets['sentiment'].head(n=12)

In [None]:
tweets['text'].head()

In [None]:
# Separate the features from the class attribute:
# X holds every column except 'sentiment', y holds the 'sentiment' labels
X = tweets.drop('sentiment', axis = 1)
y = tweets['sentiment']

In [None]:
# Split dataset 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state = 5) # 80% - 20% split

In [None]:
X_train.head()

In [None]:
# Type = Series
y_train.head()

In [None]:
X_train = list(X_train['text'])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(stop_words = 'english')

In [None]:
text = vec.fit_transform(X_train)

In [None]:
print(vec.vocabulary_)

In [None]:
text.todense()

In [None]:
X_test.head()

In [None]:
y_test.head()

In [None]:
X_test = list(X_test['text'])

In [None]:
type(X_test)

In [None]:
vec.transform(X_test).toarray()

In [None]:
import pandas as pd

In [None]:
# Create dataframe for the train dataset (X_train): one row per tweet and one
# column per vocabulary term, holding the term counts
train_counts = vec.fit_transform(X_train)
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
feature_names = (vec.get_feature_names_out()
                 if hasattr(vec, 'get_feature_names_out')
                 else vec.get_feature_names())
df = pd.DataFrame(train_counts.toarray(), columns=feature_names)
df

In [None]:
# Create dataframe for the test dataset (X_test), encoded with the vocabulary
# already fitted on the training data
test_counts = vec.transform(X_test)
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
feature_names = (vec.get_feature_names_out()
                 if hasattr(vec, 'get_feature_names_out')
                 else vec.get_feature_names())
test_df = pd.DataFrame(test_counts.toarray(), columns=feature_names)
test_df

In [None]:
df_train = y_train.to_frame()
df_train.columns = ['class']

df_test = y_test.to_frame()
df_test.columns = ['class']

In [None]:
x_train = df
y_train = df_train['class']

x_test = test_df
y_test = df_test['class']

In [None]:
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB 
from sklearn.naive_bayes import MultinomialNB

In [None]:
# MODEL 1:
# Build Naive_Bayes supervised learning model and compute accuracy against test dataset

algorithm_a = MultinomialNB()

In [None]:
# fit model with train dataset
algorithm_a.fit(x_train, y_train)

In [None]:
output = algorithm_a.predict(x_test)

In [None]:
# Test model accuracy with Test dataset (class, output)
metrics.accuracy_score(y_test, output)

In [None]:
# Model predicted confusion matrix 
metrics.confusion_matrix(y_test,output)

In [None]:
# MODEL 2:
# Build support vector machines (SVM) learning model

In [None]:
tweets.head()

In [None]:
#LINEAR KERNEL (SVM) Learning Model

import sklearn.svm as svm
clf = svm.SVC(kernel='linear')

In [None]:
# Train classifier 
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test) 

In [None]:
# Evaluate the linear-kernel SVM on the test set.
# Author's observation: performs slightly better than Naive Bayes.

from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

In [None]:
# MODEL 3:

In [None]:
# Polynomial Kernel (SVM)
from sklearn.svm import SVC  
svclassifier = SVC(kernel='poly', degree=8)  
svclassifier.fit(x_train, y_train) 

In [None]:
y_pred = svclassifier.predict(x_test)  

In [None]:
#Compare confusion matrix results:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 
