In [1]:
import sys
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

import matplotlib.dates as mdates
import seaborn as sns
sns.set()


# to view all columns
pd.set_option("display.max.columns", None)

In [2]:
#Import the necessary methods from tweepy library  

#install tweepy if you don't have it
#!pip install tweepy
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API

#sentiment analysis package
#!pip install textblob
from textblob import TextBlob

#general text pre-processor
#!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')

#tweet pre-processor 
#!pip install tweet-preprocessor
import preprocessor as p

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JohnnnySimple\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
class tweetsearch():
    '''
    This is a basic class to search and download twitter data.
    You can build up on it to extend the functionalities for more 
    sophisticated analysis
    '''
    def __init__(self, cols=None,auth=None):
        #
        if not cols is None:
            self.cols = cols
        else:
            self.cols = ['id', 'created_at', 'source', 'original_text','clean_text', 
                    'sentiment','polarity','subjectivity', 'lang',
                    'favorite_count', 'retweet_count', 'original_author',   
                    'possibly_sensitive', 'hashtags',
                    'user_mentions', 'place', 'place_coord_boundaries']
            
        if auth is None:
            
            #Variables that contains the user credentials to access Twitter API 
            consumer_key = os.environ.get('TWITTER_API_KEY')
            consumer_secret = os.environ.get('TWITTER_API_SECRET')
            access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
            access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')
            


            #This handles Twitter authetification and the connection to Twitter Streaming API
            auth = OAuthHandler('TWITTER_API_KEY', 'TWITTER_API_SECRET')
            auth.set_access_token('TWITTER_ACCESS_TOKEN', 'TWITTER_ACCESS_TOKEN_SECRET')
            

        #            
        self.auth = auth
        self.api = tweepy.API(auth,wait_on_rate_limit=True) 
        self.filtered_tweet = ''
        
            

    def clean_tweets(self, twitter_text):

        #use pre processor
        tweet = p.clean(twitter_text)

         #HappyEmoticons
        emoticons_happy = set([
            ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
            ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
            '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
            'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
            '<3'
            ])

        # Sad Emoticons
        emoticons_sad = set([
            ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
            ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
            ':c', ':{', '>:\\', ';('
            ])

        #Emoji patterns
        emoji_pattern = re.compile("["
                 u"\U0001F600-\U0001F64F"  # emoticons
                 u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                 u"\U0001F680-\U0001F6FF"  # transport & map symbols
                 u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                 u"\U00002702-\U000027B0"
                 u"\U000024C2-\U0001F251"
                 "]+", flags=re.UNICODE)

        #combine sad and happy emoticons
        emoticons = emoticons_happy.union(emoticons_sad)

        stop_words = set(stopwords.words('english'))
        word_tokens = nltk.word_tokenize(tweet)
        #after tweepy preprocessing the colon symbol left remain after      
        #removing mentions
        tweet = re.sub(r':', '', tweet)
        tweet = re.sub(r'‚Ä¶', '', tweet)

        #replace consecutive non-ASCII characters with a space
        tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)

        #remove emojis from tweet
        tweet = emoji_pattern.sub(r'', tweet)

        #filter using NLTK library append it to a string
        filtered_tweet = [w for w in word_tokens if not w in stop_words]

        #looping through conditions
        filtered_tweet = []    
        for w in word_tokens:
        #check tokens against stop words , emoticons and punctuations
            if w not in stop_words and w not in emoticons and w not in string.punctuation:
                filtered_tweet.append(w)

        return ' '.join(filtered_tweet)            

    def get_users_data(self, lst):
        cols = ["name", "screen_name", "created_at", "followers_count", "friends_count", "statuses_count"]
        data_df = pd.DataFrame(columns=cols)
        if len(lst) > 0:
            for target in lst:
#                 print("Getting data for " + target)
                item = self.api.get_user(target)
                data_df.loc[len(data_df)] = [item.name, item.screen_name, item.created_at, item.followers_count,
                                            item.friends_count, item.statuses_count]
#                 print("name: " + item.name)
#                 print("screen_name: " + item.screen_name)
#                 print("description: " + item.description)
#                 print("statuses_count: " + str(item.statuses_count))
#                 print("friends_count: " + str(item.friends_count))
#                 print("followers_count: " + str(item.followers_count))
#                 print("created_at: " + str(item.created_at))
#                 print("favourites_count:" + str(item.favourites_count))
                
        return data_df
    
    
    def retweet_data(self, username):
        print(self.api.retweets_of_me(username))
        
        
    def get_user_tweets(self, username):
        #100 tweets to be extracted
        number_of_tweets = 100
        tweets = self.api.user_timeline(screen_name=username)
        
        tmp = []
        
        # create array of tweet information: username,  
        # tweet id, date/time, text 
        tweets_for_csv = [tweet.text for tweet in tweets] # CSV file created  
        for j in tweets_for_csv: 
  
            # Appending tweets to the empty array tmp 
            tmp.append(j)  
  
        # Printing the tweets 
        print(tmp)
        return tmp
    
    def get_all_users_tweets(self, lst):
        cols = ['id', 'screen_name', 'created_at', 'tweet_id', 'original_text', 
                    'retweet_count', 'hashtags', 'user_mentions', 'likes'] 
        df_data = pd.DataFrame(columns=cols)
        for i in lst:
            counter = 0
            for status in tweepy.Cursor(self.api.user_timeline, screen_name=i, tweet_mode='extended').items(100):
                counter =  counter + 1
                    
                df_data.loc[len(df_data)] = [status.user.id, status.user.screen_name, status.created_at, status.id,
                                             status.full_text, status.retweet_count, status.entities['hashtags'],
                                             status.entities['user_mentions'], status.favorite_count]
                
        return df_data
    
    def get_replies(self, names, ids):
        replies_list = []
        for name, tweet_id in zip(names, ids):
            replies_count = 0
            for tweet in tweepy.Cursor(self.api.search, q='to:'+name, result_type='recent', timeout=999999).items(1000):
                if hasattr(tweet, 'in_reply_to_status_id_str'):
                    if (tweet.in_reply_to_status_id_str==tweet_id):
                        replies_count = replies_count + 1
            replies_list.append(replies_count)
        
#         for i in range(len(top)):
#             replies_count = 0
#             for tweet in tweepy.Cursor(self.api.search, q='to:'+top['screen_name'].loc[i], timeout=999999).items(10):
#                 if hasattr(tweet, 'in_reply_to_status_id_str'):
#                     if(tweet.in_reply_to_status_id_str == top['tweet_id'].loc[i]):
#                         replies_count = replies_count + 1
#             replies_list.append(replies_count)
        return replies_list
        
        
    def get_active_users(self, lst):
        
        new_lst = []
        for handle in lst:
            try:
                u = self.api.get_user(handle)
                new_lst.append(u.screen_name)
            except Exception:
                pass
        return new_lst
    
    
    def get_likes(self, username):
        favorites = self.api.favorites(username)
        for status in favorites:
            print(status._json['favorite_count'])
    
    
#     def get_comments(self, lst):
            
    
    
    def get_tweets(self, keyword, csvfile=None):
        
        
        df = pd.DataFrame(columns=self.cols)
        
        if not csvfile is None:
            #If the file exists, then read the existing data from the CSV file.
            if os.path.exists(csvfile):
                df = pd.read_csv(csvfile, header=0)
            

        #page attribute in tweepy.cursor and iteration
        for page in tweepy.Cursor(self.api.search, q=keyword,count=100, include_rts=False,tweet_mode='extended').pages(10):

            # the you receive from the Twitter API is in a JSON format and has quite an amount of information attached
            for status in page:
                
                new_entry = []
                status = status._json
                
                #filter by language
                #if status['lang'] != 'en':
                #    continue

                
                #if this tweet is a retweet update retweet count
                if status['created_at'] in df['created_at'].values:
                    i = df.loc[df['created_at'] == status['created_at']].index[0]
                    #
                    cond1 = status['favorite_count'] != df.at[i, 'favorite_count']
                    cond2 = status['retweet_count'] != df.at[i, 'retweet_count']
                    if cond1 or cond2:
                        df.at[i, 'favorite_count'] = status['favorite_count']
                        df.at[i, 'retweet_count'] = status['retweet_count']
                    continue

                #calculate sentiment
                filtered_tweet = self.clean_tweets(status['full_text'])
                blob = TextBlob(filtered_tweet)
                Sentiment = blob.sentiment     
                polarity = Sentiment.polarity
                subjectivity = Sentiment.subjectivity

                new_entry += [status['id'], status['created_at'],
                              status['source'], status['full_text'], filtered_tweet, 
                              Sentiment,polarity,subjectivity, status['lang'],
                              status['favorite_count'], status['retweet_count']]

                new_entry.append(status['user']['screen_name'])

                try:
                    is_sensitive = status['possibly_sensitive']
                except KeyError:
                    is_sensitive = None

                new_entry.append(is_sensitive)

                hashtags = ", ".join([hashtag_item['text'] for hashtag_item in status['entities']['hashtags']])
                new_entry.append(hashtags) #append the hashtags

                #
                mentions = ", ".join([mention['screen_name'] for mention in status['entities']['user_mentions']])
                new_entry.append(mentions) #append the user mentions

                try:
                    xyz = status['place']['bounding_box']['coordinates']
                    coordinates = [coord for loc in xyz for coord in loc]
                except TypeError:
                    coordinates = None
                #
                new_entry.append(coordinates)

                try:
                    location = status['user']['location']
                except TypeError:
                    location = ''
                #
                new_entry.append(location)

                #now append a row to the dataframe
                single_tweet_df = pd.DataFrame([new_entry], columns=self.cols)
                df = df.append(single_tweet_df, ignore_index=True)

        #
        df['timestamp'] = df.created_at.map(pd.Timestamp)
        df = df.sort_values('timestamp').set_index('timestamp')
        df = df.drop('id',axis=1)
        
        if not csvfile is None:
            #save it to file
            df.to_csv(csvfile, columns=self.cols, index=True, encoding="utf-8")
            

        return df
    
    def get_users(self, username):
        user_objects = self.api.lookup_users(Name=username)
        user_id = [user.id_str for user in user_objects]
        return user_id

In [24]:
ts = tweetsearch()

In [25]:
ts.get_users('johnson')

TypeError: lookup_users() got an unexpected keyword argument 'Name'