# Twitter Code Review
#### Goals: 
- pull twitter data using roshan's code 
- pull twitter data using wisdm code 
- compare both 
- identify features 

In [10]:
import tweepy
import json
import pandas as pd
from datetime import datetime
from datetime import timedelta
#import preprocessor as pre
import regex as re
import time


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Roshan's code

In [11]:
import tweepy
import configparser
import pandas as pd
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#analyser = SentimentIntensityAnalyzer()

# read configs
config = configparser.ConfigParser()
config.read(r"C:\Users\Dennis\Desktop\Wisdm\wisdmai\Data\.Archive\Twitter\config.ini")

api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']

access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

# authentication
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

# user tweets
# user = 'veritasium'
# limit=300

# tweets = tweepy.Cursor(api.user_timeline, screen_name=user, count=200, tweet_mode='extended').items(limit)

# search tweets
keywords = '$TSLA'
limit = 100

tweets = tweepy.Cursor(api.search_tweets, q=keywords,
                       count=100, tweet_mode='extended').items(limit)

# tweets = api.user_timeline(screen_name=user, count=limit, tweet_mode='extended')

# create DataFrame
columns = ['Time', 'User', 'Tweet']
data = []

for tweet in tweets:
    # if tweet.user.verified == True:
    data.append([tweet.created_at, tweet.user.screen_name,
                tweet.full_text])

df = pd.DataFrame(data, columns=columns)

#df.to_csv('tweets.csv')

In [12]:
df.head()

Unnamed: 0,Time,User,Tweet
0,2022-11-05 23:36:30+00:00,JosephHoskins19,RT @cvalente28: I do not think it has sunk in ...
1,2022-11-05 23:35:36+00:00,TeslaGoesPlaid,RT @DoctorJack16: I’m looking forward to the d...
2,2022-11-05 23:34:33+00:00,FreckmanT,RT @alex_avoigt: It's strongly bullish that wh...
3,2022-11-05 23:34:25+00:00,EbertJerald,Join Trade-Idea Daily Support Sessions Live! \...
4,2022-11-05 23:34:04+00:00,truedm8,@elonmusk @TomFitton @Twitter @yoyoel @Judicia...


# Wisdm Code 

In [41]:
# Function: preprocess tweet text
def cleantweet(tweet):
    #https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e
    #cleantweet = pre.clean(tweet)
    cleantweet = tweet.lower()
    cleantweet = re.sub('\d+', '', cleantweet)
    cleantweet = re.sub(r'[^\w\s]', '', cleantweet)  
    return cleantweet
    

In [14]:
# Function: identify tweet quality
def tweetquality(user_verified, favorite_count, retweet_count):
    if user_verified == True or favorite_count > 100 or retweet_count > 10:
        return True
    else:
        return False

In [15]:
# Function: twitter search pagination and rate limit handling
def limit_handled(cursor):
    while True:
        try:
            yield cursor.next()
        except StopIteration:
            break
        except tweepy.error:
            print('Reached rate limit. Sleeping for >15 minutes')
            time.sleep(15 * 61)

In [36]:
# Function: obtain query list from ticker list
def getqueries(tickers): #returns a list of query strings
    queries = []
    for ticker in tickers:
        querylist = []
        querylist.append('$'+ticker)
        othertickers=[]
        [othertickers.append(t) for t in tickers]
        othertickers.remove(ticker)
        for otherticker in othertickers:
            querylist.append(' -$'+otherticker) 
        tickerquery = ''.join([str(q) for q in querylist]) + ' -filter:retweets' #Exclude retweets
        queries.append(tickerquery)
    return queries


tickers = ['TSLA', 'GME', 'SPY']
getqueries(tickers)[0]
# first ticker is getting queried, tickers with '-' in front are not queried

'$TSLA -$GME -$SPY -filter:retweets'

In [51]:
def get_tweets(query, since_id, until_date, max_tweets):
    search = limit_handled(tweepy.Cursor(api.search_tweets,
                                            q = query, 
                                            count = 100,
                                            tweet_mode='extended',
                                            lang='en',
                                            result_type="recent",
                                            ).items(max_tweets))

    dftweets = pd.DataFrame()
    #cycle through generator 
    for tweet in search:
        dftweets = pd.concat([dftweets, pd.json_normalize(tweet._json)])

    print(query,'\n','# tweets collected:', len(dftweets), '\n')

    try:
        dftweets['full_text_preprocessed'] = dftweets.apply(lambda row : cleantweet(row['full_text']), axis = 1)
        dftweets['quality'] = dftweets.apply(lambda row : tweetquality(row['user.verified'], row['favorite_count'], row['retweet_count']), axis = 1)
        dftweets['num_cashtags'] = dftweets.apply(lambda row : str(row['entities.symbols']).count('text'), axis = 1)
        dftweets['ticker'] = dftweets.apply(lambda row : query.split()[0], axis = 1)
        dftweets['query_params'] = dftweets.apply(lambda row : 'query:'+query+' since_id:'+str(since_id)+' until_date:'+str(until_date)+' max_tweets:'+str(max_tweets), axis = 1)
        #apply filter
        dftweets = dftweets[dftweets.num_cashtags == 1]
        # return output
        return dftweets
            
    except Exception:
        #for debugging purposes
        print('\n', 'preprocessing broke!!!!!', '\n')
        pass
    finally:
        print('# tweets (filtered):',len(dftweets),'\n') 
    


In [52]:
#will be provided by scoping algorithm 
tickers = ['TSLA']
#'TWTR','AMC','SPY','HMHC','DWAC','AMD','SST','AAPL','AMZN','NVDA','TLRY','NFLX','QQQ','PLTR','FB','BABA','VIX','SOFI','TEAM','RBLX','RSX','WISH','OSU']
max_tweets = 100 #max per ticker

print('tickers:', tickers)
print('max_tweets:', max_tweets, '\n')

# Set date/time parameters
cur_time_utc = datetime.utcnow().replace(microsecond=0)
until_date = cur_time_utc.strftime("%Y-%m-%d") #"2022-04-03"#"2022-04-25"#
from_date =  cur_time_utc - timedelta(days=1) #"2022-04-02"#"2022-04-24"#
from_date = from_date.strftime("%Y-%m-%d")

print('from date:', from_date) 
print('until date:', until_date, '\n')


# API Authentication
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)

#wrapper for Twitter API 
api = tweepy.API(auth, wait_on_rate_limit=True)

#get ticker queries 
queries = getqueries(tickers)
print('first queury:', queries[0], '\n')


# Find the last tweet id for from_date (need this to filter on from_date)
search_since_id = limit_handled(tweepy.Cursor(api.search_tweets, 
                                                q = 'A', #query does not matter
                                                tweet_mode = 'extended', 
                                                lang = 'en', 
                                                result_type = 'recent', 
                                                until = from_date
                                                ).items(1))

since_id  = [tweet._json['id'] for tweet in search_since_id][0]
print('since last tweet id:', since_id, '\n')

#tweets dataframe for all tweets (unfiltered) - might be able to store seperately 
dftweets = pd.DataFrame()
#pulling tweets 


for query in queries:
    new_tweets = get_tweets(query, since_id, until_date, max_tweets)
    dftweets = pd.concat([dftweets, new_tweets], axis = 0, ignore_index = True)



tickers: ['TSLA']
max_tweets: 100 

from date: 2022-11-04
until date: 2022-11-05 

first queury: $TSLA -filter:retweets 

since last tweet id: 1588320116512473089 

$TSLA -filter:retweets 
 # tweets collected: 100 

# tweets (filtered): 43 



In [54]:
dftweets[['ticker','created_at','full_text_preprocessed','user.verified','favorite_count','retweet_count','quality','entities.symbols','num_cashtags','query_params']].head()

Unnamed: 0,ticker,created_at,full_text_preprocessed,user.verified,favorite_count,retweet_count,quality,entities.symbols,num_cashtags,query_params
0,$TSLA,Sat Nov 05 23:51:54 +0000 2022,elonmusk wsbchairman billymk twitter better be...,False,0,0,False,"[{'text': 'tsla', 'indices': [94, 99]}]",1,query:$TSLA -filter:retweets since_id:15883201...
1,$TSLA,Sat Nov 05 23:51:35 +0000 2022,at this moment i feel like elon shouldve staye...,False,1,0,False,"[{'text': 'TSLA', 'indices': [65, 70]}]",1,query:$TSLA -filter:retweets since_id:15883201...
2,$TSLA,Sat Nov 05 23:51:23 +0000 2022,avatrode he knows something tsla,False,0,0,False,"[{'text': 'TSLA', 'indices': [30, 35]}]",1,query:$TSLA -filter:retweets since_id:15883201...
3,$TSLA,Sat Nov 05 23:50:24 +0000 2022,raytesla he needs to pick up tsla as they dont...,False,1,0,False,"[{'text': 'TSLA', 'indices': [31, 36]}]",1,query:$TSLA -filter:retweets since_id:15883201...
4,$TSLA,Sat Nov 05 23:47:52 +0000 2022,tsla awaiting short signal based off signals...,False,0,0,False,"[{'text': 'TSLA', 'indices': [0, 5]}]",1,query:$TSLA -filter:retweets since_id:15883201...


# Feature Selection

In [55]:
dftweets.shape

(43, 167)

In [58]:
# https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/api-reference/get-search-tweets#example-response

for feature in dftweets.columns:
    print(feature)

created_at
id
id_str
full_text
truncated
display_text_range
source
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
in_reply_to_screen_name
geo
coordinates
place
contributors
is_quote_status
retweet_count
favorite_count
favorited
retweeted
possibly_sensitive
lang
entities.hashtags
entities.symbols
entities.user_mentions
entities.urls
entities.media
extended_entities.media
metadata.iso_language_code
metadata.result_type
user.id
user.id_str
user.name
user.screen_name
user.location
user.description
user.url
user.entities.url.urls
user.entities.description.urls
user.protected
user.followers_count
user.friends_count
user.listed_count
user.created_at
user.favourites_count
user.utc_offset
user.time_zone
user.geo_enabled
user.verified
user.statuses_count
user.lang
user.contributors_enabled
user.is_translator
user.is_translation_enabled
user.profile_background_color
user.profile_background_image_url
user.profile_background_image_url_https
user.profile_