# Load data

In [None]:
import pandas as pd

# Each dataset is read immediately after the path it is stored under.

# US geolocated tweets, filtered
extracted_relevant_tweets_path = "../data/relevant_tweets_twitterfeed.csv"
extracted_relevant_tweets = pd.read_csv(extracted_relevant_tweets_path)

# Filtered Stream tweets
filtered_stream_tweets_path = "../data/filtered_stream_tweets.csv"
filtered_stream_tweets = pd.read_csv(filtered_stream_tweets_path)

# Eszter's tweets containing Uber related words
eszters_tweets_words_path = "../data/tweets1.csv"
eszters_tweets_words = pd.read_csv(eszters_tweets_words_path)

# Eszter's tweets containing Uber related hashtags
eszters_tweets_hashtags_path = "../data/tweets2.csv"
eszters_tweets_hashtags = pd.read_csv(eszters_tweets_hashtags_path)

In [None]:
# Report the number of rows in every loaded dataset, one line per dataset.
print("Number of tweets:")
for count_template, tweets_frame in (
    ("US geolocated tweets, filtered:\t%d", extracted_relevant_tweets),
    ("Filtered Stream tweets:\t\t%d", filtered_stream_tweets),
    ("Eszter's Tweets containing Uber related words: %d", eszters_tweets_words),
    ("Eszter's Tweets containing Uber related hashtags: %d", eszters_tweets_hashtags),
):
    # len() of a DataFrame is its number of rows
    print(count_template % len(tweets_frame))

## Most common hashtags

In [None]:
import re
from collections import Counter
from pprint import pprint

def hashtags_counter(df):
    """Count how often each hashtag occurs in the ``text`` column of ``df``.

    Hashtags are matched case-insensitively (the text is lower-cased before
    matching) with the pattern ``#\\w+``. A hashtag that appears several times
    in one tweet is counted once per appearance.

    Args:
        df: DataFrame with a ``text`` column holding the tweet texts.

    Returns:
        collections.Counter mapping each lower-cased hashtag (including the
        leading ``#``) to its number of occurrences.

    Raises:
        KeyError: if ``df`` has no ``text`` column.
    """
    hashtag_reg = re.compile(r"(#\w+)")
    counts = Counter()
    for text in df['text']:
        # Empty CSV fields are read by pandas as NaN (a float); such cells
        # have no .lower() and cannot contain hashtags, so skip them.
        if not isinstance(text, str):
            continue
        counts.update(hashtag_reg.findall(text.lower()))
    return counts


# Hashtag frequencies for each of the four datasets.
extracted_relevant_tweets_counts = hashtags_counter(extracted_relevant_tweets)
filtered_stream_tweets_counts = hashtags_counter(filtered_stream_tweets)
eszters_tweets_words_counts = hashtags_counter(eszters_tweets_words)
eszters_tweets_hashtags_counts = hashtags_counter(eszters_tweets_hashtags)

# (header template, source frame, hashtag counts) for every report section.
hashtag_report_sections = [
    ("US geolocated tweets (%s), filtered:",
     extracted_relevant_tweets, extracted_relevant_tweets_counts),
    ("Filtered stream tweets (%s):",
     filtered_stream_tweets, filtered_stream_tweets_counts),
    ("Eszters Tweets containing Uber related words (%s):\n",
     eszters_tweets_words, eszters_tweets_words_counts),
    ("Eszters Tweets containing Uber related hashtags (%s):\n",
     eszters_tweets_hashtags, eszters_tweets_hashtags_counts),
]

print("10 most common hashtags:\n")
for section_no, (section_header, section_frame, section_counts) in enumerate(hashtag_report_sections):
    if section_no > 0:
        # blank separator between consecutive sections
        print("\n")
    print(section_header % len(section_frame))
    print(*section_counts.most_common(10), sep="\n")

## Most common hashtag pairs

In [None]:
import itertools

def hashtag_pairs_counter(df):
    """Count how many tweets contain each pair of distinct hashtags.

    Hashtags are matched case-insensitively (the text is lower-cased before
    matching) with the pattern ``#\\w+``. Within a tweet the hashtags are
    de-duplicated and sorted before pairing, so:

    * a pair is counted at most once per tweet,
    * ``('#a', '#b')`` and ``('#b', '#a')`` are the same pair (always
      reported in alphabetical order), and
    * a hashtag repeated inside one tweet never forms a pair with itself.

    Args:
        df: DataFrame with a ``text`` column holding the tweet texts.

    Returns:
        collections.Counter mapping each alphabetically ordered 2-tuple of
        hashtags to the number of tweets in which both occur.

    Raises:
        KeyError: if ``df`` has no ``text`` column.
    """
    hashtag_reg = re.compile(r"(#\w+)")
    pair_counts = Counter()
    for text in df['text']:
        # Empty CSV fields are read by pandas as NaN (a float); skip anything
        # that is not text instead of failing on .lower().
        if not isinstance(text, str):
            continue
        # Sorting the unique hashtags gives every pair one canonical order and
        # makes the insertion order (and thus tie order) reproducible.
        unique_hashtags = sorted(set(hashtag_reg.findall(text.lower())))
        pair_counts.update(itertools.combinations(unique_hashtags, 2))
    return pair_counts

# Hashtag pair frequencies for each of the four datasets.
extracted_relevant_tweets_pairs_counts = hashtag_pairs_counter(extracted_relevant_tweets)
filtered_stream_tweets_pairs_counts = hashtag_pairs_counter(filtered_stream_tweets)
eszters_tweets_words_pairs_counts = hashtag_pairs_counter(eszters_tweets_words)
eszters_tweets_hashtags_pairs_counts = hashtag_pairs_counter(eszters_tweets_hashtags)

# (header template, source frame, pair counts) for every report section.
pair_report_sections = [
    ("US geolocated tweets (%s), filtered:",
     extracted_relevant_tweets, extracted_relevant_tweets_pairs_counts),
    ("Filtered stream tweets (%s):",
     filtered_stream_tweets, filtered_stream_tweets_pairs_counts),
    ("Eszters Tweets containing Uber related words (%s):\n",
     eszters_tweets_words, eszters_tweets_words_pairs_counts),
    ("Eszters Tweets containing Uber related hashtags (%s):\n",
     eszters_tweets_hashtags, eszters_tweets_hashtags_pairs_counts),
]

print("10 most common hashtag pairs:\n")
for pair_section_no, (pair_header, pair_frame, pair_counts) in enumerate(pair_report_sections):
    if pair_section_no > 0:
        # blank separator between consecutive sections
        print("\n")
    print(pair_header % len(pair_frame))
    print(*pair_counts.most_common(10), sep="\n")

## Most popular Uber-related tweets based on retweet counts

In [None]:
!pip3 install twint

In [136]:
import twint

# Create a twint configuration object (not referenced by the other code in this cell).
c = twint.Config()

# Find most popular tweets based on retweet counts
def get_retweet_count(df):
    """Print the value in the second column of every row of ``df``.

    Despite its name, this function does not look up retweet counts; it only
    prints one value per row.
    NOTE(review): the second column is presumably the tweet ID - confirm
    against the CSV header.

    The column is selected by position with ``.iloc`` and iterated directly,
    instead of indexing each ``iterrows()`` row with ``data[1]``: integer keys
    on a label-indexed Series are deprecated positional lookups, and
    ``iterrows()`` coerces every row to a single dtype, which can turn integer
    values into floats.

    Args:
        df: DataFrame with at least two columns.

    Returns:
        None. The values are written to standard output, one per line.

    Raises:
        IndexError: if ``df`` has fewer than two columns.
    """
    for tweet_id in df.iloc[:, 1]:
        print(tweet_id)
        
        
# Run over the US geolocated, filtered tweets; this prints one line per row of the frame.
get_retweet_count(extracted_relevant_tweets)

1.2486269261906043e+18
1.2486485884413993e+18
1.2486737384864197e+18
1.248674391111553e+18
1.248674626680623e+18
1.2486762650428088e+18
1.248677799759659e+18
1.248678013820166e+18
1.248678178924761e+18
1.2486786797539533e+18
1.248681921359614e+18
1.2486850277366538e+18
1.2486907211979653e+18
1.2486933931332076e+18
1.2486934042189169e+18
1.248694219423834e+18
1.2486971352493998e+18
1.2486972517168292e+18
1.2486995813757133e+18
1.2486998205518316e+18
1.248704702574809e+18
1.2487147862186107e+18
1.2489263143518413e+18
1.2489571176687452e+18
1.2489592716996936e+18
1.2489645206027428e+18
1.2489662874826424e+18
1.2489720921267036e+18
1.2489723596478464e+18
1.2489743012792033e+18
1.248975763774333e+18
1.2489804054343598e+18
1.248989533607248e+18
1.2489908236954214e+18
1.2489909848868004e+18
1.2489910649057157e+18
1.248991717455528e+18
1.2489920749611663e+18
1.2489930065413202e+18
1.2489946259578348e+18
1.2489973781175132e+18
1.2489993773072384e+18
1.249004473608962e+18
1.2490054900021658e+18
