In [3]:
import glob
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from collections import Counter
from scipy.stats import chi2_contingency, kruskal
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

font = {'size'   : 45}

matplotlib.rc('font', **font)

In [4]:
def get_head_tail(df, mode='retweet_count', head_perc=0.95, tail_perc=0.5):
    """
    Given a dataframe `df` return the head and tail of the dataframe according to mode provided.
    Args:
        df (pd.DataFrame): Original dataframe
        mode (Optional[str], optional): Metric to be used to get head/tail. Possible values: `retweet_follower_ratio,
                                        retweets_in_relation_to_average, retweet_count`. Defaults to 'retweet_count'.
        head_perc (Optional[float], optional): Percentile threshold to consider entries in head. Defaults to `0.95`.
        tail_perc (Optional[float], optional): Percentile threshold to consider entries in tail. Defaults to `0.35`.
    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: head, tail of the dataframe
    """
    top_threshold_retweets = df[mode].quantile(head_perc)
    bot_threshold_retweets = df[mode].quantile(tail_perc)
    idx_top_retweets = df[df[mode] >= top_threshold_retweets].index
    idx_bot_retweets = df[df[mode] <= bot_threshold_retweets].index

    head = df.loc[idx_top_retweets]
    tail = df.loc[idx_bot_retweets]

    return head, tail

In [5]:
def calc_in_relation_to_average(x) -> float:
    """
    Support function to calculate the "in_relation_to_average" statistic. i.e How much a tweet
    has been retweeted given the avg of the retweets of the user. E.g user's X retweet_count
    average is 100. One of his tweets is retweeted 80 times then this tweets
    "in_relation_to_average" score would be 80.
    Args:
        x (pd.Dataframe row): Row of a dataframe. Must contain "retweet_count", "avg_retweets"
                              columns.
    Returns:
        (float): The "in_relation_to_average" statistic.
    """
    if x['avg_retweets'] > 0:
        return (100 * x['retweet_count']) / x['avg_retweets']
    else:
        return 0

In [6]:
input_path = './data/main/*.pkl'
file_names = glob.glob(input_path)

df = []
for f in file_names:
    temp = pd.read_pickle(f)
    df.append(temp)
    
df = pd.concat(df)

# create month column
df['created_at'] = pd.to_datetime(df['created_at'],utc=True)
df['month'] = df['created_at'].dt.strftime('%m')
df['month'] = df['month'].astype(int)

# ignore entries with no tokens (most of them only @handles or urls)
before = len(df)
df = df[df.tokens.str.len() > 0]
df = df.reset_index()
after = len(df)
print(f"Ignoring {before - after} tweets with no content (handles/urls)\n")

# ignore doublications 
df = df.drop_duplicates(subset='id')
print(f"Ignoring {after - len(df)} duplicate tweets \n")

countries =  ['uk', 'wales', 'scotland', 'nireland', 'es', 'gr', 'catalan', 'basque']

UnpicklingError: invalid load key, ','.

In [4]:
import requests

API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/xlm-twitter-politics-sentiment"
headers = {"Authorization": f"Bearer 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"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"inputs": "I like you. I love you",
})