# Tweets Extraction

In [1]:
import twitter
from tqdm import tqdm
import numpy as np
import pandas as pd

## API settup

In [2]:
api = twitter.Api(
    consumer_key = '',
    consumer_secret = '',
    access_token_key = '',
    access_token_secret = '',
    sleep_on_rate_limit = True
)

## Original screen names

In [3]:
# https://www.youtube.com/watch?v=b9FHAuO65aw&t=2054s

screen_names = [
    'BarneriasPierre',    # real of the movie
    'holdup_ledoc',       # twitter account of the movie
    'silvano_trotta',     # anti-vax, Lune est creuse
    'ChroniLyme',         # Christian Perronne, maladie de lyme
    'xazalbert',          # Xavier Azalbert, home d'affaire pro-cloroquine, a un média
    'CorinneReverbel',    # Elle a relayé de nombreux autres dans la liste
    'ViolaineGuerin',     # pro-hydro 
    'MartineWonner',      # députée, masque ne sert à rien
    'TrottaDr',           # pro-raoult, médecine alternative
    'RaderSerge',         # anti-vax
    'Stuckelberger',      # spécialiste du viellissement, pas de deuxième vague
    'EmaKrusi',           # a interviewé nombreux d'entre eux
    'laancelot',          # Michel Rozensweig, écrivain pour nexus, magasine de désinformation
    'MagazineNexus',      # gros magzine de désinformation, beaucoup on écrit dedans
    'PINCON_CHARL0T'      # sociologue, extrême gauche, extermination des pauvres via réchauffement climatique
]

names = [
    'Luc montagnier',            # no twitter account, nobel prize, says COVID is handmade
    'jean-bernard fourtillan',   # no twitter account, anti-vax, Dieu lui a révélé 
    'alexandra henrion-caude',   # Twitter account in english, amourons
    'Lauran Toubiana',           # no twitter account,Astrophysicien, mais considéré comme épydémiologiste, pas de 2nd vague
    'Edouard Broussalian',       # Twitter account in english, deseases don't exist
    'Valérie Bugault',           # no Twitter account, conspirationiste
    'Michael Levitt',            # Twitter account in english
    'Miguel Barthelery',         # no Twitter account, les virus sont bénéfiques
    'Olivier Vuillemin'          # contre 5G
]

### Fetches the people that are followed by the original screen names

In [20]:
def get_common_friends(api, screen_names):
    """
    returns a dictionary of all friends (people followed by) of all given screen_names,
    and count their occurences
    
    Parameters
    ----------
        api : python.twitter.Api object
            
        screen_names : list of String
            the screen names

    Returns
    -------
        d : dictionary, keys are the ids of the friends and values are the number of
            time they occurred
    """
    d = {}

    for screen_name in tqdm(screen_names):
        for id_ in api.GetFriendIDs(screen_name = screen_name):
            if id_ not in d.keys():
                d[id_] = 1
            else:
                d[id_] = d[id_] + 1
            
    return d

def screen_names_to_ids(api, screen_names):
    """
    Transforms 
    """
    res = []
    for screen_name in screen_names:
        id_ = api.GetUser(screen_name = screen_name).id
        res.append(id_)
    return res

def get_tweets(api, user_ids):
    timelines = []
    for id_ in tqdm(user_ids):
        # just to pass the first while
        day = 23
        month = 'Nov'
        max_id = None
        tls = []
        # as long as we are between now and when the movie came out, we keep getting older tweets
        while(day > 9 and month == 'Nov'):
            timeline = api.GetUserTimeline(user_id = id_, count = 200, max_id = max_id)
            # We look at the oldest and update the values
            last = timeline[-1]
            date = last.created_at.split(' ')
            day, month = int(date[2]), date[1]
            max_id = last.id
            # We add the timeline, omitting the last to avoid duplicates
            tls.append([x.id for x in timeline[:-1]])

        timelines.append(np.concatenate(tls))
    
    return np.concatenate(timelines)

def get_mentions_and_hashtags(api, statuses_ids):
    mentions_dict = {}
    hashtags_dict = {}

    for status in statuses:
        mentions = status.user_mentions
        hashtags = status.hashtags

        for m in mentions:
            name = m.screen_name
            if name in mentions_dict.keys():
                mentions_dict[name] = mentions_dict[name] + 1
            else:
                mentions_dict[name] = 1

        for h in hashtags:
            txt = h.text
            if txt in hashtags_dict.keys():
                hashtags_dict[txt] = hashtags_dict[txt] + 1
            else:
                hashtags_dict[txt] = 1
        
    return mentions_dict, hashtags_dict

In [5]:
ids = get_common_friends(api, screen_names)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  6.18it/s]


### Take only those that are followed by at least 5 of the original screen names

In [6]:
## take only followed by 5 or more
res = []
for (id_, n) in ids.items():
    if n >= 5:
        res.append(id_)

### Filter out non-french

In [7]:
## take only those in french
french_user_ids = []
for id_ in res:
    usr = api.GetUser(user_id = id_)
    if usr.status.lang == 'fr' or usr.lang == 'fr':
        french_user_ids.append(id_)

### Add previous screen names ids and remove duplicates

In [9]:
## Add previous screen names ids and remove duplicates
friends = screen_names_to_ids(api, screen_names)
french_user_ids = np.concatenate((friends, french_user_ids))

In [10]:
french_user_ids = list(set(french_user_ids))

### Get a first pass of posts with the screen names

In [14]:
tweet_ids = get_tweets(api, french_user_ids)

100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:32<00:00,  1.12it/s]


In [17]:
len(tweet_ids)

17616

### Find hashtags and mentions among the first pass

In [19]:
statuses = api.GetStatuses(tweet_ids)

In [23]:
mentions_dict, hashtags_dict = get_mentions_and_hashtags(api, statuses)

In [24]:
mentions = []
for (m, n) in mentions_dict.items():
    if n > 50:
        mentions.append(m)
len(mentions)

42

In [25]:
mentions

['raoult_didier',
 'ViolaineGuerin',
 'CorinneReverbel',
 'momotchiii',
 'EChabriere',
 'biobiobiobioc',
 'ArtLeroux',
 'Laissonslespre1',
 'medicalfollower',
 'Stalec_',
 'france_soir',
 'DIVIZIO1',
 'AssoCovid',
 'MartineWonner',
 'olivierveran',
 'DocteurGonzo4',
 'holdup_ledoc',
 'IHU_Marseille',
 'CNEWS',
 'aragon_jb',
 'JeanCASTEX',
 'silvano_trotta',
 'Le___Doc',
 'ivanrioufol',
 'lemondefr',
 'MarianneleMag',
 'libe',
 'SudRadio',
 'VirusWar',
 'NicoleDelepine',
 'f_philippot',
 'OSTERElizabeth1',
 'andrebercoff',
 'ArianeWalter',
 'AnonymeCitoyen',
 'franceinfoplus',
 'Poulin2012',
 'JeanYvesCAPO',
 'QuackFighter',
 'wargonm',
 'noemieschulz',
 'LacombeKarine1']

In [26]:
hashtags = []
for (h, n) in hashtags_dict.items():
    if n > 20:
        hashtags.append(h)
len(hashtags)

36

### getting ids of mentions and removing duplicates

In [27]:
mentions_ids = screen_names_to_ids(api, mentions)

In [29]:
french_user_ids = np.concatenate((french_user_ids, mentions_ids))
french_user_ids = list(set(french_user_ids))

In [30]:
len(french_user_ids)

56

In [42]:
import pickle

with open('user_ids', 'wb') as f:
    pickle.dump(french_user_ids, f)

## Getting more general tweets from the HoldUp hashtag

In [203]:
day = 23
month = 'Nov'
max_id = None
tls = []
while(day > 9 and month == 'Nov'):
    tweets = api.GetSearch(term = 'holdup', count = 100, lang = 'fr', max_id = max_id)
    # We look at the oldest and update the values
    if len(last) > 0:
        last = tweets[-1]
        date = last.created_at.split(' ')
        day, month = int(date[2]), date[1]
        max_id = last.id
        # We add the timeline, omitting the last to avoid duplicates
        tls.append([x.id for x in tweets[:-1]])
    else:
        break

IndexError: list index out of range

In [None]:
tls