In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from db import *
from time_series import *
import sys
from url_parser import *
sys.path.append('/Volumes/GoogleDrive/Mon Drive/Python-helpers')
from save_obj import *

# Urls

We use author_id, natural_key, and body (url is the url of the tweet so we ignore it)

In [2]:
%%time
if not os.path.isfile("data/df_url.csv"):
    data_manager = DataManager()
    data_manager.create_url_df()
    
df_url = pd.read_csv("data/df_url.csv", encoding='utf-8', engine='python')
df_url.dropna(inplace=True)
df_url = df_url.drop(columns=["natural_key"])
df_url["author_id"] = df_url.author_id.astype(int)

CPU times: user 1min 8s, sys: 9.53 s, total: 1min 17s
Wall time: 1min 20s


In [3]:
df_url.head()

Unnamed: 0,author_id,body
0,374784294,"""Mon gilet jaune, je l'ai brûlé"" : engagés de ..."
1,1053327852425433088,"@EMierti @Rine02_22 C'est impressionnant, tous..."
2,1111305741003079680,Pour ceux qui ne croient pas aux mouvements de...
3,1417988923,"Gilets Jaunes, une répression d'Etat | Documen..."
4,1926002202,"Mdr, quelqu'un peut leur dire que c'est hasbee..."


The first thing to do is to parse the body to get the urls

In [5]:
%%time
# parse the body to get the urls
df_url["urls"] = df_url["body"].map(get_urls_from_text)
# keep only the tweet that contains at least one url
df_url = df_url[df_url['urls'].map(len) > 0]

CPU times: user 3min 51s, sys: 699 ms, total: 3min 52s
Wall time: 3min 52s


In [6]:
# Load the dict mapping urls to expanded urls
url_to_expanded = load_obj("data/", "url_to_expanded")

In [7]:
def urls_expanded(urls):
    res = []
    for x in urls:
        res.append(url_to_expanded[x] if x in url_to_expanded else x)
    return res


In [8]:
%%time
df_url["urls"] = df_url["urls"].map(urls_expanded)

CPU times: user 11.5 s, sys: 94.7 ms, total: 11.5 s
Wall time: 11.5 s


In [9]:
df_url.head()

Unnamed: 0,author_id,body,urls
0,374784294,"""Mon gilet jaune, je l'ai brûlé"" : engagés de ...",[https://www.francetvinfo.fr/economie/transpor...
2,1111305741003079680,Pour ceux qui ne croient pas aux mouvements de...,[https://youtu.be/XEy-xaGZZb4]
3,1417988923,"Gilets Jaunes, une répression d'Etat | Documen...",[https://youtu.be/3MjuoDpKLfI]
4,1926002202,"Mdr, quelqu'un peut leur dire que c'est hasbee...",[https://twitter.com/myMetropolitain/status/11...
5,2289833964,"Tarbes : les lycéens dans la rue, solidaires d...",[https://youtu.be/Ibnwyr86bPU]


In [16]:
test = [y for x in df_url["urls"] for y in x]

In [17]:
len(test)

5562947

In [18]:
len(set(test))

2322787

In [19]:
test2 = [x for x in test if "https://t.co/" in x]

In [20]:
len(test2)

675325

In [21]:
len(set(test2))

459556

In [22]:
# Some urls cannot be expanded, this is because we cannot retrieve all tweets, 
# for example in case the author was suspended, or if the user tweeted more than 3200 tweets. 

In [None]:
# INPUT
# list of urls contained in each tweet
# OUTPUT
# each pair of users that tweet the same url, along with the count of same urls shared.

# 1) For each single url, get the users sharing it

In [28]:
url_t = [x for x in test if "https://twitter.com/" in x]

In [30]:
len(set(test))

2322787

In [31]:
len(set(url_t))

1129259

In [32]:
1129259/2322787

0.4861655416531951

In [25]:
urls_count = pd.Series(test).value_counts()

In [33]:
urls_count[:22]

https://twitter.com/CNEWS/status/1063743985322270720              9424
https://twitter.com/VictorLefranc/status/1068875092409630720      6332
https://twitter.com/BFMTV/status/1067914481798397957              5043
https://twitter.com/InfosFrancaises/status/1071864117407948806    3842
https://twitter.com/GuillaumeAuda/status/1068929173513822212      3765
@Ledauphine.com                                                   3371
https://twitter.com/BFMTV/status/1068469827503906816              3083
https://twitter.com/LarryLeChanceux/status/1071087749996994560    2937
https://twitter.com/charlie_mouton/status/1066505531043516416     2921
http://ladepeche.fr                                               2453
https://twitter.com/leJDD/status/1071543445171257344              2224
https://amnistiegj.fr/                                            2060
https://twitter.com/thomas_guenole/status/1068867651747889152     1987
https://twitter.com/Brevesdepresse/status/1071545349133819905     1985
https:

In [24]:
%%time
# For all tweets that contains more than one url, flatten the list of urls to have 
# only one url per line. We duplicate the corresponding tweet id for all corresponding
# lines
# First we take only the rows where there is only one url
df_1 = df_url[df_url['urls'].map(len) == 1][['author_id', 'urls']].copy()
df_1['urls'] = df_1['urls'].map(lambda x: x[0])
df_1.rename(columns={'urls':'url'}, inplace=True)

CPU times: user 8.29 s, sys: 5.43 s, total: 13.7 s
Wall time: 17 s


In [25]:
%%time
# Now we take the rest
df_2 = df_url[df_url['urls'].map(len) > 1][['author_id', 'urls']].copy()
df_2 = (df_2[['author_id', 'urls']].set_index(['author_id'])['urls'] 
                                  .apply(pd.Series)
                                  .stack() 
                                  .reset_index(level=1, drop=True)
                                  .reset_index()
                                  .rename(columns={0:'url'}))

In [27]:
df = pd.concat([df_1, df_2])

In [34]:
%%time
# For each url, compute the list of authors who used it
authors_for_url = (df.groupby('url')['author_id']
                     .apply(list)
                     .reset_index(name='authors_id'))

CPU times: user 4min 33s, sys: 4.57 s, total: 4min 38s
Wall time: 4min 40s


In [35]:
authors_for_url['authors_id'] = authors_for_url['authors_id'].apply(lambda x: list(set(x)))

In [36]:
%%time
authors_for_url['authors_count'] = authors_for_url['authors_id'].str.len()
authors_for_url.sort_values(by='authors_count', ascending=False, inplace=True)#.drop(columns='len')

CPU times: user 2.81 s, sys: 196 ms, total: 3 s
Wall time: 1.96 s


In [38]:
authors_for_url.head()

Unnamed: 0,url,authors_id,authors_count
1545675,https://twitter.com/VictorLefranc/status/10688...,"[912043131901706240, 951373816474763264, 10688...",5510
994288,https://twitter.com/CNEWS/status/1063743985322...,"[785559760355622912, 714895620985716736, 75484...",4659
1162654,https://twitter.com/GuillaumeAuda/status/10689...,"[707306956705439744, 825689080616869888, 85906...",3462
946746,https://twitter.com/BFMTV/status/1067914481798...,"[709361509969879040, 1027564325412716544, 7161...",1701
256957,https://amnistiegj.fr/,"[702751630601625600, 797840397414318080, 10877...",1593


In [52]:
test = df.groupby('author_id').size()

In [64]:
(test>20).sum()

36649

In [40]:
%%time
# For each author, compute the list of urls used
urls_for_author = (df.groupby('author_id')['url']
                     .apply(set)
                     .reset_index(name='urls_set'))
# Add a column 'urls_set_count' representing the size of each url set
urls_for_author['urls_set_count'] = urls_for_author['urls_set'].str.len()
# Sort by the size of hashtag set
urls_for_author.sort_values(by='urls_set_count', 
                                ascending=False, 
                                inplace=True)
# Compute the hash of each hashtag set to allow groupby
urls_for_author['urls_set_hash'] = urls_for_author.urls_set.apply(
                                                                lambda x: hash(str(x)))

CPU times: user 50.7 s, sys: 1.92 s, total: 52.6 s
Wall time: 53.4 s


In [41]:
urls_for_author.head()

Unnamed: 0,author_id,urls_set,urls_set_count,urls_set_hash
33540,133663801,{https://twitter.com/BFMTV/status/109627817633...,13278,-5172840692417932696
329886,996497906713726720,{http://lesinfos.online/2018/11/24/gilets-jaun...,10557,-7385318961965236704
9797,38395124,{https://twitter.com/franceinfo/status/1089822...,7437,-5352477132093788639
9734,38142665,"{https://l.leparisien.fr/psW-6, https://l.lepa...",6895,-588068919427934230
85261,468507589,"{https://t.co/tthVjk0Jvq, https://t.co/COPqsS8...",6157,-5134498860848485581


In [51]:
len(urls_for_author.author_id.unique())

387883

In [42]:
# For each set of urls used, compute the list of authors who used it
authors_set = (urls_for_author.groupby('urls_set_hash')['author_id']
                                  .apply(lambda x: set(x))
                                  .reset_index(name='author_id_set'))
# Add a column 'author_id_set_count' representing the size of each author set
authors_set['author_id_set_count'] = authors_set['author_id_set'].str.len()
# Sort by the size of author set
authors_set.sort_values(by='author_id_set_count', ascending=False, inplace=True)
# Filter all urls used by one author only
authors_set = authors_set[authors_set.author_id_set_count>1]

In [43]:
# Merge to retrieve the set of urls corresponding to the hash. 
hash_to_urls = urls_for_author.drop_duplicates(subset=["urls_set_hash"])
hash_to_urls = hash_to_urls[["urls_set_hash", 
                             "urls_set", 
                             "urls_set_count"]]
authors_set = authors_set.merge(hash_to_urls, on="urls_set_hash", how="left")

In [86]:
test = authors_set[(authors_set["urls_set_count"] > 1) & 
            (authors_set["author_id_set_count"] > 4)]

In [87]:
test

Unnamed: 0,urls_set_hash,author_id_set,author_id_set_count,urls_set,urls_set_count
97,-4133695708687242444,"{931565897289957120, 779396027229020160, 71198...",68,{https://twitter.com/VictorLefranc/status/1068...,2
179,-5372721623884452434,"{835277109849370624, 707259961357762560, 82568...",43,{https://twitter.com/VictorLefranc/status/1068...,2
293,7853993123539181914,"{1077198291429924864, 1019478813175156736, 101...",30,"{http://ladepeche.fr, https://www.ladepeche.fr...",2
298,6477017253810424481,"{938499781109919744, 870752188095713280, 87907...",29,{https://twitter.com/GuillaumeAuda/status/1068...,2
454,-7707381570413551546,"{934146292275732480, 834808210930429952, 90785...",21,"{http://youtu.be/9i3alzuVFXo?a, http://youtu.b...",2
456,-6961154640218309614,"{841748048191578112, 2303806208, 1086656088131...",21,{https://www.mesopinions.com/petition/animaux/...,2
479,-4853820906390783791,"{923251084558307200, 781860180631969920, 31153...",20,{https://twitter.com/CNEWS/status/106374398532...,2
623,-8207830610367470059,"{777923222738177920, 925498087849816064, 75803...",16,{https://twitter.com/VictorLefranc/status/1068...,2
693,-5399691998635364029,"{242890560, 754810499364184064, 280118722, 101...",15,{https://twitter.com/VictorLefranc/status/1068...,2
713,28915667345456289,"{935723103929368320, 2992508992, 8018501526087...",14,{https://twitter.com/CNEWS/status/106374398532...,2


In [88]:
len(set([y for x in test["author_id_set"] for y in x]))

628

In [79]:
candidates = list(test["author_id_set"].apply(list))
save_obj(candidates, "candidates/", "candidates_url")