In [1]:
import pandas as pd
import json
import sys
import os
from url_parser import *
sys.path.append('/Volumes/GoogleDrive/Mon Drive/Python-helpers')
from save_obj import *
import subprocess

Most of the urls shared on twitter have the format 't.co/'. The problem is that two urls with this format may lead to the same url at the end. Therefore, this format does not allow us to spot users that share the same url.  

In order to retrieve the end urls, we will need to fetch the tweet_id associated with each tweet that has a t.co/ url. 

Then, we will use twarc to fetch each tweet corresponding to the tweet_id list we have. By fetching the tweet, we obtain a json file which contain the expanded url. 

In [2]:
%%time
if not os.path.isfile("data/df_url.csv"):
    data_manager = DataManager()
    data_manager.create_url_df()
    
df = pd.read_csv("data/df_url.csv", encoding='utf-8', engine='python')
df.dropna(inplace=True)
df = df.drop(columns=["author_id"])
df["natural_key"] = df.natural_key.astype(int)

CPU times: user 1min 19s, sys: 12.3 s, total: 1min 32s
Wall time: 1min 48s


In [3]:
# parse the body to get the urls
df["urls"] = df["body"].map(get_urls_from_text)

In [11]:
df.shape

(5154253, 3)

In [12]:
df.head()

Unnamed: 0,natural_key,body,urls
0,1104750238932365313,"""Mon gilet jaune, je l'ai brûlé"" : engagés de ...",[https://t.co/ixm2EDsStN]
2,1128316229297557504,Pour ceux qui ne croient pas aux mouvements de...,[https://t.co/oicTarPgUJ]
3,1132242959183880192,"Gilets Jaunes, une répression d'Etat | Documen...",[https://t.co/neGn8yleNq]
4,1150334645160095744,"Mdr, quelqu'un peut leur dire que c'est hasbee...",[https://t.co/tG2KyoGD5j]
5,1170185247876636678,"Tarbes : les lycéens dans la rue, solidaires d...",[https://t.co/1mjkPwlici]


In [8]:
# keep only the tweet that contains at least one url
df = df[df['urls'].map(len) > 0]

In [13]:
df.shape

(5154253, 3)

In [14]:
# We want one url on each row, with the corresponding tweet id. 
# First we take only the rows where there is only one url
df_1 = df[df['urls'].map(len) == 1][['natural_key', 'urls']].copy()
df_1['urls'] = df_1['urls'].map(lambda x: x[0])
df_1.rename(columns={'urls':'url'}, inplace=True)

In [15]:
# Now we take the rest
df_2 = df[df['urls'].map(len) > 1][['natural_key', 'urls']].copy()

In [16]:
assert(df_1.shape[0]+df_2.shape[0]==df.shape[0])

In [17]:
%%time
# For all tweets that contains more than one url, flatten the list of urls to have 
# only one url per line. We duplicate the corresponding tweet id for all corresponding
# lines
df_2 = (df_2.set_index(['natural_key'])['urls'] # take column 'urls' as Series 
                                                # with index 'natural_key'
            .apply(pd.Series) # transform to dataframe, for each row, take the 
                              # list in column 'urls' and put it on different
                              # columns ('0', '1', '2' if 3 elements in list)
                              # Fill with Nan in empty cells
            .stack() # transform to Series with kind of hierarchical index. 
                     # the 1st is 'natural_key', the second (subgroup) is 
                     # the column index '0', '1', '2'...
            .reset_index(level=1, drop=True) # drop subgroup index, duplicate
                                             # 1st index 'natural_key' for each 
                                             # row in this group
            .reset_index() # recreate dataframe with new index 
                           # and with column 'natural_key' and '0'
            .rename(columns={0:'url'})) # rename column '0' to 'url'

CPU times: user 1min 21s, sys: 757 ms, total: 1min 21s
Wall time: 1min 19s


In [18]:
df = pd.concat([df_1, df_2])

In [19]:
df.head()

Unnamed: 0,natural_key,url
0,1104750238932365313,https://t.co/ixm2EDsStN
2,1128316229297557504,https://t.co/oicTarPgUJ
3,1132242959183880192,https://t.co/neGn8yleNq
4,1150334645160095744,https://t.co/tG2KyoGD5j
5,1170185247876636678,https://t.co/1mjkPwlici


In [20]:
%%time
# Now we only need one tweet id per url
df_ids = (df.groupby('url')['natural_key']
                                .apply(lambda x: list(x)[0])
                                .reset_index(name='tweet_id'))

CPU times: user 6min 59s, sys: 2.28 s, total: 7min 1s
Wall time: 7min 2s


In [21]:
# Get the list of unique tweet id
ids = list(df_ids.tweet_id.unique())
len(ids)

3646789

In [11]:
# Slice the ids in different files to keep intermediate results in case of crash. 
for i in range(8):
    with open("ids" + str(i) + ".txt", "w") as f:
        if i == 7:
            for x in ids[7*500000:]:
                f.write(x + "\n")
        else:
            for x in ids[i*500000:(i+1)*500000]:
                f.write(x + "\n")

In [None]:
# We then use twarc to hydrate the tweet id and get the corresponding tweet with 
# all informations, including the expanded urls we are interested in.

# twarc hydrate ids0.txt > tweets0.jsonl
# same for 0 to 7

In [88]:
# now we read the json files. Each line represents a tweet in json format, which
# can be converted in nested python dict. We are looking for url elements which appear
# along with expanded_url (or simply 'expanded') element. 
# We build a dict mapping each url to the corresponding expanded_url. 
url_to_expanded = dict()

def parse_url_json(e):
    if type(e) is dict:
        if "url" in e and "expanded_url" in e:
            url_to_expanded[e["url"]] = e["expanded_url"]
        elif "url" in e and "expanded" in e:
            url_to_expanded[e["url"]] = e["expanded"]
        for x in e:
            #print(str(x) + "  ("+ str(type(e[x])) +")")
            parse_url_json(e[x]) 
    elif type(e) is list:
        for x in e:
            if type(x) is dict:
                parse_url_json(x)

In [90]:
len(url_to_expanded)

3623356

In [89]:
%%time
j = 0
for idx in list(range(8)):
    print(idx)
    for line in open('tweets' + str(idx) + '.jsonl', 'r'):
        if j%100000 == 0:
            print(j)
        jfile = json.loads(line)
        parse_url_json(jfile)
        j+=1

0
0
100000
200000
300000
400000
1
500000
600000
700000
800000
2
900000
1000000
1100000
1200000
1300000
3
1400000
1500000
1600000
1700000
4
1800000
1900000
2000000
2100000
2200000
5
2300000
2400000
2500000
2600000
6
2700000
2800000
2900000
3000000
3100000
7
3200000
3300000
3400000
CPU times: user 10min 2s, sys: 1min 7s, total: 11min 10s
Wall time: 13min 49s


In [91]:
# DONT USE THIS : the recursive function is better
if False:
    #url_to_expanded = dict()
    j = 0
    for idx in list(range(8)):
        print(idx)
        for line in open('tweets' + str(idx) + '.jsonl', 'r'):
            if j%100000 == 0:
                print(j)
            jfile = json.loads(line)

            if "entities" in jfile and "urls" in jfile["entities"]:
                urls = jfile["entities"]["urls"]
                if len(urls) > 0:
                    for x in urls:
                        url_to_expanded[x["url"]] = x["expanded_url"]

            if ("entities" in jfile and 
                "url" in jfile["entities"] and 
                "urls" in jfile["entities"]["url"]):
                urls = jfile["entities"]["url"]["urls"]
                if len(urls) > 0:
                    for x in urls:
                        url_to_expanded[x["url"]] = x["expanded_url"]

            if "entities" in jfile and "media" in jfile["entities"]:
                media = jfile["entities"]["media"]
                if len(media) > 0:
                    for x in media:
                        url_to_expanded[x["url"]] = x["expanded_url"]

            if "extended_entities" in jfile and "media" in jfile["extended_entities"]:
                media = jfile["extended_entities"]["media"]
                if len(media) > 0:
                    for x in media:
                        url_to_expanded[x["url"]] = x["expanded_url"]

            if ("user" in jfile and 
                "entities" in jfile["user"] and 
                "urls" in jfile["user"]["entities"]):
                urls = jfile["user"]["entities"]["urls"]
                if len(urls) > 0:
                    for x in urls:
                        url_to_expanded[x["url"]] = x["expanded_url"]

            if ("user" in jfile and 
                "entities" in jfile["user"] and 
                "description" in jfile["user"]["entities"] and 
                "urls" in jfile["user"]["entities"]["description"]):
                urls = jfile["user"]["entities"]["description"]["urls"]
                if len(urls) > 0:
                    for x in urls:
                        url_to_expanded[x["url"]] = x["expanded_url"]

            if ("user" in jfile and 
                "entities" in jfile["user"] and 
                "url" in jfile["user"]["entities"] and 
                "urls" in jfile["user"]["entities"]["url"]):
                urls = jfile["user"]["entities"]["url"]["urls"]
                if len(urls) > 0:
                    for x in urls:
                        url_to_expanded[x["url"]] = x["expanded_url"]

            if "quoted_status_permalink" in jfile:
                elem = jfile["quoted_status_permalink"]
                if elem is not None: 
                    url_to_expanded[elem["url"]] = elem["expanded"]

            if ("quoted_status" in jfile and 
                "entities" in jfile["quoted_status"] and 
                "urls" in jfile["quoted_status"]["entities"]):
                urls = jfile["quoted_status"]["entities"]["urls"]
                if len(urls) > 0:
                    for x in urls:
                        url_to_expanded[x["url"]] = x["expanded_url"]
            if ("quoted_status" in jfile and 
                "entities" in jfile["quoted_status"] and 
                "media" in jfile["quoted_status"]["entities"]):
                urls = jfile["quoted_status"]["entities"]["media"]
                if len(urls) > 0:
                    for x in urls:
                        url_to_expanded[x["url"]] = x["expanded_url"]
            j+=1

In [92]:
url_to_expanded['https://t.co/TH66pjd3ts']

'http://sur.laprovence.com/cUP-f'

In [93]:
url_to_expanded["https://t.co/V7tB2dqoZJ"]

'https://twitter.com/snae_fr/status/1089606516540391424/video/1'

In [95]:
save_obj(url_to_expanded, "data/", "url_to_expanded")
#url_to_expanded = load_obj("", "url_to_expanded")