In [1]:
import pandas as pd
import numpy as np
import json
import os
import multiprocessing as mp
from time import time
import socket
from timeit import default_timer as timer


import warnings
warnings.filterwarnings('ignore')

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,when,count,col,count,lit,sum
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType
from py4j.java_gateway import java_import
from functools import reduce
from pyspark.sql import DataFrame
from pyspark import SparkContext

# 1. Initialisation

In [3]:
memory = '10g'
pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [4]:
try:
    spark
except NameError:
    print('Create Local SparkSession')
    spark=SparkSession.builder.config("spark.driver.host", "localhost").appName("extract-timelines").getOrCreate()
    
# IgnoreCorruptFiles
spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

sc = spark.sparkContext

Create Local SparkSession


In [5]:
# Paths to data
path_to_data = "../data/"
path_to_timeline=os.path.join(path_to_data,'timelines/API/IDF_departments/')
#path_to_timeline=os.path.join(path_to_data,'timelines/API/IDF/')
path_to_external_data = os.path.join(path_to_data, "external-data/")
path_to_parquets = os.path.join(path_to_data,'chunks','IDF_departments')

In [6]:
from pathlib import Path
parquet_files = sorted([x for x in Path(path_to_parquets).glob("**/*.parquet")])

In [7]:
def get_tweets(index_block,n_blocks):
    df=pd.DataFrame()
    for parquet_file in parquet_files[n_blocks*index_block:n_blocks*(index_block+1)]:
        #print(str(json_file))
        timeline=pd.read_parquet(parquet_file,engine='pyarrow')
        df = pd.concat([df,timeline])
    return df

In [8]:
# Parallelization
n_cpu = mp.cpu_count()
print('USE DEFAULT # CORES')
pool  = mp.Pool(processes=n_cpu)
print("# PROCESSORS:", n_cpu, "\n")

print("Read Tweets ...")
start = time()

# COMPUTE LIST OF RESULTS
n_blocks= len(parquet_files)//n_cpu + len(parquet_files)%n_cpu
print(n_blocks)
results = [pool.apply_async(get_tweets, args=(index_block,n_blocks)) for index_block in range(n_cpu)]
print(results)
tweets  = pd.concat([results[index_block].get() for index_block in range(n_cpu)]).reset_index(drop=True)
print('done')

pool.close()    
print("DONE IN", round(time() - start), "SEC")

USE DEFAULT # CORES
# PROCESSORS: 16 

Read Tweets ...
125
[<multiprocessing.pool.ApplyResult object at 0x7fe86a149a10>, <multiprocessing.pool.ApplyResult object at 0x7fe86a149cd0>, <multiprocessing.pool.ApplyResult object at 0x7fe86a149d90>, <multiprocessing.pool.ApplyResult object at 0x7fe86a149e50>, <multiprocessing.pool.ApplyResult object at 0x7fe86a149f10>, <multiprocessing.pool.ApplyResult object at 0x7fe86a14f090>, <multiprocessing.pool.ApplyResult object at 0x7fe86a14f150>, <multiprocessing.pool.ApplyResult object at 0x7fe86a14f210>, <multiprocessing.pool.ApplyResult object at 0x7fe86a14f2d0>, <multiprocessing.pool.ApplyResult object at 0x7fe86a149fd0>, <multiprocessing.pool.ApplyResult object at 0x7fe86a14f3d0>, <multiprocessing.pool.ApplyResult object at 0x7fe86a14f490>, <multiprocessing.pool.ApplyResult object at 0x7fe86a14f550>, <multiprocessing.pool.ApplyResult object at 0x7fe86a14f610>, <multiprocessing.pool.ApplyResult object at 0x7fe86a14f6d0>, <multiprocessing.pool.App

In [9]:
tweets = tweets.drop_duplicates(subset='id_str')

In [10]:
len(tweets)

10055191

In [12]:
len(tweets['user_id'].unique())

14130

# 2. Cleaning localisation

## 2.1. To geopandas

In [13]:
from shapely.geometry import Point, shape
from shapely.geometry.polygon import Polygon

import geopandas as gpd
from geopandas.tools import sjoin

In [14]:
# https://public.opendatasoft.com/explore/dataset/contours-geographiques-des-departements-2019/export/
departments = pd.read_csv('https://public.opendatasoft.com/explore/dataset/contours-geographiques-des-departements-2019/download/?format=csv&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B', sep=';')
departments = departments.rename(columns={'Nom du département (MAJUSCULE)' : 'Department name',
                               'Nom de la région (MAJUSCULE)' : 'Region name'})
departments = departments[['Geo Shape','Department name','Region name','Code INSEE Département','Code INSEE Région']]

In [15]:
# Departments - to geopandas
departments['Geo Shape'] = departments['Geo Shape'].apply(lambda x: json.loads(x))
departments['Geo Shape'] = departments['Geo Shape'].apply(lambda x: shape(x))
departments_geoshape = gpd.GeoDataFrame(departments).set_geometry('Geo Shape')

# Departments : compute area
departments_geoshape['area'] = departments_geoshape.area

In [16]:
# Regions
regions = departments_geoshape.dissolve(by='Region name').reset_index()
regions = regions[['Region name','Geo Shape','Code INSEE Région']]

## 2.2. Get location of each tweet (when possible)

To find the location of a tweet, we compute the area and the centroid of the bounding box related to the location. We merge the tweets df with the geopandas df related to the department data -- only for the tweets where the location is small enough (so that we know it is capturing cities and not regions or countries. And we look if the centroid is in the department / region.

In [17]:
# tweets to geopandas
tweets['coordinates'] = tweets.loc[tweets['coordinates'].notna(),'coordinates'].apply(lambda x : [item for sublist in x for item in sublist])
tweets['coordinates'] = tweets.loc[tweets['coordinates'].notna(), 'coordinates'].apply(lambda x: Polygon(x))

# tweets dep
tweets_geo = tweets.loc[~tweets['coordinates'].isna()]
tweets_geo = gpd.GeoDataFrame(tweets_geo).set_geometry('coordinates')
tweets_geo['area'] = tweets_geo.area
tweets_geo['centroid'] = tweets_geo.centroid
tweets_geo = gpd.GeoDataFrame(pd.DataFrame(tweets_geo)).set_geometry('centroid')

In [18]:
print('Joining with departments - centroid...')
tweets_dep = tweets_geo[tweets_geo['area'] < departments_geoshape['area'].mean()]
tweets_dep = sjoin(tweets_dep, departments_geoshape, how='inner', op='within')

print('Joining with regions...')
tweets_reg = tweets_geo.loc[(~tweets_geo['id_str'].isin(tweets_dep['id_str'].tolist())) & 
                            (tweets_geo['city'] == 'France')]
tweets_reg = sjoin(tweets_reg, regions, how='left', op='within')

Joining with departments - centroid...
Joining with regions...


In [19]:
tweets_final = pd.concat([tweets_dep, tweets_reg])
tweets_final = pd.concat([tweets_final, tweets[~tweets['id_str'].isin(tweets_final['id_str'].tolist())]]) \
                        .sort_values(by=['user_id','created_at'])

# 3. Extrapolate user's location

In [20]:
tweets_lockdown = tweets_final[tweets['created_at'] > '02-01-2020']

In [21]:
users = tweets_lockdown.groupby('user_id')['Department name'].agg({'value_counts'}).reset_index()
users = users.sort_values(by=['user_id','value_counts'], ascending=True) \
            .drop_duplicates(subset='user_id', keep='last') \
            .rename(columns = {'Department name' : 'Department'})

In [32]:
tweets_final = pd.merge(tweets_final, users, on='user_id', how='left')
tweets_final = pd.DataFrame(tweets_final, columns=['id_str','created_at','full_text','lang','user_id','user_name','city',
                                                   'Department','Code INSEE Département'])

In [24]:
path_to_parquets = os.path.join(path_to_data,'chunks','test')

In [38]:
n_chunks = mp.cpu_count()
n_blocks = len(tweets_final)//n_chunks + len(tweets_final)%n_chunks

for i_chunk in range (n_chunks):
    print('Write chunk ' + str(i_chunk))
    df = tweets_final[n_blocks*i_chunk:n_blocks*(i_chunk+1)]
    df.to_parquet(os.path.join(path_to_parquets, str(i_chunk)+ '.parquet'), engine='pyarrow')

Write chunk 0
Write chunk 1
Write chunk 2
Write chunk 3
Write chunk 4
Write chunk 5
Write chunk 6
Write chunk 7
Write chunk 8
Write chunk 9
