In [2]:
import pandas as pd
import numpy as np
import json
import os
import multiprocessing as mp
from time import time
import socket
from timeit import default_timer as timer


import warnings
warnings.filterwarnings('ignore')

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,when,count,col,count,lit,sum
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType
from py4j.java_gateway import java_import
from functools import reduce
from pyspark.sql import DataFrame
from pyspark import SparkContext

# 1. Initialisation

In [4]:
# Driver memory is passed through PYSPARK_SUBMIT_ARGS, which PySpark reads when
# it launches its JVM -- so this cell has to run before the SparkSession exists.
memory = '10g'
pyspark_submit_args = ' --driver-memory {} pyspark-shell'.format(memory)
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [5]:
# Reuse `spark` when the name is already defined in this namespace;
# otherwise build a session bound to localhost.
try:
    spark
except NameError:
    print('Create Local SparkSession')
    spark = (
        SparkSession.builder
        .config("spark.driver.host", "localhost")
        .appName("extract-timelines")
        .getOrCreate()
    )

# Session-level SQL options, applied in this order:
#   1. ignore corrupt input files while reading
#   2. turn on the Arrow execution option
for conf_key in ("spark.sql.files.ignoreCorruptFiles",
                 "spark.sql.execution.arrow.enabled"):
    spark.conf.set(conf_key, "true")

# Handle on the underlying SparkContext.
sc = spark.sparkContext

Create Local SparkSession


In [6]:
# Root of the data tree, relative to the notebook's working directory.
path_to_data = "../data/"

# Input timelines and external data both live under the data root.
# (Alternative timelines folder that was used previously: 'timelines/API/IDF/')
path_to_timeline, path_to_external_data = (
    os.path.join(path_to_data, subfolder)
    for subfolder in ('timelines/API/IDF_departments/', "external-data/")
)

In [7]:
print('List files to be processed...')

# Go through the JVM gateway to list the timeline directory with Hadoop's FileSystem API.
hadoop_fs = spark._jvm.org.apache.hadoop.fs
fs = hadoop_fs.FileSystem.get(spark._jsc.hadoopConfiguration())
list_status = fs.listStatus(hadoop_fs.Path(path_to_timeline))

# Keep entries whose path mentions 'json', drop the 'hdfs://dumbo' prefix,
# and sort so that the seeded shuffle below always starts from the same order.
paths = sorted(
    status.getPath().toString().replace('hdfs://dumbo', '')
    for status in list_status
    if 'json' in status.getPath().toString()
)

# Fixed seed -> the same shuffled order (and therefore the same chunks) on every run.
np.random.seed(0)
paths = np.random.permutation(paths)

print('# Files:', len(paths))

List files to be processed...
# Files: 425


In [8]:
# Number of groups the shuffled file list is cut into; each group is
# extracted and written separately.
n_chunks = 10
paths_chunks = np.array_split(paths, indices_or_sections=n_chunks)
print('# Chunks:', n_chunks)

# Chunks: 10


# 2. Extract timelines

In [9]:
def extract_chunk(i_chunk, paths_chunk, n_partitions=1000,
                  min_created_at='2019-12-01', output_subdir='IDF_departments'):
    """Read one chunk of timeline JSON files, clean it, and write it as parquet.

    Relies on the notebook-level names ``spark`` (the SparkSession) and
    ``path_to_data`` (root of the data tree).

    Parameters
    ----------
    i_chunk : int
        Index of the chunk; used as the name of the output folder.
    paths_chunk : iterable of str
        Paths of the JSON files that make up this chunk.
    n_partitions : int, optional
        Number of partitions the raw data is repartitioned into after reading.
    min_created_at : str, optional
        Only rows with ``created_at`` strictly later than this value are kept.
    output_subdir : str, optional
        Folder under ``<path_to_data>/chunks/`` that receives the chunk.

    Returns
    -------
    pyspark.sql.DataFrame
        The DataFrame that was written. Note that this is the transformation
        pipeline itself, not a re-read of the parquet output.
    """
    kept_columns = [
        'id_str',
        'created_at',
        'full_text',
        'lang',
        'user_id',
        'user_name',
        'city',
        'coordinates',
    ]

    df = spark.read.json(list(paths_chunk))
    df = df.repartition(n_partitions)

    # Selecting by name already yields exactly these column names, so no
    # follow-up renaming step is needed.
    df = df.select(*kept_columns)

    # NOTE(review): this pattern presumably targets Twitter-style timestamps
    # ("Wed Oct 10 20:19:24 +0000 2018") -- confirm. Datetime pattern handling
    # changed between Spark 2.x and 3.x, so re-check it when upgrading Spark.
    df = df.withColumn('created_at', to_timestamp('created_at', "EEE MMM dd HH:mm:ss ZZZZZ yyyy"))

    # Rows whose timestamp could not be parsed are null and therefore do not
    # pass this comparison either.
    df = df.filter(col('created_at') > min_created_at)

    # Exact duplicates across all kept columns are collapsed into one row.
    df = df.dropDuplicates()

    # "overwrite" replaces whatever a previous run left in this chunk's folder.
    output_path = os.path.join(path_to_data, 'chunks', output_subdir, str(i_chunk))
    df.write.mode("overwrite").parquet(output_path)

    return df

In [10]:
# Process the chunks one after another and report the wall-clock time of each.
for i_chunk in range(len(paths_chunks)):
    paths_chunk = paths_chunks[i_chunk]
    print('EXTRACT CHUNK', i_chunk)

    start = timer()
    extract_chunk(i_chunk, paths_chunk)
    end = timer()

    elapsed_sec = round(end - start)
    print('TIME:', elapsed_sec, 'SEC')
    

EXTRACT CHUNK 0
TIME: 231 SEC
EXTRACT CHUNK 1
TIME: 146 SEC
EXTRACT CHUNK 3
TIME: 139 SEC
EXTRACT CHUNK 4
TIME: 141 SEC
EXTRACT CHUNK 5
TIME: 142 SEC
EXTRACT CHUNK 6
TIME: 143 SEC
EXTRACT CHUNK 7
TIME: 145 SEC
EXTRACT CHUNK 8
TIME: 146 SEC
EXTRACT CHUNK 9
TIME: 140 SEC


In [10]:
# Disabled cell, kept for reference: reads every parquet file written under
# chunks/IDF_departments/ back into a single `timelines` DataFrame and times the read.
# print('IMPORT')
# start=timer()
# timelines=spark.read.parquet(os.path.join(path_to_data,'chunks','IDF_departments','*/*.parquet'))
# end=timer()
# print('TIME ', round(end-start), 'SEC')