In [1]:
import pandas as pd
import numpy as np
import json
import os
import multiprocessing as mp
from time import time
import socket
from pathlib import Path
import re
import unicodedata
import sys

import warnings
warnings.filterwarnings('ignore')

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql.types import *
from py4j.java_gateway import java_import
from functools import reduce
from pyspark.sql import DataFrame
from pyspark import SparkContext

# 0. Initialisation

In [3]:
# Driver memory requested for the local Spark JVM.
memory = '10g'

# Submit arguments are exported through the environment; this has to happen
# before the SparkSession below is created to have any effect.
pyspark_submit_args = ' --driver-memory {} pyspark-shell'.format(memory)
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [4]:
# Reuse a session that already exists in the kernel (e.g. one injected by the
# environment); otherwise build a local one.
try:
    spark
except NameError:
    print('Create Local SparkSession')
    spark = (
        SparkSession.builder
        .config("spark.driver.host", "localhost")
        .appName("extract-timelines")
        .getOrCreate()
    )

# Skip unreadable input files instead of failing the whole job.
spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
# NOTE(review): this key is deprecated in Spark 3.x in favour of
# "spark.sql.execution.arrow.pyspark.enabled" - confirm the Spark version in use.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

sc = spark.sparkContext

Create Local SparkSession


In [5]:
# Root of the data directory, relative to the notebook location.
path_to_data = "../data/"

# Raw timelines; the commented line is the alternative (non per-department) dataset.
path_to_timeline = os.path.join(path_to_data, 'timelines/API/IDF_departments/')
#path_to_timeline=os.path.join(path_to_data,'timelines/API/IDF/')

path_to_external_data = os.path.join(path_to_data, "external-data/")

In [6]:
# Directory holding the parquet chunks to analyse.
path_to_parquets = os.path.join(path_to_data, 'chunks', 'IDF-departments-to-analyze')

# Every parquet file below that directory (recursive), in sorted order.
parquet_files = sorted(Path(path_to_parquets).glob("**/*.parquet"))

In [7]:
print('List files to be processed...')

# List the chunk directory through the Hadoop FileSystem API of the Spark JVM.
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(
    spark._jsc.hadoopConfiguration()
)
list_status = fs.listStatus(
    spark._jvm.org.apache.hadoop.fs.Path(path_to_parquets)
)

paths = [status.getPath().toString() for status in list_status]
#paths=[path.replace('hdfs://dumbo','') for path in paths if 'json' in path]

# NOTE: `paths` is rebound here, so the Hadoop listing above is superseded by a
# seeded (reproducible) shuffle of the locally globbed parquet files.
np.random.seed(0)
paths = np.random.permutation(sorted(parquet_files))

print('# Files:', len(paths))

List files to be processed...
# Files: 16


In [8]:
# Load every parquet chunk as a single DataFrame. The directory is the one
# already stored in `path_to_parquets`, so reuse it instead of rebuilding the
# same path a second time.
df = spark.read.option("encoding", "UTF-8").parquet(path_to_parquets)

KeyboardInterrupt: 

In [None]:
# Number of rows in the loaded parquet chunks.
df.count()

In [None]:
# Working name for the dataset; later cells rebind `tweets` at each processing step.
tweets=df

# 1. Cleaning dataset

In [None]:
# UNIDECODE : remove accents
def make_trans(start=ord(" "), stop=None):
    """Build the two parallel strings used to strip diacritics with ``translate``.

    Every code point in ``[start, stop)`` whose Unicode name contains ``WITH``
    (e.g. ``LATIN SMALL LETTER E WITH ACUTE``) is paired with the character
    whose name is the part before `` WITH`` (``LATIN SMALL LETTER E``).

    Parameters
    ----------
    start : int, optional
        First code point to scan. Defaults to the space character.
    stop : int, optional
        Code point at which to stop (exclusive). Defaults to
        ``sys.maxunicode + 1``, i.e. the scan covers the whole Unicode range.

    Returns
    -------
    tuple of (str, str)
        ``(matching_string, replace_string)``: the character at index ``k`` of
        the first string is meant to be replaced by the character at index
        ``k`` of the second one. Both strings always have the same length.
    """
    if stop is None:
        stop = sys.maxunicode + 1

    # Accumulate in lists and join once at the end rather than growing two
    # strings with ``+=`` on every match.
    matching_chars = []
    replace_chars = []

    for code_point in range(start, stop):
        char = chr(code_point)
        name = unicodedata.name(char, "")
        if "WITH" not in name:
            continue
        try:
            base = unicodedata.lookup(name.split(" WITH")[0])
        except KeyError:
            # The shortened name does not designate a character: nothing to map to.
            continue
        # ``lookup`` can also resolve named sequences (several code points);
        # only 1:1 replacements keep the two strings aligned.
        if len(base) != 1:
            continue
        matching_chars.append(char)
        replace_chars.append(base)

    return "".join(matching_chars), "".join(replace_chars)

def clean_text(c):
    """Return a column expression that removes diacritics from text column ``c``.

    Two steps are applied: characters matching the regex class ``\\p{M}`` are
    deleted, then every character listed by ``make_trans()`` is replaced by
    its counterpart through ``translate``.

    Parameters
    ----------
    c : str
        Name of the column to clean. The result is aliased back to this name.

    Returns
    -------
    Column
        The cleaned expression, named ``c``.
    """
    # NOTE: the translation table is rebuilt on every call.
    matching_string, replace_string = make_trans()
    # Raw string: "\p" is not a valid Python escape sequence, so the non-raw
    # literal triggers an invalid-escape warning. The pattern value is unchanged.
    without_marks = regexp_replace(c, r"\p{M}", "")
    return translate(without_marks, matching_string, replace_string).alias(c)

In [None]:
def clean_dataset(df):
    """Keep original French tweets and normalise their text.

    Parameters
    ----------
    df : DataFrame
        Must expose the columns ``created_at``, ``full_text`` and ``lang``.

    Returns
    -------
    DataFrame
        Columns ``day`` (date taken from ``created_at``), ``full_text``
        (lowercased, handles replaced by ``@mention`` and links by ``<url>``)
        and ``lang``; retweets and rows whose ``lang`` is not ``'fr'`` are
        removed.
    """
    df = df.select(
        # Cast first, alias last, so the output column is unambiguously `day`.
        date_format(col('created_at'), "yyyy-MM-dd").cast("date").alias('day'),
        lower(col('full_text')).alias('full_text'),
        'lang',
    )

    # Remove retweets. The text is already lowercased, and a retweet starts
    # with "rt @user"; testing for a bare "rt" prefix would also discard
    # ordinary tweets that merely begin with those letters (e.g. "rtt ...").
    df = df.filter(~df.full_text.startswith('rt @'))

    # Replace user handles and urls with placeholders
    df = df.withColumn('full_text', regexp_replace('full_text', r'@[A-Za-z0-9-_]+', '@mention'))
    df = df.withColumn('full_text', regexp_replace('full_text', 'https?://[A-Za-z0-9./]+', '<url>'))

    # language : french
    df = df.filter(df.lang == 'fr')

    return df

In [None]:
# Start from `df` (which `tweets` was bound to above) so that this cell can be
# re-run: clean_dataset() needs the raw `created_at` column, which no longer
# exists once `tweets` has been cleaned.
tweets = clean_dataset(df)

tweets = tweets.select('day', clean_text('full_text'))

# repartition() returns a new DataFrame and leaves the original untouched, so
# the result has to be bound; calling it without assignment has no effect.
# NOTE(review): 10000 partitions is the original value - tune it to the cluster.
tweets = tweets.repartition(10000)


# 2. Excess Frequency

## 2.1. Tokenization of words - trigrams

In [None]:
# Cleaning: remove the @mention and <url> placeholders inserted by
# clean_dataset(), then '#', then punctuation / smileys.
tweets = tweets.withColumn('full_text', regexp_replace('full_text', '@mention', ''))
tweets = tweets.withColumn('full_text', regexp_replace('full_text', '<url>', ''))
tweets = tweets.withColumn('full_text', regexp_replace('full_text', '#', ''))
# Raw string: "\s" is not a valid Python escape sequence, so the non-raw
# literal triggers an invalid-escape warning. The pattern value is unchanged:
# drop everything that is neither whitespace nor an ASCII letter or digit.
tweets = tweets.withColumn('full_text', regexp_replace('full_text', r'[^\sa-zA-Z0-9]', ''))

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, NGram, CountVectorizer, HashingTF

# Split `full_text` into tokens (RegexTokenizer with its default pattern).
tokenizer = RegexTokenizer(
    inputCol="full_text",
    outputCol="tokens",
    minTokenLength=1,
)
tweets = tokenizer.transform(tweets)

# Bigrams and trigrams built from the token list, one output column each.
two_gram = NGram(n=2, inputCol="tokens", outputCol="2-grams")
three_gram = NGram(n=3, inputCol="tokens", outputCol="3-grams")
tweets = three_gram.transform(two_gram.transform(tweets))

In [None]:
# Merge unigrams, bigrams and trigrams into one array column named `tokens`.
token_counts = tweets.select(
    'day',
    F.concat(F.col("tokens"), F.col("2-grams"), F.col('3-grams')).alias('tokens'),
)

## 2.2. Get excess frequency

In [None]:
# Daily frequency: one row per (day, token) with its number of occurrences.
token_counts = (
    token_counts
    .withColumn("tokens", F.explode("tokens"))
    .groupBy("day", "tokens")
    .agg(F.count("tokens").alias('counts'))
)

In [None]:
# Excess frequency: mean daily count per token before and after the pivot date.
# NOTE(review): both bounds are strict, so 2020-02-15 itself belongs to neither
# window - confirm this is intended.
token_mean_before = (
    token_counts
    .filter(F.col('day') < '2020-02-15')
    .groupby('tokens')
    .agg(F.mean('counts').alias('mean_before'))
)

token_mean_after = (
    token_counts
    .filter((F.col('day') > '2020-02-15') & (F.col('day') < '2020-05-15'))
    .groupby('tokens')
    .agg(F.mean('counts').alias('mean_after'))
)

# Standard deviation of the daily counts over every day present in the data.
token_std = (
    token_counts
    .groupby('tokens')
    .agg(F.stddev('counts').alias('std'))
)

In [None]:
# One row per token with mean_before, mean_after and std. Outer joins keep
# tokens that are missing from one of the inputs (their columns are null).
token_mean = (
    token_mean_before
    .join(token_mean_after, on='tokens', how='outer')
    .join(token_std, on='tokens', how='outer')
)

In [None]:
# Excess threshold: baseline mean plus two standard deviations. The previous
# expression multiplied the mean by 2*std, which is not on the same scale as
# the `mean_after` value it is compared with.
token_mean = token_mean.select(
    'tokens',
    'mean_before',
    'mean_after',
    (token_mean.mean_before + 2 * token_mean.std).alias('threshold'),
)

# Tokens whose post-pivot mean exceeds the threshold. Rows with a null
# threshold (no baseline mean or no std) are dropped by the first filter.
token_mean_selected = (
    token_mean
    .filter(token_mean.threshold != 0)
    .filter(token_mean.mean_after > token_mean.threshold)
)

In [None]:
# Base-10 log of the after/before ratio of mean daily counts.
token_mean = token_mean.select(
    'tokens',
    'mean_before',
    'mean_after',
    F.log10(F.col('mean_after') / F.col('mean_before')).alias('log'),
)

In [None]:
# Tokens with the largest after/before log-ratio first.
token_mean.orderBy(F.desc('log')).show(n=500)

In [None]:
# Spot-check a single token; it is spelled without accent because the text
# went through clean_text() earlier.
token_mean.filter(token_mean.tokens=='fievre').show(n=100)

In [None]:
#token_mean_selected.show(n=100)

In [None]:
# Inspect the tweets containing the phrase 'chat perdu'.
# NOTE(review): this prints up to 3000 rows of tweet text - consider clearing
# the output before sharing the notebook.
tweets.filter(tweets.full_text.contains('chat perdu')).show(n=3000)