In [1]:
import json
import pandas as pd
import numpy as np
from zipfile import ZipFile 
import os
import datetime
from random import randint
import matplotlib.colors as mc
import colorsys
from matplotlib import pyplot as plt
import matplotlib.animation as animation
import re
from matplotlib import ticker

# Folder into which the conversation JSON files are extracted and later read back.
DIR = "messages"
# Sender name excluded by the "received messages" filter further down.
# Empty here — presumably to be filled in with your own display name in the export.
USER_NAME = ""
# Path handed to ZipFile() in the extraction cell.
# NOTE(review): ZipFile does not expand wildcards, so "*.zip" looks like a
# placeholder to be replaced with the real archive name — confirm.
FILE = "*.zip"

In [2]:
# Extract the conversation .json files from the archive.
#
# Every "inbox" member ending in ".json" is written to DIR/<name>/message_*.json,
# where <name> is the part of the archive path between position 15 and the first
# underscore.

# Create the output folder named by DIR (no error if it already exists).
os.makedirs(DIR, exist_ok=True)

# Open the zip file; the context manager closes the handle even if extraction fails.
with ZipFile(FILE) as zipf:
    for zipinfo in zipf.infolist():
        archive_path = zipinfo.filename
        if "inbox" not in archive_path or not archive_path.endswith(".json"):
            continue

        # 15 == len("messages/inbox/") — assumes every inbox path starts with that
        # prefix (TODO confirm against the archive layout).
        name = archive_path[15:archive_path.find("_")]
        # A "/" means the first underscore was found below the conversation folder;
        # an empty name would extract straight into DIR. Skip both.
        if not name or "/" in name:
            continue

        # Keep only the "message_N.json" tail so the file lands directly in the
        # conversation folder; skip .json members that are not message files.
        tail_start = archive_path.find("message_")
        if tail_start == -1:
            continue
        zipinfo.filename = archive_path[tail_start:]

        full_path = DIR + "/" + name
        os.makedirs(full_path, exist_ok=True)
        zipf.extract(zipinfo, full_path)
print('Terminé!')

Terminé!


In [3]:
# Read the extracted .json files.
# `names` maps each folder under DIR to the list of parsed JSON documents it contains.

names = {}
for name in os.listdir(DIR):
    folder = DIR+'/'+name
    for file in os.listdir(folder):
        with open(folder+'/'+file, 'r', encoding='utf-8') as myfile:
            parsed = json.loads(myfile.read())
        # First file of a folder creates its list; later ones are appended to it.
        names.setdefault(name, []).append(parsed)

In [4]:
# Flatten the "messages" entries of every parsed file into one list,
# keeping the order folder -> file -> message.

messages = [
    message
    for parsed_files in names.values()
    for parsed_file in parsed_files
    for message in parsed_file["messages"]
]

In [5]:
# Convert the flat list of message dicts into a DataFrame (one row per message).

df = pd.DataFrame(messages)

In [6]:
# Keep received messages only: drop every row whose sender is USER_NAME
# (i.e. the messages we sent ourselves).

received_mask = df["sender_name"].ne(USER_NAME)
df = df[received_mask]

In [7]:
# Sort chronologically, derive a calendar date from the millisecond timestamp,
# keep only sender and date, and add an empty "Count" column to be filled next.
# NOTE(review): no timezone conversion is applied to the timestamp — presumably
# the resulting days are UTC days; confirm if local days matter.

df = (
    df.sort_values(by='timestamp_ms')
    .reset_index(drop=True)
    .assign(Date=lambda frame: pd.to_datetime(frame["timestamp_ms"], unit="ms").dt.date)
    .loc[:, ["sender_name", "Date"]]
    .assign(Count=np.nan)
)

In [8]:
# Running message counter: "Count" is the number of messages the row's sender
# has sent so far, including this one (1 for their first message, 2 for the next...).
#
# cumcount() numbers the rows of each sender 0, 1, 2, ... in row order, which is
# chronological here because df is sorted by timestamp. Unlike a positional loop
# with df.at[i, ...], it does not depend on the index being 0..n-1.
# Assumes every message has a sender_name (no missing values) — TODO confirm.

df["Count"] = (df.groupby("sender_name").cumcount() + 1).astype(int)

In [None]:
# Helper for looping over dates, from the first message up to the last one received.

def daterange(start_date, end_date):
    """Yield consecutive days starting at start_date and stopping before end_date.

    end_date itself is never yielded; nothing is yielded when end_date is not
    after start_date.
    """
    total_days = int((end_date - start_date).days)
    day_offset = 0
    while day_offset < total_days:
        yield start_date + datetime.timedelta(day_offset)
        day_offset += 1

In [None]:
# Build the list of daily records: for every sender and every calendar day between
# the first and the last message, the sender's cumulative message count that day.
# Days without a message repeat the previous total (0 before the first message).

final_df = []
begin = df["Date"].iloc[0]
end = df["Date"].iloc[-1]
# daterange() stops *before* its end date, so go one day past the last message;
# otherwise the final day's messages never make it into the result.
stop = end + datetime.timedelta(days=1)

for name in df["sender_name"].unique():
    tmp = df[df["sender_name"] == name]
    # Total reached at the end of each day this sender wrote (rows are in
    # chronological order, so the last row of a day carries that day's total).
    # Computed once per sender instead of re-filtering the rows for every date.
    daily_totals = tmp.groupby("Date")["Count"].last().to_dict()

    running_total = 0
    for date in daterange(begin, stop):
        # Carry the previous total forward on days without any message.
        running_total = daily_totals.get(date, running_total)
        final_df.append({"Count": running_total, "Date": date, "sender_name": name})

In [None]:
# Convert the list of daily records into a DataFrame
# (columns: Count, Date, sender_name).

final_df = pd.DataFrame(final_df)

In [None]:
# Sanity check on the row total: there must be exactly one row for every
# (date, sender) combination.

n_dates = len(final_df["Date"].unique())
n_senders = len(final_df["sender_name"].unique())
print("OK!" if n_dates * n_senders == len(final_df) else "Problem")

OK!


In [None]:
# From here on `df` refers to the daily totals (the plotting cell reads `df`);
# the `final_df` name is removed so only one reference remains.
df = final_df
del final_df

In [None]:
# Build the Bar Chart Race .mp4 file
# Code from https://medium.com/@6berardi/how-to-create-a-smooth-bar-chart-race-with-python-ad2daf6510dc

# One frame per distinct date, followed by the last date repeated 10 more times
# (presumably so the final ranking stays on screen at the end of the video).
frames_list = df["Date"].unique().tolist()
frames_list.extend(df['Date'].iloc[-1] for _ in range(10))

def transform_color(color, amount = 0.5):
    """Return `color` as an (r, g, b) tuple with its lightness rescaled.

    The new lightness is ``1 - amount * (1 - lightness)``, clamped to [0, 1]:
    ``amount == 1`` leaves the colour unchanged, smaller values lighten it
    (0 gives white) and larger values darken it.

    Parameters
    ----------
    color : a key of ``mc.cnames`` or anything ``mc.to_rgb`` accepts
        (e.g. a ``'#RRGGBB'`` string).
    amount : float, default 0.5

    Returns
    -------
    tuple of three floats, each within [0, 1].
    """
    try:
        c = mc.cnames[color]
    except (KeyError, TypeError):
        # Not a named colour (KeyError) or not hashable, e.g. an RGB list (TypeError):
        # hand the value to to_rgb unchanged.
        c = color

    # The conversion must run for named colours too; previously it only ran in the
    # fallback branch, leaving `c` a hex string and breaking the arithmetic below.
    hue, lightness, saturation = colorsys.rgb_to_hls(*mc.to_rgb(c))

    # Without clamping, a dark colour combined with amount > 1 gives a negative
    # lightness and therefore RGB channels outside the 0-1 range.
    new_lightness = min(1.0, max(0.0, 1 - amount * (1 - lightness)))
    channels = colorsys.hls_to_rgb(hue, new_lightness, saturation)
    # Guard against floating-point results that land a hair outside [0, 1].
    return tuple(min(1.0, max(0.0, channel)) for channel in channels)

# One random '#RRGGBB' colour per sender, in order of first appearance in df.
all_names = df['sender_name'].unique().tolist()
random_hex_colors = ['#' + '%06X' % randint(0, 0xFFFFFF) for _ in all_names]

# Three palettes derived from the same random colours:
#   rgb_colors          - the colour itself as an RGB tuple (amount 1)
#   rgb_colors_opacity  - the same RGB with 0.825 appended as alpha
#   rgb_colors_dark     - a darker variant (amount 1.12)
rgb_colors = [transform_color(hex_color, 1) for hex_color in random_hex_colors]
rgb_colors_opacity = [rgb + (0.825,) for rgb in rgb_colors]
rgb_colors_dark = [transform_color(hex_color, 1.12) for hex_color in random_hex_colors]

# Single figure/axes pair for the chart.
fig, ax = plt.subplots(figsize = (36, 20))

# Number of bars (top senders) shown in each frame.
num_of_elements = 8

# Sender -> colour lookups, built once here instead of on every animation frame.
normal_colors = dict(zip(all_names, rgb_colors_opacity))
dark_colors = dict(zip(all_names, rgb_colors_dark))

def draw_barchart(Time):
    """Redraw `ax` for one animation frame.

    Clears the axes and draws a horizontal bar chart of the `num_of_elements`
    senders with the highest "Count" on the given date, with a name and value
    label next to each bar, the date in the top-right corner and a title.

    Parameters
    ----------
    Time : a value of df['Date'] (one entry of the animation's frame list).
    """
    # Ascending sort + tail() keeps the largest counts, with the biggest bar on top.
    df_frame = df[df['Date'].eq(Time)].sort_values(by = 'Count', ascending = True).tail(num_of_elements)
    ax.clear()

    senders = df_frame['sender_name']
    ax.barh(senders, df_frame['Count'], color = [normal_colors[x] for x in senders], height = 0.8,
            edgecolor = [dark_colors[x] for x in senders], linewidth = 6)

    # Small horizontal gap between the end of a bar and its labels.
    dx = float(df_frame['Count'].max()) / 200

    for i, (value, name) in enumerate(zip(df_frame['Count'], senders)):
        # First word of the sender name slightly above the bar's centre line,
        # the count slightly below it.
        ax.text(value + dx, i + (num_of_elements / 50), '    ' + name.split(" ")[0],
        size = 36, weight = 'bold', ha = 'left', va = 'center', fontdict = {'fontname': 'Trebuchet MS'})
        ax.text(value + dx, i - (num_of_elements / 50),     str(value), size = 36, ha = 'left', va = 'center')

    # Strips everything from a literal '^' onwards in the frame label.
    time_unit_displayed = re.sub(r'\^(.*)', r'', str(Time))
    ax.text(1.0, 1.14, time_unit_displayed, transform = ax.transAxes, color = '#666666',
            size = 62, ha = 'right', weight = 'bold', fontdict = {'fontname': 'Trebuchet MS'})
    ax.text(-0.005, 1.06, '# messages', transform = ax.transAxes, size = 30, color = '#666666')
    # NOTE(review): the year range in this title is hard-coded and does not match
    # the "2008-2019" used in the output file name — confirm which one is intended.
    ax.text(-0.005, 1.14, 'Number of messages from 2008 to 2020', transform = ax.transAxes,
            size = 62, weight = 'bold', ha = 'left', fontdict = {'fontname': 'Trebuchet MS'})

    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis = 'x', colors = '#666666', labelsize = 28)
    ax.set_yticks([])
    ax.set_axisbelow(True)
    ax.margins(0, 0.01)
    ax.grid(which = 'major', axis = 'x', linestyle = '-')

    # Address the animated figure/axes explicitly rather than pyplot's "current"
    # ones, so these calls cannot land on another figure that happens to be active.
    ax.locator_params(axis = 'x', nbins = 4)
    ax.set_frame_on(False)
    fig.subplots_adjust(left = 0.075, right = 0.75, top = 0.825, bottom = 0.05, wspace = 0.2, hspace = 0.2)

# Call draw_barchart once per entry of frames_list and write the result to a video file.
output_path = "Racing Bar Chart 2008-2019.mp4"
animator = animation.FuncAnimation(fig, draw_barchart, frames = frames_list)
animator.save(output_path, fps = 20, bitrate = 1800)