In [None]:
import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patheffects as pe
from matplotlib.pyplot import cm
import matplotlib.gridspec as gridspec
import numpy as np
import math
import re
from ipywidgets import IntRangeSlider, Output, Checkbox, HBox, VBox, RadioButtons, Button, Label, Layout, Text
from IPython.display import display, clear_output
!jupyter nbextension enable --py widgetsnbextension

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("wordnet")
nltk.download("vader_lexicon")

<h1>Initialisation Functions</h1>
Read and convert chat data into organised structures. Define properties and helper functions for generating analytics.

In [None]:
chat_type = "apple" # export format of the chat file: "apple" or "android"
filename = "chat.txt" # path of the exported chat text file which is read and parsed
stopwords_file = "stopwords.txt" # path of the newline-separated stop word list read by calculate_words
figscale = 1 # multiplier applied to the figsize of every figure

<h3>read chat data</h3>

In [None]:
# Parse the exported chat file into two lists of {"date","auth","cont"} dictionaries:
# chat_list holds text messages, media_list holds omitted-media placeholders.
if(chat_type not in ("apple","android")): # fail early with a clear message instead of a NameError further down
    raise ValueError("chat_type must be 'apple' or 'android', got {!r}".format(chat_type))

with open(filename,"r", encoding="utf8") as chat_file: # context manager guarantees the file handle is closed
    chat = chat_file.read()
if(chat_type=="apple"):
    lines = chat.split("\n")[3:] # the first three lines of an apple export are not parsed
else:
    lines = chat.split("\n")

# message contents which mark omitted media rather than text
media_placeholders = ("image omitted","video omitted","sticker omitted","audio omitted","GIF omitted","<Media omitted>")

chat_list = []
media_list = []

for line in lines:
    text = line.encode("ascii", errors="ignore").decode() # keep only ascii characters

    # determine whether line is new chat or continuation of previous chat
    if(chat_type=="apple"): # first line for apple determined by square brackets surrounding message date
        firstline = len(text)>21 and text[0]=="[" and text[21]=="]"
    else: # first line for android determined by comma separating date and time, and dash separating time and author
        firstline = len(text)>18 and text[10]=="," and text[18]=="-"

    if(not firstline): # if line is continuation of previous chat, append content to its entry
        if(chat_list): # a continuation before any parsed message has nothing to attach to, so it is dropped
            chat_list[-1]["cont"] += (" "+text)
        continue

    try: # ignore any broken chats (non-numeric or impossible date fields raise ValueError)
        if(chat_type=="apple"):
            date = datetime.datetime(int(text[7:11]),int(text[4:6]),int(text[1:3]),int(text[13:15]),int(text[16:18]),int(text[19:21]))
            separator = text.find(":",23)
        else:
            separator = text.find(":",20)
            if(separator==-1): # ignore message if no author
                continue
            date = datetime.datetime(int(text[6:10]),int(text[3:5]),int(text[0:2]),int(text[12:14]),int(text[15:17]))
    except ValueError:
        continue

    author_start = 23 if chat_type=="apple" else 20
    author = text[author_start:separator]
    content = text[separator+2:] # skip the colon and the space which follows it

    entry = {"date":date,"auth":author,"cont":content}
    if(content in media_placeholders):
        media_list.append(entry)
    else:
        chat_list.append(entry)

<h3>separate chat data by author</h3>

In [None]:
# Group parsed messages and media by author, then define the mapping from raw media placeholder to display name.
author_chats = {}
for chat in chat_list:
    author_chats.setdefault(chat["auth"], []).append(chat)

author_media = {}
for media in media_list:
    author_media.setdefault(media["auth"], []).append(media)

# fill any gaps in dictionaries for authors which have either only sent chats or only sent media
for author in author_chats:
    author_media.setdefault(author, [])
for author in author_media:
    author_chats.setdefault(author, [])

authors = list(author_chats.keys())
if(chat_type=="apple"):
    medias = {"image omitted":"Image",
               "video omitted":"Video",
               "sticker omitted":"Sticker",
               "audio omitted":"Audio",
               "GIF omitted":"GIF",}
elif(chat_type=="android"): # android chat does not differentiate between media types
    # the key must equal the placeholder text stored by the parser ("<Media omitted>"),
    # otherwise looking up a media entry's content raises KeyError
    medias = {"<Media omitted>":"Media"}
else:
    raise ValueError("chat_type must be 'apple' or 'android', got {!r}".format(chat_type))

<h3>define colours for plots</h3>

In [None]:
# Named colours used by the plots; the None key is the fallback colour for bars without a colour id.
colours = {"total":"slategrey",
           "messages_per_day":"tab:blue",
           "words_per_message":"tab:purple",
           "Image":"dodgerblue",
           "Video":"darkslateblue",
           "Sticker":"palevioletred",
           "Audio":"mediumseagreen",
           "GIF":"goldenrod",
           "Media":"goldenrod",
           "occurrences":"tab:red",
           "alltime_lines":"indianred",
           "major_grid":"gainsboro",
           "minor_grid":"whitesmoke",
           "year_grid":"silver",
           None:"tab:blue"}

author_colours = {} # generate distinct colours for each author
author_cmap = plt.get_cmap("tab10") # pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in matplotlib 3.9
for a,author in enumerate(authors):
    author_colours[author] = author_cmap(a % author_cmap.N) # wrap around so authors beyond the palette size do not all receive the last colour

<h3>create dates for all timescales</h3>

In [None]:
# Build the list of period start dates for every timescale (days, weeks, months, years).
# The range covers text messages AND media, so that media sent before the first or after the last
# text message still maps to a valid day index.
all_datetimes = [entry["date"] for entry in chat_list + media_list]
if(not all_datetimes):
    raise ValueError("no messages were parsed from the chat file")
first_date = min(all_datetimes).date()
last_date = max(all_datetimes).date()

total_days = (last_date - first_date).days + 1
day_dates = [first_date + datetime.timedelta(days=i) for i in range(total_days)]

# position of first monday in dates list; equals total_days when the chat contains no monday at all
first_monday = next((d for d,day in enumerate(day_dates) if day.weekday()==0), total_days)
week_dates = [day_dates[i] for i in range(first_monday,total_days,7)] # every monday inside the chat range
if(first_monday!=0): # week dates are the first day, then every subsequent monday
    week_dates = [day_dates[0]] + week_dates
total_weeks = len(week_dates)

total_months = (last_date.year - first_date.year) * 12 + last_date.month - first_date.month + 1
month_dates = [first_date] + [datetime.date(first_date.year,first_date.month,1) + relativedelta(months=i) for i in range(1,total_months)]

total_years = last_date.year - first_date.year + 1
year_dates = [first_date] + [datetime.date(first_date.year,1,1) + relativedelta(years=i) for i in range(1,total_years)]

dates = {"Days":day_dates,"Weeks":week_dates,"Months":month_dates,"Years":year_dates}

<h3>convert data to appropriate timescale and trim to date range</h3>

In [None]:
def calculate_index(i): # calculates index of new timescale relative to index of day timescale
    """Map day index i (days since first_date) to its index in the current timescale.

    Reads the module-level timescale, first_monday and first_date.
    Returns None for any timescale other than "Weeks", "Months" or "Years".
    """
    if(timescale=="Weeks"):
        if(i<first_monday): # days before the first monday belong to the leading partial week
            return 0
        # week is calculated as 7 days from monday to monday. a leading partial week only exists (and
        # occupies index 0) when the chat does not start on a monday, so the offset of 1 is conditional
        return (i - first_monday)//7 + (1 if first_monday!=0 else 0)
    elif(timescale=="Months"):
        day = first_date + datetime.timedelta(days=i)
        return (day.year - first_date.year) * 12 + day.month - first_date.month # month is calculated as (year difference)*12 + (month difference)
    elif(timescale=="Years"):
        return (first_date + datetime.timedelta(days=i)).year - first_date.year

def scale(data, average=False, author=None): # convert day sums to sums of any other timescale
    """Condense per-day data to the current timescale and trim it to date_range.

    data is a list with one entry per day; each entry is either a number or a list of numbers.
    With average=False the entries of each period are summed (element-wise for lists).
    With average=True each period is the mean of its days weighted by daily message counts,
    taken from the given author, or from all authors combined when author is None.
    """
    start, stop = date_range[0], date_range[1]+1
    if(timescale=="Days"): # day data only needs trimming
        return data[start:stop]

    period_count = len(dates[timescale])
    if(isinstance(data[0], list)): # keep the shape of the input: list of lists, or flat list
        condensed = [[0]*len(data[0]) for _ in range(period_count)]
    else:
        condensed = [0]*period_count

    if(average):
        if(author is None): # weights are the daily message sums of all authors together
            weights = [sum(day) for day in zip(*[content_sums[name]["Message"] for name in authors])]
        else: # weights are the daily message sums of the given author
            weights = content_sums[author]["Message"]
        weight_totals = [0]*period_count # weights condensed to the new timescale

        for i, value in enumerate(data):
            period = calculate_index(i)
            condensed[period] += value * weights[i]
            weight_totals[period] += weights[i]

        # divide the weighted sum of every period by its total weight; periods without messages are 0
        condensed = [weighted/total if total!=0 else 0 for weighted, total in zip(condensed, weight_totals)]
    else:
        for i, value in enumerate(data):
            period = calculate_index(i)
            if(isinstance(value, list)): # element-wise sum (e.g. hour by hour) of all days in the period
                condensed[period] = [a+b for a, b in zip(condensed[period], value)]
            else: # plain sum of all days in the period
                condensed[period] += value

    return condensed[start:stop]

<h1>Calculation functions</h1>
Convert per-author chat data into a per-author daily representation of a statistic. Later used by plot functions to turn into finalised statistics ready for plotting.

<h3>calculate daily sums of all separate content types</h3>

In [None]:
def calculate_content_sums():
    """Count, per author and per day, how many items of every content type were sent.

    Returns {author: {content type: [count for every day]}}. The content types are "Message"
    plus the media types of the chat format ("Media" only, unless chat_type is "apple").
    """
    first_day = dates["Days"][0]
    day_count = len(dates["Days"])
    if(chat_type=="apple"): # apple exports name every media type individually
        content_types = ["Message","Image","Video","Sticker","Audio","GIF"]
    else:
        content_types = ["Message","Media"]

    content_sums = {}
    for author, messages in author_chats.items():
        sums = {}
        for content_type in content_types: # one independent list of daily counts per content type
            sums[content_type] = [0]*day_count

        for message in messages: # count text messages
            day = (message["date"].date() - first_day).days
            sums["Message"][day] += 1

        for media in author_media[author]: # count media under its display name
            day = (media["date"].date() - first_day).days
            sums[medias[media["cont"]]][day] += 1

        content_sums[author] = sums

    return content_sums

<h3>calculate daily sums of words in messages</h3>

In [None]:
def calculate_word_sums():
    """Count, per author and per day, the number of words sent in text messages.

    Returns {author: [word count for every day]}.
    """
    word_sums = {} # daily sums of words for every author
    for author in author_chats:
        sums = [0]*len(dates["Days"])
        for message in author_chats[author]:
            # split() without an argument splits on runs of whitespace, so consecutive spaces
            # (e.g. from joined continuation lines) and empty messages do not count as words
            sums[(message["date"].date() - dates["Days"][0]).days] += len(message["cont"].split())
        word_sums[author] = sums
    return word_sums

<h3>calculate daily sums of messages sent for every hour</h3>

In [None]:
def get_messages_by_hour():
    """Count, per author, per day and per hour of the day, the number of text messages sent.

    Returns {author: [[count for each of the 24 hours] for every day]}.
    """
    first_day = dates["Days"][0]
    day_count = len(dates["Days"])

    messages_by_hour = {}
    for author, messages in author_chats.items():
        hourly = [[0 for _ in range(24)] for _ in range(day_count)] # independent 24 hour list for each day
        for message in messages:
            sent = message["date"]
            hourly[(sent.date() - first_day).days][sent.hour] += 1
        messages_by_hour[author] = hourly

    return messages_by_hour

<h3>calculate daily occurrences of a given search key</h3>

In [None]:
def calculate_occurrences():
    global search
    search = re.sub(r'[^a-zA-Z0-9\s]', '', search.lower()) # search key is alphanumeric lowercase
    occurrences = {} # occurrences for each day for each author

    for author in author_chats:
        sums = [0]*len(dates["Days"])
        for message in author_chats[author]:
            message_alphanum = re.sub(r'[^a-zA-Z0-9\s]', '', message["cont"].lower()) # message is alphanumeric lowercase
            sums[(message["date"].date() - dates["Days"][0]).days] += message_alphanum.count(search) # add occurrences of the search key
        occurrences[author] = sums
    
    return occurrences

<h3>calculate daily sentiment</h3>

In [None]:
def calculate_sentiment():
    """Compute, per author and per day, the weighted mean compound sentiment of text messages.

    Every message is weighted by the square root of its number of split tokens, so longer
    messages count more, with diminishing returns. Days without messages are 0.
    Returns {author: [sentiment for every day]}.
    """
    analyser = SentimentIntensityAnalyzer()
    first_day = dates["Days"][0]
    day_count = len(dates["Days"])

    sentiment = {}
    for author, messages in author_chats.items():
        weighted_scores = [0]*day_count
        weight_totals = [0]*day_count
        for message in messages:
            text = message["cont"]
            day = (message["date"].date() - first_day).days
            weight = len(re.split('[-.,:;!? ]', text))**0.5
            weighted_scores[day] += analyser.polarity_scores(text)["compound"] * weight
            weight_totals[day] += weight

        # dividing by the summed weights of the day brings the value back to the range of a single score
        sentiment[author] = [score/total if total!=0 else 0 for score, total in zip(weighted_scores, weight_totals)]

    return sentiment

<h3>calculate most commonly used or most representative words</h3>

In [None]:
def calculate_words():
    """Rank the lemmatised words of every author within the selected date range.

    With word_type "Representativeness" a word scores log(author usage) * (author usage / total usage),
    normalised so the best word of every author is 1; repeated lemmas inside one message count less.
    Otherwise the ranking is the plain frequency of every lemma.
    Returns {author: {lemma: score}} with every inner dictionary sorted by descending score.
    """
    lemmatizer = WordNetLemmatizer()
    with open(stopwords_file, "r") as stopwords_handle: # context manager closes the stop word file
        stop_words = set(re.sub(r'[^a-zA-Z0-9\s]', '', word) for word in stopwords_handle.read().split("\n")) # filters common "meaningless" words
    lemma_exceptions = {"cos","cus"} # words which are exempt from lemmatisation

    # the selection covers whole periods: from the start of the first selected period up to (but excluding)
    # the start of the period after the last selected one. comparing against the START of the last period
    # would drop every message sent later in that week, month or year
    period_start = dates[timescale][date_range[0]]
    next_period = date_range[1]+1
    period_end = dates[timescale][next_period] if next_period<len(dates[timescale]) else None # None: no upper bound

    word_ranking = {} # ranking of words for each author. either frequency or representativeness
    for author in author_chats:
        ranking = {}
        for message in author_chats[author]:
            message_date = message["date"].date()
            if(message_date<period_start or (period_end is not None and message_date>=period_end)):
                continue
            added_lemmas = {} # how many of which lemmas have been added in the current message. used for scaling down the value of subsequent lemmas
            for word in re.split('[-.,:;!? ]', message["cont"]): # split message into words by any punctuation
                word_alphanum = re.sub(r'[^a-zA-Z0-9\s]', '', word.lower()) # lowercase letters, numbers and spaces only
                if(word_alphanum=="" or word_alphanum in stop_words):
                    continue
                if(word_alphanum not in lemma_exceptions):
                    lemma = lemmatizer.lemmatize(word_alphanum)
                else:
                    lemma = word_alphanum

                if(word_type=="Representativeness"): # subsequent lemmas within the same message are scaled down in value
                    added_lemmas[lemma] = added_lemmas.get(lemma,0) + 1
                    ranking[lemma] = ranking.get(lemma,0) + added_lemmas[lemma]**-0.5
                else: # if frequency calculation, every lemma is worth the same
                    ranking[lemma] = ranking.get(lemma,0) + 1

        word_ranking[author] = dict(sorted(ranking.items(), key=lambda item: item[1], reverse=True)) # sorts dictionary by highest ranking

    if(word_type=="Representativeness"):
        total_word_ranking = {} # total ranking is sum of author rankings
        for author in word_ranking:
            for word, value in word_ranking[author].items():
                total_word_ranking[word] = total_word_ranking.get(word,0) + value

        for author in word_ranking:
            # author word usage over total usage scaled by log of author usage
            scored = {word: math.log(value) * (value/total_word_ranking[word]) for word, value in word_ranking[author].items()}
            scored = dict(sorted(scored.items(), key=lambda item: item[1], reverse=True)) # sorts dictionary by highest ranking
            max_value = next(iter(scored.values()), 0) # 0 when the author has no words in range
            if(max_value>0): # every score is 0 when each lemma was used once (log(1)==0); nothing to normalise then
                scored = {k: v / max_value for k, v in scored.items()}
            word_ranking[author] = scored

    return word_ranking

<h1>Plot helper functions</h1>
Implement generic plot types (line plot, bar plot etc.) which visualise input data. Called by plot functions to create the individual constituent axes of a full visualisation.

<h3>line plot</h3>

In [None]:
def line_ax(ax, data, labels, bipolar, title):
    """Draw one or more lines over the selected dates on the given axes.

    ax      -- matplotlib axes to draw on
    data    -- list of lines, each a list of values aligned with the selected dates (may be empty)
    labels  -- [None] for a single unlabelled total line, otherwise one author name per line
    bipolar -- if True the y axis is symmetric around 0, otherwise it starts at 0
    title   -- title of the axes
    """
    date_axis = dates[timescale][date_range[0]:date_range[1]+1]

    marker = "none" if len(date_axis)>50 else "." # display point markers if little enough data

    # maximum absolute value in all data for axis limits. default covers empty data (e.g. no author
    # selected), and an all-zero maximum falls back to 1 so the axis never collapses
    peak = max((abs(point) for line in data for point in line), default=0)
    max_val = peak if peak!=0 else 1

    for l,line in enumerate(data):
        if(labels==[None]): # line colour, either total colour or author colour
            colour = colours["total"]
        else:
            colour = author_colours[labels[l]]

        ax.plot(date_axis, line, color=colour, label=labels[l], marker=marker, linewidth=1, zorder=3) # plot data
        if(len(data)==1): # fill underneath line if it is the only line
            ax.fill_between(date_axis, line, alpha=0.1, color=colour)

    if(labels!=[None] and len(data)!=0): # plot legend if labels present and there is something to label
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.125), fontsize=8, ncol=100)

    ax.title.set_text(title)
    if(not bipolar):
        ax.set_ylim([0, max_val*1.05])
    else:
        ax.axhline(y=0, linewidth=0.5, color="black", zorder=3)
        ax.set_ylim([-max_val*1.05, max_val*1.05])
    ax.minorticks_on()
    ax.xaxis.set_tick_params(which='minor', bottom=False)
    ax.grid(True,zorder=0,color=colours["major_grid"],linewidth = 1)
    ax.grid(True, zorder=0,color=colours["minor_grid"], which='minor')
    ax.xaxis.grid(which='minor', visible=False)
    for year in dates["Years"]: # year grids
        ax.axvline(x=year, color=colours["year_grid"], zorder=2, linewidth=1)
    if(date_range[0]!=date_range[1]):
        ax.set_xlim([min(date_axis), max(date_axis)]) # set x axis to date limits (if more than 1 point)

<h3>bar plot</h3>

In [None]:
def labeltext(value,ratio): # generate the label text
    """Format value to 3 significant figures; with ratio, format it as "1:x" where x is 1/value."""
    if(not ratio):
        prefix = ""
    elif(value==0): # the reciprocal of 0 is shown as infinity
        return "1:∞"
    else:
        prefix = "1:"
        value = 1/value
    rounded = float(f"{value:.3g}") # round to 3 significant figures
    return prefix + f"{rounded:g}" # :g drops trailing zeros

def addlabels(ax,data,alltime_data,ratio,date): # adds a label for the value rounded to 3 significant figures
    """Write a value label at the end of every horizontal bar.

    ax           -- matplotlib axes holding the bars
    data         -- list of bar collections, each a list with one value per bar
    alltime_data -- all-time values in the same shape as data; only read when date is True
    ratio        -- passed to labeltext to format values as "1:x"
    date         -- if True and only a subsection of dates is selected, labels read "value / all-time value"
    """
    width = 0.9 / len(data)
    shift = np.linspace(width*(len(data)-1)/2, -width*(len(data)-1)/2, len(data)) # calculate y offset for multiple bars

    # largest value over ALL bar collections. max(max(data)) compared the collections as lists
    # (lexicographically) and so did not find the overall maximum; default covers empty collections
    largest = max((bar for bars in data for bar in bars), default=0)
    show_alltime = date==True and date_range!=(0,len(dates[timescale])-1) # subsection of date selected

    for bs, bars in enumerate(data): # iterate through every collection of bars (for multi-bar plots)
        for b, bar in enumerate(bars): # iterate through every bar
            if(largest!=0 and bar/largest>0.2): # label inside of bar if enough space
                colour="white"
                align="right"
            else: # label outside of bar if not enough space
                colour="black"
                align="left"

            if(show_alltime): # write fraction of all-time value
                label = labeltext(bar,ratio)+" / "+labeltext(alltime_data[bs][b],ratio)
            else:
                label = labeltext(bar,ratio)

            ax.text(bar, b+shift[bs]-0.025, label, va="center", ha=align, color=colour, fontsize=7)

def bar_ax(ax, title, data, alltime_data, authors, ratio, colour_ids, labels, bipolar):
    """Draw a horizontal bar chart with one row per author on the given axes.

    ax           -- matplotlib axes to draw on
    title        -- title of the axes
    data         -- list of bar collections; each collection holds one value per author
    alltime_data -- all-time values in the same shape as data, drawn as marker lines when only a subsection of dates is selected
    authors      -- y tick labels, one per row
    ratio        -- passed on to addlabels to format values as "1:x"
    colour_ids   -- one key of colours per bar collection, or [None] to colour each bar by author
    labels       -- one legend label per bar collection; the legend is skipped if any label is None
    bipolar      -- if True the x axis is symmetric around 0

    Nothing besides the title is drawn when data is empty.
    """
    ax.title.set_text(title)
    if(len(data)!=0):
        y = np.arange(len(authors)) # y axis is number of authors
        width = 0.9 / len(data) # all collections of one row share 0.9 of the row height
        shift = np.linspace(width*(len(data)-1)/2, -width*(len(data)-1)/2, len(data)) # calculate y offset for multiple bars

        bar_charts = []
        
        for i,item in enumerate(data): # plot all bars
            bar_charts.append(ax.barh(y+shift[i], item, width, color=colours[colour_ids[i]], label=labels[i], zorder=2))
        
        if(colour_ids==[None]): # if colours not provided, sets each individual bar to author colour (guaranteed only one bar per author)
            colour_ids = [author_colours[a] for a in display_authors] # colours are looked up from the global display_authors, not the authors parameter (which may hold initials)
            for i in range(len(bar_charts[0])):
                bar_charts[0][i].set_color(colour_ids[i])
        
        if(date_range!=(0,len(dates[timescale])-1)): # draw all-time lines if subsection of date selected
            for bs, bars in enumerate(bar_charts):
                for b, bar in enumerate(bars):
                    # white line with a coloured outline, spanning the height of the bar at the all-time value
                    ax.vlines(alltime_data[bs][b],ymin=bar.get_y(),ymax=bar.get_y() + bar.get_height(),colors='white',linewidth=0.5,path_effects=[pe.Stroke(linewidth=2, foreground=colours["alltime_lines"]), pe.Normal()])
        
        max_val = max([max([abs(point) for point in bar]) for bar in data+alltime_data]) if max([max([abs(point) for point in bar]) for bar in data+alltime_data])!=0 else 1 # maximum value in all data and alltime data combined for axis limit
        ax.set_xlim(0,max_val*1.05)

        if(bipolar): # set axis limit to maximum on both sides, centred at 0
            ax.set_xlim(-max_val*1.05,max_val*1.05)
            ax.axvline(x=0, linewidth=0.5, color="black")
        
        ax.xaxis.grid(color=colours["major_grid"])
        ax.set_yticks(y)
        ax.set_yticklabels(authors)
        ax.set_yticks(np.arange(-0.5, len(authors), 1), minor=True) # minor ticks sit between rows so the minor grid separates authors
        ax.tick_params(axis='y', which='minor', length=0)
        ax.grid(which='minor', color=colours["major_grid"], zorder=0)
        if(None not in labels):
            ax.legend(fontsize=8)
        
        addlabels(ax, data, alltime_data, ratio, True)

<h3>polar plot</h3>

In [None]:
def plot_polar(fig, data, labels, clock, total):
    """Add one clock-face ring (12 hours) of hourly data to the figure and return its polar axes.

    fig    -- matplotlib figure to add the axes to
    data   -- list of lines, each a list of hourly values (indexed by hour 0-23) within the range 0-1
    labels -- one legend label per line; used as the author_colours key when total is False
    clock  -- "pm" draws the larger ring for hours 12-23, any other value draws the smaller ring for hours 0-11
    total  -- if True lines use the total colour, otherwise author colours and the ring is shifted right by 1.2
    """
    if(clock=="pm"):
        size = [1.2, 0, 1, 1] # plot size
        tick_labels = range(12,24) # tick labels
        ring = -3 # inner ring size
    else:
        size = [1.4, 0.2, 0.6, 0.6] # plot size
        tick_labels = range(0,12) # tick labels
        ring = -1.4 # inner ring size
    if(not total): # total and individual rings are placed side-by-side
        size[0]+=1.2

    ax = fig.add_axes(size, polar=True) # place inner plot of size dependent on whether inner am or outer pm

    ax.set_rorigin(ring) # inner ring size
    ax.set_yticks([]) # hide grid - added manually to be polygonal
    ax.set_ylim([0,1]) # plots values of range 0-1
    ax.set_xticks(np.linspace(0, 2*np.pi, 12, endpoint=False)) # 12 ticks
    ax.set_xticklabels(tick_labels) # tick labels
    ax.tick_params(pad=-2.5) # label distance from ticks
    ax.set_theta_direction(-1) # clockwise
    ax.set_theta_offset(np.pi/2.0) # start from top
    ax.spines['polar'].set_visible(False) # hide outer border - added manually to be polygonal
    
    theta = np.linspace(0, 2*np.pi, 13) # plot new polygonal gridlines with 5 segments; 13 points so the last point closes the ring
    for i in range(0,6):
        if(i==0 or i==5): # innermost and outermost gridlines act as the border
            ax.plot(theta, [i/5]*13, color='black', linewidth=1)
        else:
            ax.plot(theta, [i/5]*13, color=colours["major_grid"], linewidth=0.5)
    ax.axvline(x = 0, color='black') # axis break at 12pm/am

    for i,item in enumerate(data): # plot lines
        if(total): # line colour
            colour = colours["total"]
        else:
            colour = author_colours[labels[i]]

        if(clock=="pm"):
            hours = item[12:24] + [item[0]] # 12pm-12am if pm
        else:
            hours = item[0:13] # 12am-12pm if am
        ax.plot(theta, hours, marker='.', markevery=range(0,12), color=colour, label=labels[i], linewidth=1) # no marker on the 13th (closing) point
        if(len(data)==1): # fill underneath line if it is the only line
            ax.fill(theta, hours, alpha=0.1, color=colour)
    
    return ax

<h3>list plot</h3>

In [None]:
def list_ax(ax,words,freqs,title,word_colour):
    """Draw a ranked word list as horizontal bars, best word at the top, on the given axes.

    The axes always show display_amount rows, each with its rank number at the right edge,
    and a separator line is drawn for every 10 rows.
    """
    grid_colour = colours["major_grid"]
    faint_colour = colours["year_grid"]

    ax.barh(words,freqs,color=word_colour) # plot
    ax.title.set_text(title)
    ax.invert_yaxis() # top to bottom
    ax.set_ylim([display_amount,-1]) # prevent axis padding
    ax.tick_params(axis='y', which='minor', length=0)
    ax.axes.get_xaxis().set_ticks([])
    addlabels(ax,[freqs],None,False,False)
    if(not len(words)): # hide tick labels if no words
        ax.set_yticklabels([])
    ax.set_yticks(np.arange(-0.5, display_amount, 1), minor=True)
    ax.grid(which='minor', color=grid_colour, zorder=0)
    for rank in range(1,display_amount+1): # word ranking label
        ax.text(ax.get_xlim()[1],rank-1,rank, va="center", ha="right", color=faint_colour, fontsize=7, zorder=0)
    for row in range(display_amount,0,-10): # separator for every 10 words
        ax.axhline(y=row-1.5, color=faint_colour, zorder=2, linewidth=1)

<h1>Plot functions</h1>
Initialise plot objects, convert daily representation of statistic into finalised statistic, and call plot helper functions to fill axes and create a full visualisation.

<h3>plot total and per-author sums of content as 2 line plots</h3>

In [None]:
def plot_content():
    """Show the summed content of all authors and of every displayed author as two stacked line plots."""
    fig, (total_ax, individual_ax) = plt.subplots(nrows=2, sharex=True, figsize=(12*figscale,4*figscale))

    def summed_content(author, scaled): # one list per author: all displayed content types summed together
        per_type = [scale(content_sums[author][content_type]) if scaled else content_sums[author][content_type] for content_type in display_content]
        return [sum(period) for period in zip(*per_type)]

    # individual lines are scaled per displayed author. the total sums the daily data of all authors first, then scales
    individual_data = [summed_content(author, True) for author in display_authors]
    all_authors = [summed_content(author, False) for author in authors]
    total_data = scale([sum(day) for day in zip(*all_authors)])

    line_ax(total_ax, [total_data], [None], False, "Total Content")
    line_ax(individual_ax, individual_data, display_authors, False, "Individual Content")

    fig.tight_layout()
    plt.show()

<h3>plot messages per day, words per message, and media to message ratio as 3 bar plots</h3>

In [None]:
def plot_average_content():
    """Show average daily messages, average words per message and media to message ratio as three bar plots.

    Every value is computed for the selected date range and, for comparison, for the whole chat.
    """
    def safe_ratio(numerator, denominator): # an author without messages in the period yields 0 instead of ZeroDivisionError
        return numerator/denominator if denominator!=0 else 0

    # daily sums of messages, words and media, per author. built in display_authors order because
    # the bar labels and the author colours in bar_ax follow display_authors order
    messagesums = [content_sums[author]["Message"] for author in display_authors]
    wordsums    = [word_sums[author] for author in display_authors]
    mediasums   = [[content_sums[author][media] for media in display_media] for author in display_authors]

    # sums over the selected date range, per author
    range_messages = [sum(scale(daily)) for daily in messagesums]
    range_words    = [sum(scale(daily)) for daily in wordsums]
    range_media    = [[sum(scale(daily)) for daily in author_media_sums] for author_media_sums in mediasums]

    # number of days covered by the selected periods: from the start of the first selected period to the
    # day before the period following the last selected one (or the last day of the chat)
    period_start = dates[timescale][date_range[0]]
    if(date_range[1]+1 < len(dates[timescale])):
        period_end = dates[timescale][date_range[1]+1] - datetime.timedelta(days=1)
    else:
        period_end = dates["Days"][-1]
    num_days = max((period_end - period_start).days + 1, 1)

    # messages per day is the sum of all messages divided by number of days, per author
    messages_per_day         = [total/num_days for total in range_messages]
    alltime_messages_per_day = [sum(daily)/total_days for daily in messagesums]

    # words per message is the sum of all words divided by the sum of all messages, per author
    words_per_message         = [safe_ratio(words, messages) for messages, words in zip(range_messages, range_words)]
    alltime_words_per_message = [safe_ratio(sum(words), sum(messages)) for messages, words in zip(messagesums, wordsums)]

    # media to message ratio is the sum of a media type divided by the sum of all messages, per media, per author
    media_to_message = [[safe_ratio(range_media[j][i], range_messages[j]) for j in range(len(display_authors))] for i in range(len(display_media))]
    alltime_media_to_message = [[safe_ratio(sum(mediasums[j][i]), sum(messagesums[j])) for j in range(len(display_authors))] for i in range(len(display_media))]

    initials = [a[0] for a in display_authors] # all charts after first display only first letter of author

    fig, ax = plt.subplots(ncols=3, figsize=(12*figscale,4*figscale))

    bar_ax(ax[0], "Average Daily Messages",    [messages_per_day],  [alltime_messages_per_day],  display_authors,  False, ["messages_per_day"],  [None],        False)
    bar_ax(ax[1], "Average Words per Message", [words_per_message], [alltime_words_per_message], initials,         False, ["words_per_message"], [None],        False)
    bar_ax(ax[2], "Media to Message Ratio",    media_to_message,    alltime_media_to_message,    initials,         True,  display_media,         display_media, False)

    fig.tight_layout()
    plt.show()

<h3>plot total and per-author messages by hour as 2 sets of AM-PM polar plots</h3>

In [None]:
def plot_messages_by_hour():
    """Show messages by hour of day as two AM/PM polar clock plots: all authors combined, and per displayed author."""
    fig = plt.figure(figsize=(4*figscale, 4*figscale))

    def hourly_totals(author): # messages of every hour of the day summed over the selected periods
        return [sum(hour) for hour in zip(*scale(messages_by_hour[author]))]

    def normalised(values): # rescale so the highest value is 1 (all zeros stay 0)
        peak = max(values, default=0)
        return [value/peak if peak!=0 else 0 for value in values]

    # individual data is one normalised list of hourly sums per displayed author
    individual_data_normalised = [normalised(hourly_totals(author)) for author in display_authors]

    # total data is the hourly sums summed across all authors, then normalised
    all_authors = [hourly_totals(author) for author in authors]
    total_data_normalised = normalised([sum(hour) for hour in zip(*all_authors)])

    total_ax = plot_polar(fig,[total_data_normalised],[None],"pm",True)
    total_ax.set_title("Total Messages by Hour")
    plot_polar(fig,[total_data_normalised],[None],"am",True)

    individual_ax = plot_polar(fig,individual_data_normalised,display_authors,"pm",False)
    individual_ax.set_title("Individual Messages by Hour")
    individual_ax.legend(loc='center right', bbox_to_anchor=(1.425, 0.5), fontsize=8)
    plot_polar(fig,individual_data_normalised,display_authors,"am",False)

    plt.show()

<h3>plot total and per-author occurrences of a given search key as 2 line plots and a summed bar plot</h3>

In [None]:
def plot_occurrences():
    """Show occurrences of the search key as a total line plot, a per-author line plot and a per-author bar plot.

    With search_type "Absolute" the plots show occurrence counts; with "Ratio" they show
    occurrences divided by messages. Raises ValueError for any other search_type.
    """
    if(search_type not in ("Absolute","Ratio")): # fail with a clear message instead of a NameError on the plot data
        raise ValueError("search_type must be 'Absolute' or 'Ratio', got {!r}".format(search_type))

    def safe_ratio(numerator, denominator): # a period or author without messages yields 0 instead of ZeroDivisionError
        return numerator/denominator if denominator!=0 else 0

    fig = plt.figure(figsize=(12*figscale,4*figscale))
    gs = gridspec.GridSpec(2,7)
    ax1 = plt.subplot(gs[0, 0:5])
    ax2 = plt.subplot(gs[1, 0:5])
    ax3 = plt.subplot(gs[:, 5:])

    # daily occurrences summed across all authors (used by the total line of both search types)
    total_occurrences = [sum(day) for day in zip(*[occurrences[author] for author in authors])]

    if(search_type=="Absolute"):
        title1 = "Total Occurrences of '{search}'".format(search=search)
        title2 = "Individual Occurrences of '{search}'".format(search=search)
        title3 = "Occurrences of '{search}'".format(search=search)

        # absolute individual data is occurrences per author. total data is sum of occurrences across all authors
        # (not only the displayed ones, consistent with the ratio total). bar data is sum of all days per author
        individual_data =  [scale(occurrences[author]) for author in display_authors]
        total_data =       scale(total_occurrences)
        bar_data =         [sum(scale(occurrences[author])) for author in display_authors]
        alltime_bar_data = [sum(occurrences[author]) for author in display_authors]

    else:
        title1 = "Total Occurrences of '{search}' to Message Ratio".format(search=search)
        title2 = "Individual Occurrences of '{search}' to Message Ratio".format(search=search)
        title3 = "Occurrences of '{search}' to Message Ratio".format(search=search)

        # individual ratio data is occurrences divided by messages for a given period, per author
        individual_data = [[safe_ratio(found, sent) for found, sent in zip(scale(occurrences[author]),scale(content_sums[author]["Message"]))] for author in display_authors]

        # total ratio data is occurrences summed across all authors divided by messages summed across all authors
        total_messages = [sum(day) for day in zip(*[content_sums[author]["Message"] for author in authors])]
        total_data =     [safe_ratio(found, sent) for found, sent in zip(scale(total_occurrences),scale(total_messages))]

        # bar data is sum of all occurrences divided by sum of all messages per author
        bar_data =         [safe_ratio(sum(scale(occurrences[author])), sum(scale(content_sums[author]["Message"]))) for author in display_authors]
        alltime_bar_data = [safe_ratio(sum(occurrences[author]), sum(content_sums[author]["Message"])) for author in display_authors]

    line_ax(ax1, [total_data], [None], False, title1)
    line_ax(ax2, individual_data, display_authors, False, title2)

    initials = [a[0] for a in display_authors]
    bar_ax(ax3, title3, [bar_data], [alltime_bar_data], initials, search_type=="Ratio", [None], [None], False)

    plt.setp(ax1.get_xticklabels(), visible=False)
    gs.tight_layout(fig)
    plt.show()

<h3>plot total and per-author sentiment as 2 line plots and a summed bar plot</h3>

In [None]:
def _weighted_mean(values, weights):
    """Return sum(value*weight)/sum(weight), or 0 when the weights sum to zero."""
    weights = list(weights) # materialised because it is iterated twice below
    total_weight = sum(weights)
    if(total_weight==0): # no messages to weight by: report neutral sentiment instead of dividing by zero
        return 0
    return sum(value*weight for value,weight in zip(values,weights))/total_weight


def plot_sentiment():
    """Plot total and per-author sentiment as 2 line plots and a summed bar plot.

    Reads the module-level sentiment, content_sums, authors, display_authors and
    figscale, and draws through line_ax / bar_ax. An author (or a day) without any
    messages contributes a sentiment of 0 rather than raising ZeroDivisionError.
    """
    fig = plt.figure(figsize=(12*figscale,4*figscale))
    gs = gridspec.GridSpec(2,7)
    ax1 = plt.subplot(gs[0, 0:5])
    ax2 = plt.subplot(gs[1, 0:5])
    ax3 = plt.subplot(gs[:, 5:])

    individual_data = [scale(sentiment[author], average=True, author=author) for author in display_authors]

    # total data is total sentiment divided by total number of messages. total sentiment is the daily author
    # sentiment multiplied by daily author messages, summed across all authors. multiplication by author messages
    # happens first, and division by total messages happens last, so that authors are weighted by how much they
    # wrote while the final value stays within the -1,1 range of sentiment
    weighted_sentiment = [[day[0]*day[1] for day in zip(sentiment[author],content_sums[author]["Message"])] for author in authors]
    total_sentiment = [sum(day) for day in zip(*weighted_sentiment)]
    total_messages = [sum(day) for day in zip(*[content_sums[author]["Message"] for author in authors])]
    total_data = scale([day[0]/day[1] if day[1]!=0 else 0 for day in zip(total_sentiment,total_messages)], average=True)

    # bar data is daily sentiment multiplied by daily messages, all divided by sum of messages
    alltime_bar_data = [_weighted_mean(sentiment[author], content_sums[author]["Message"]) for author in display_authors]
    # NOTE(review): unlike individual_data, sentiment is scaled here without author=author - confirm this is intended
    bar_data =         [_weighted_mean(scale(sentiment[author], average=True), scale(content_sums[author]["Message"])) for author in display_authors]

    line_ax(ax1, [total_data], [None], True, "Total Sentiment")
    line_ax(ax2, individual_data, display_authors, True, "Individual Sentiment")

    initials = [a[0] for a in display_authors]
    bar_ax(ax3, "Sentiment", [bar_data], [alltime_bar_data], initials, False, [None], [None], True)

    plt.setp(ax1.get_xticklabels(), visible=False) # top plot shares its x labels with the plot below it
    gs.tight_layout(fig)
    plt.show()

<h3>plot most commonly used or most representative words as total and one-per-author list plots</h3>

In [None]:
def plot_words():
    """Plot the top display_amount words as one list plot per displayed author.

    In "Representativeness" mode only the per-author lists are drawn; otherwise a
    leading "Total" list, built by summing every author's ranking, is added.
    Reads the module-level word_type, word_ranking, display_authors,
    display_amount, colours, author_colours and figscale.
    """
    if(word_type=="Representativeness"): # do not display total ranking if displaying representativeness
        title = "Top {amount} Most Representative Words".format(amount=display_amount)
        total_freqs, total_words, total_title, total_colour = [], [], [], []
    else:
        total_word_ranking = {} # total ranking is sum of author rankings
        for author in word_ranking:
            for word in word_ranking[author]:
                total_word_ranking[word] = total_word_ranking.get(word,0) + word_ranking[author][word]
        # highest ranking first, cut down to the amount being displayed
        top_total = sorted(total_word_ranking.items(), key=lambda item: item[1], reverse=True)[0:display_amount]

        title = "Top {amount} Most Frequent Words".format(amount=display_amount)
        total_freqs = [[freq for word,freq in top_total]]
        total_words = [[word for word,freq in top_total]]
        total_title = ["Total"]
        total_colour = [colours["total"]]

    freqs = total_freqs + [list(word_ranking[author].values())[0:display_amount] for author in display_authors]
    words = total_words + [list(word_ranking[author].keys())[0:display_amount] for author in display_authors]
    words = [[word[0:8]+".." if len(word)>9 else word for word in column] for column in words] # shorten long words so they fit their column
    titles = total_title + list(display_authors)
    word_colours = total_colour + [author_colours[author] for author in display_authors]

    columns = len(freqs)
    # squeeze=False always returns a 2D grid of axes; without it a single column yields a bare Axes that cannot be indexed
    fig, ax = plt.subplots(ncols=columns, figsize=(12*figscale,0.175*figscale*display_amount+1), squeeze=False)

    for i in range(columns): # draw a list for total and every author
        list_ax(ax[0][i],words[i],freqs[i],titles[i],word_colours[i])

    fig.suptitle(title)
    fig.tight_layout(w_pad=0.25)
    plt.show()

<h1>Execution Functions</h1>
Execute functions to generate statistics, and create a GUI.

<h3>perform calculations to generate all message statistics</h3>

In [None]:
def perform_calculations():
    """Run every calculate_*/get_* routine once and publish each result as a module-level variable."""
    global content_sums, word_sums, messages_by_hour
    global occurrences, sentiment, word_ranking

    # the routines run one after another in this fixed order
    content_sums = calculate_content_sums()
    word_sums = calculate_word_sums()
    messages_by_hour = get_messages_by_hour()
    occurrences = calculate_occurrences()
    sentiment = calculate_sentiment()
    word_ranking = calculate_words()

<h3>draw all plots selected for display</h3>

In [None]:
# maps the label of every plot checkbox to the function that draws that plot
plots = {
    "Content": plot_content,
    "Average Content": plot_average_content,
    "Messages by Hour": plot_messages_by_hour,
    "Occurrences": plot_occurrences,
    "Sentiment": plot_sentiment,
    "Words": plot_words,
}

def draw_plots():
    """Redraw, inside the `out` widget, every plot whose checkbox is ticked."""
    with out:
        selected = []
        for checkbox in plot_checkboxes:
            if checkbox.value==True:
                selected.append(plots[checkbox.description])

        if(len(selected)==0): # nothing ticked: clear plots and don't draw anything
            clear_output()
            return

        clear_output(wait=True)
        for render in selected: # draw every selected plot
            render()

<h3>GUI and execution</h3>

In [None]:
def _parse_display_amount(text, default=10):
    """Return text as a positive integer, or default when it is empty, non-numeric or not positive."""
    try:
        amount = int(text)
    except (TypeError, ValueError): # empty box or free text that is not a whole number
        return default
    return amount if amount>0 else default


def set_values(): # set values of variables used for creating plots to respective widget states
    """Copy the current state of the GUI widgets into the module-level variables read by the plots.

    Sets display_authors, display_media, display_content, timescale, date_range,
    search, search_type, word_type and display_amount. An empty or invalid
    "Display Amount" entry falls back to 10 instead of raising ValueError.
    """
    global display_authors, display_media, display_content
    global timescale, date_range
    global search, search_type
    global word_type, display_amount

    display_authors = [checkbox.description for checkbox in author_checkboxes if checkbox.value==True]
    display_media = [checkbox.description for checkbox in media_checkboxes if checkbox.value==True]
    # first entry is "Message", or None when the message checkbox is unticked
    display_content = ["Message" if message_checkbox.value==True else None] + display_media

    timescale = timescale_buttons.value
    date_range = date_slider.value

    search = occurrences_box.value
    search_type = occurrences_type.value

    word_type = word_type_box.value
    display_amount = _parse_display_amount(word_display_box.value)


def apply_clicked(b): # execute plotting upon button click
    """Apply-button callback: sync widget state, recalculate what changed, then redraw.

    b is the clicked button; it is not used.
    """
    # total_word_ranking is declared global here but is never assigned in this function
    global occurrences, word_ranking, total_word_ranking

    # compare against the previously applied values *before* set_values() overwrites them
    search_changed = occurrences_box.value != search # a new search key needs a new occurrences calculation
    words_changed = word_type != word_type_box.value or date_range != date_slider.value # different word type or date change needs a new words calculation

    set_values()

    if(search_changed):
        occurrences = calculate_occurrences()
    if(words_changed):
        word_ranking = calculate_words()

    draw_plots()

# "Apply" button: clicking it runs apply_clicked
apply = Button(description="Apply")
apply.on_click(apply_clicked)


# one checkbox per entry in `plots`, all initially unticked; draw_plots uses each
# checkbox description as the key into `plots`
plot_checkboxes = []
for plot in plots.keys():
    plot_checkboxes.append(Checkbox(value=False, description=plot, disabled=False, indent=False, layout=Layout(width='150px')))
plot_container = VBox([Label("Select Plots")]+plot_checkboxes)


# one checkbox per author, all initially ticked; set_values collects the descriptions
# of the ticked ones into display_authors
author_checkboxes = []
for author in authors:
    author_checkboxes.append(Checkbox(value=True, description=author, disabled=False, indent=False, layout=Layout(width='150px')))
author_container = VBox([Label("Display Authors")]+author_checkboxes)


def change_timescale(change):
    """Observer for the timescale radio buttons: reset the date slider to the full range of the new timescale.

    change is the notification passed in by the widget; it is not used.
    """
    global timescale
    timescale = timescale_buttons.value

    last_index = len(dates[timescale])-1 # slider positions are indices into dates[timescale]

    date_slider.min = 0
    date_slider.max = last_index
    date_slider.value = (0, last_index)

    date_readout.value = slider_readout()

# timescale selector; the chosen option string is used as the key into `dates`
# (see change_timescale), which runs whenever the selection changes
timescale_buttons = RadioButtons(options=["Days","Weeks","Months","Years"],layout=Layout(width='150px'))
timescale_buttons.observe(change_timescale, names="value")
timescale_container = VBox([Label("Timescale"),timescale_buttons])


# one checkbox per value of `medias` plus a separate "Message" checkbox, all initially ticked
media_checkboxes = []
for media in medias.values():
    media_checkboxes.append(Checkbox(value=True, description=media, disabled=False, indent=False, layout=Layout(width='150px')))
message_checkbox = Checkbox(value=True, description="Message", disabled=False, indent=False, layout=Layout(width='150px'))
content_container = VBox([Label("Content Types")]+[message_checkbox]+media_checkboxes)


# occurrences settings: free-text search key and the Absolute/Ratio display mode
occurrences_box = Text(value='', placeholder="Search Key", layout=Layout(width='150px'))
occurrences_type = RadioButtons(options=["Absolute","Ratio"])
occurrences_container = VBox([Label("Occurrences Settings")]+[occurrences_box,occurrences_type])

# words settings: how many words to list (set_values uses 10 when left empty) and the ranking type
word_display_box = Text(value='', placeholder="Display Amount", layout=Layout(width='150px'))
word_type_box = RadioButtons(options=["Frequency","Representativeness"])
words_container = VBox([Label("Words Settings")]+[word_display_box,word_type_box])

settings_container = VBox([occurrences_container,words_container])


# all controls side by side in a single row
toolbar = HBox([apply,plot_container,author_container,timescale_container,content_container,settings_container])


def slider_readout():
    """Return the "start - end" text for the current slider selection.

    Days and Weeks show the full str() of each date, Months drop its last 3
    characters and Years its last 6. Any other timescale returns None.
    """
    # part of str(date) that is kept for each timescale
    kept_part = {
        "Days": slice(None),
        "Weeks": slice(None),
        "Months": slice(None, -3),
        "Years": slice(None, -6),
    }
    if timescale not in kept_part:
        return None

    keep = kept_part[timescale]
    period_dates = dates[timescale]
    start = str(period_dates[date_slider.value[0]])[keep]
    end = str(period_dates[date_slider.value[1]])[keep]
    return start + " - " + end

def slider_change(*args):
    """Observer for the date slider: show the newly selected range in the readout label (args are ignored)."""
    readout_text = slider_readout()
    date_readout.value = readout_text

# range slider whose two handles are indices into dates[timescale]; it starts out
# covering the whole "Days" list. The built-in readout is switched off because the
# date_readout label (refreshed by slider_change) shows the selected dates instead.
date_slider = IntRangeSlider( value=[0, len(dates["Days"])-1], min=0, max=len(dates["Days"])-1, continuous_update=True, readout=False, layout=Layout(width='1030px'))
date_slider.observe(slider_change, names='value')
date_readout = Label(str(dates["Days"][0]) + " - " + str(dates["Days"][-1]))
date_range_container = HBox([date_readout,date_slider])


# show the GUI, then initialise state in order: widget values -> statistics -> plots.
# every plot checkbox is created unticked, so this first draw_plots() call only clears the output area
out = Output()
display(toolbar,date_range_container,out)
set_values()
perform_calculations()
draw_plots()