In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

import pprint
import json

import dask
import dask.dataframe as dd
from dask.multiprocessing import get
from collections import Counter

import networkx as nx

pp = pprint.PrettyPrinter(indent=4)

In [None]:
# Load the people, teams and channels frames from the intermediate data directory.
# NOTE(review): unpickling can execute arbitrary code — only read pickle files
# that come from a trusted source.
people = pd.read_pickle("./data/intermediate/people.pkl")
teams = pd.read_pickle("./data/intermediate/teams.pkl")
channels = pd.read_pickle("./data/intermediate/channels.pkl")

In [None]:
people.head(3)

In [None]:
teams.head(3)

In [None]:
channels.head(3)

## Enrich channels
Calculate the number of active users in a channel and the number of messages in each channel

In [None]:
# Per-channel aggregates derived from every person's membership lists:
#   channel_active_users       -> number of members with at least one message
#   calc_channel_message_count -> total messages posted by all members
channel_active_users = {}
calc_channel_message_count = {}

for _idx, person in people.iterrows():
    # "channels" and "channel message counts" are positionally aligned lists,
    # so they are walked together (same pairing as the user_channel_dict cell).
    for channel, message_count in zip(person["channels"], person["channel message counts"]):
        # Register the channel even if this member never posted, so channels
        # without any activity still show up with a count of 0.
        channel_active_users.setdefault(channel, 0)
        calc_channel_message_count.setdefault(channel, 0)

        if message_count > 0:
            channel_active_users[channel] += 1

        # Accumulate over members; a plain assignment here would keep only the
        # count of whichever member happened to be visited last.
        calc_channel_message_count[channel] += message_count

In [None]:
def enrich_channels(row):
    """Attach the active-member count to a channel row.

    Looks up ``row["label"]`` in the module-level ``channel_active_users``
    mapping and stores the value under ``row["active members"]``; labels that
    are missing from the mapping get 0.

    The row is modified in place and returned.
    """
    row["active members"] = channel_active_users.get(row["label"], 0)
    return row


# Run enrich_channels over every row: the frame is split into 24 dask
# partitions and each partition is processed with a row-wise pandas apply
# under the multi-process scheduler.
channels = (
    dd.from_pandas(channels, npartitions=24)
    .map_partitions(lambda partition: partition.apply(enrich_channels, axis=1))
    .compute(scheduler="processes")
)

In [None]:
channels.to_pickle("./data/intermediate/channels_enriched.pkl")

In [None]:
channels.head(3)

## Acquaintance analysis 

In [None]:
# channel -> list of userids that list the channel among their channels.
channel_members = {}

for _idx, person in people.iterrows():
    member_id = person.userid
    for channel in person.channels:
        channel_members.setdefault(channel, []).append(member_id)

In [None]:
def add_channel_member_info(row):
    """Attach the member list of a channel to its row.

    ``row["channel members"]`` is set to the list stored in the module-level
    ``channel_members`` mapping for ``row["label"]``, or to an empty list when
    the label has no entry.

    The row is modified in place and returned.
    """
    row["channel members"] = []

    label = row["label"]
    if label not in channel_members:
        return row

    row["channel members"] = channel_members[label]
    return row


# Run add_channel_member_info over every row, using 24 dask partitions and the
# multi-process scheduler; each partition is handled by a row-wise pandas apply.
channels = (
    dd.from_pandas(channels, npartitions=24)
    .map_partitions(lambda partition: partition.apply(add_channel_member_info, axis=1))
    .compute(scheduler="processes")
)

In [None]:
channels.head(5)

In [None]:
channels.to_pickle("./data/intermediate/channels_enriched.pkl")

In [None]:
# userid -> {channel: that user's message count in the channel}
user_channel_dict = {}

for _idx, person in people.iterrows():
    # The two lists are paired by position.
    per_channel_counts = dict(zip(person["channels"], person["channel message counts"]))
    user_channel_dict[person["userid"]] = per_channel_counts

In [None]:
# Lookup tables keyed by channel label, read from the channels frame.
messages_per_channel = channels.set_index("label")["message count"].to_dict()

active_members_per_channel = channels.set_index("label")["active members"].to_dict()

In [None]:
# Score every pair of users that share a channel.
#
# acquaintances[userid][other] holds:
#   "score"           -> sum over shared channels of
#                        (1 + 2 / members)
#                        * (other's messages / channel messages)
#                        * (1 + 1 / active members)
#   "channels_shared" -> number of shared channels that have at least 2 members
acquaintances = {}

for _idx, person in people.iterrows():
    userid = person["userid"]
    user_acquaintances = {}
    acquaintances[userid] = user_acquaintances
    print(userid, end="\r")  # progress indicator, overwritten in place

    for channel in person["channels"]:
        members = channel_members[channel]
        channel_member_count = len(members)

        # A channel with a single member has nobody to be acquainted with.
        if channel_member_count < 2:
            continue

        # Both lookup dicts are keyed by the labels of `channels`, so a dict
        # lookup answers "does this channel have a row?" without filtering the
        # whole frame for every (user, channel) pair. Channels without a row
        # keep 0, which disables the score contribution below.
        channel_messages = messages_per_channel.get(channel, 0)
        active_channel_members = active_members_per_channel.get(channel, 0)

        can_score = channel_messages > 0 and active_channel_members > 0
        if can_score:
            # Constant for the channel; only computed when the divisor is non-zero.
            size_factor = 1 + (2 / channel_member_count)
            activity_factor = 1 + (1 / active_channel_members)

        for acquaintance in members:
            # Skip every occurrence of the user themselves, so a duplicated
            # membership entry cannot make a user their own acquaintance.
            if acquaintance == userid:
                continue

            stats = user_acquaintances.setdefault(
                acquaintance, {"score": 0, "channels_shared": 0}
            )

            if can_score:
                stats["score"] += (
                    size_factor
                    * (user_channel_dict[acquaintance][channel] / channel_messages)
                    * activity_factor
                )

            stats["channels_shared"] += 1


print(f"There are {len(acquaintances)} acquaintances", end="\r")

In [None]:
# Round every accumulated score to 8 decimal places.
for user_acquaintances in acquaintances.values():
    for stats in user_acquaintances.values():
        stats["score"] = round(stats["score"], 8)

In [None]:
# Calculate user to acquaintances channel similarity:
#   (channels shared with the acquaintance) / (channels the user belongs to)
#
# The denominator is the length of the user's own channel list. Taking len()
# of the filtered `channels` column counts matching rows (one per user)
# instead, which turns the ratio into the raw shared-channel count.
channels_per_user = {
    userid: len(user_channels)
    for userid, user_channels in zip(people["userid"], people["channels"])
}

for user, user_acquaintances in acquaintances.items():
    print(user, end="\r")  # progress indicator, overwritten in place
    # Users with acquaintances belong to at least one channel, so this is
    # never zero when the inner loop has work to do.
    user_channel_count = channels_per_user[user]
    for stats in user_acquaintances.values():
        stats["channel similarity score"] = stats["channels_shared"] / user_channel_count

print("Done calculating CSS for all.", end="\r")

In [None]:
# Flatten the nested dict into one record per user. The four list columns are
# positionally aligned: entry k of each list describes the same acquaintance.
acquaintances_list = []

for user, user_acquaintances in acquaintances.items():
    acquaintance_ids = list(user_acquaintances)
    all_stats = list(user_acquaintances.values())

    record = {
        "userid": user,
        "acquaintances": acquaintance_ids,
        "channels shared": [stats["channels_shared"] for stats in all_stats],
        "channel similarity scores": [stats["channel similarity score"] for stats in all_stats],
        "acquaintance likelihood scores": [stats["score"] for stats in all_stats],
    }
    acquaintances_list.append(record)

acquaintances_df = pd.DataFrame(acquaintances_list)

In [None]:
acquaintances_df.head(3)

In [None]:
acquaintances_df.to_pickle("./data/intermediate/acquaintances.pkl")

## Normalise the acquaintance likelihood scores

In [None]:
# Normalise ALS: find the largest acquaintance likelihood score over all users.
# Rows with an empty score list are skipped; the running maximum starts at 0.
max_als = 0

for scores in acquaintances_df["acquaintance likelihood scores"]:
    if not len(scores):
        continue

    row_max = max(scores)
    if row_max > max_als:
        max_als = row_max

print(f"The max ALS score is {max_als}")

In [None]:
def normalise_ALS(row):
    """Scale a row's acquaintance likelihood scores by the global maximum.

    Every entry of ``row["acquaintance likelihood scores"]`` is divided by the
    module-level ``max_als`` and the result is stored back as a list.

    When ``max_als`` is 0 (no positive score was found) there is nothing to
    scale by, so the scores are stored back unscaled instead of dividing by
    zero.

    The row is modified in place and returned.
    """
    scores = row["acquaintance likelihood scores"]

    if max_als == 0:
        row["acquaintance likelihood scores"] = list(scores)
        return row

    row["acquaintance likelihood scores"] = [score / max_als for score in scores]
    return row


# NOTE: normalise_ALS divides by the current value of max_als and the result
# overwrites acquaintances_df, so re-running this cell without recomputing
# max_als first rescales the already-normalised scores a second time.
acquaintances_df = acquaintances_df.apply(normalise_ALS, axis=1)

In [None]:
# Recompute the maximum over the normalised scores and print it as a check.
max_norm_als = 0

for scores in acquaintances_df["acquaintance likelihood scores"]:
    if len(scores):
        max_norm_als = max(max_norm_als, max(scores))

print(f"The max norm ALS score is {max_norm_als}")

In [None]:
# NOTE(review): this writes pickle data to a path ending in ".csv"; the other
# saves of this frame use "acquaintances.pkl" — confirm which file name is
# intended before anything tries to read this file as CSV.
acquaintances_df.to_pickle("./data/intermediate/acquaintances.csv")

## Calculate the full acquaintance similarity score
Also takes the channel similarity score into account and normalises the result.

In [None]:
def calculate_acquaintance_similarity(row):
    """Combine channel similarity and acquaintance likelihood per acquaintance.

    Computes ``(1 + channel similarity score) * acquaintance likelihood score``
    element-wise over the two aligned lists of the row and stores the result,
    a NumPy array, under ``row["user similarity scores"]``.

    The row is modified in place and returned.
    """
    similarity = np.array(row["channel similarity scores"])
    likelihood = np.array(row["acquaintance likelihood scores"])

    boost = 1 + similarity
    row["user similarity scores"] = boost * likelihood
    return row


# Row-wise apply; adds the "user similarity scores" column to the frame.
acquaintances_df = acquaintances_df.apply(calculate_acquaintance_similarity, axis=1)

In [None]:
# Largest user similarity score over all users; rows with no scores are skipped
# and the running maximum starts at 0.
max_sim = 0

for scores in acquaintances_df["user similarity scores"]:
    if not len(scores):
        continue

    row_max = max(scores)
    if row_max > max_sim:
        max_sim = row_max

print(f"The max similarity score is {max_sim}")

In [None]:
def normalise_sim_score(row):
    """Scale a row's user similarity scores by the global maximum.

    Every entry of ``row["user similarity scores"]`` is divided by the
    module-level ``max_sim`` and the result is stored back as a list.

    When ``max_sim`` is 0 (no positive score was found) there is nothing to
    scale by, so the scores are stored back unscaled instead of dividing by
    zero.

    The row is modified in place and returned.
    """
    scores = row["user similarity scores"]

    if max_sim == 0:
        row["user similarity scores"] = list(scores)
        return row

    row["user similarity scores"] = [score / max_sim for score in scores]
    return row


# NOTE: normalise_sim_score divides by the current value of max_sim and the
# result overwrites acquaintances_df, so the outcome of re-running this cell
# depends on which max_sim cell was executed last.
acquaintances_df = acquaintances_df.apply(normalise_sim_score, axis=1)

In [None]:
# Recompute the maximum over the normalised similarity scores and print it as
# a check. This rebinds max_sim.
max_sim = 0

for scores in acquaintances_df["user similarity scores"]:
    if len(scores):
        max_sim = max(max_sim, max(scores))

print(f"The max similarity score is {max_sim}")

In [None]:
acquaintances_df.head(3)

In [None]:
acquaintances_df.to_pickle("./data/intermediate/acquaintances.pkl")

## User channel visibility
$S_c$ is the number of users in channel $c$ 

$V_{u_n, c} = \gamma * 2 \div S_c + (1-\gamma) * (\eta * M_{c,u_n} + (1-\eta) * MT_{c,u_n})$

## Likelihood of ego knowing its neighbours

Likelihood of $u_e$ knowing $u_n$.

$B_{u_n, u_e}$ indicates if $u_n$ and $u_e$ work in the same building  
$O_{u_n, u_e}$ indicates if $u_n$ and $u_e$ belong to the same organisational unit  
$BS_{u_e}$ is the size of the building $u_e$ belongs to  
$OS_{u_e}$ is the size of the organisational unit $u_e$ belongs to  
$C_{u_n,u_e}$ is the set of channels shared between $u_e$ and $u_n$  
$N_{u_n,u_e}$ is the number of channels shared between $u_e$ and $u_n$  
$N_{u_e}$ is the number of channels $u_e$ belongs to  

$V_{u_e, u_n} = \dfrac{1}{2} * \biggl(\epsilon * \dfrac{\sum_{c \in C_{u_n,u_e}} V_{u_n, c}}{N_{u_n,u_e}} + (1-\epsilon) * \dfrac{N_{u_n,u_e}}{N_{u_e}} + \dfrac{O_{u_n,u_e}}{OS_{u_e}} + \dfrac{B_{u_n,u_e}}{BS_{u_e}}\biggr)$

In [None]:
# Reload the inputs for the extended similarity calculation.
# NOTE(review): `people`, `teams` and `channels` are read from files
# ("people_enriched_1.6.2022.pkl", "teams_enriched.pkl",
# "channels_enriched_1.6.2022.pkl") that no visible cell writes — confirm
# where they are produced.
# `acquaintances` is rebound here: earlier cells use the name for a nested
# dict, from this point on it is the DataFrame stored in acquaintances.pkl.
people = pd.read_pickle("./data/intermediate/people_enriched_1.6.2022.pkl")
teams = pd.read_pickle("./data/intermediate/teams_enriched.pkl")
channels = pd.read_pickle("./data/intermediate/channels_enriched_1.6.2022.pkl")
acquaintances = pd.read_pickle("./data/intermediate/acquaintances.pkl")

In [None]:
acquaintances.head(3)

In [None]:
people.head(3)

In [None]:
# userid -> building and userid -> organisational unit lookups.
user_buildings = people.set_index("userid")["building"].to_dict()
user_org_units = people.set_index("userid")["organisational unit"].to_dict()

In [None]:
# Expand acquaintances with info on same buildings and same org unit
def get_same_building_and_org_unit(row):
    """Flag which acquaintances share the user's building and org unit.

    For each userid in ``row.acquaintances`` the building and organisational
    unit are looked up in the module-level ``user_buildings`` and
    ``user_org_units`` mappings and compared with those of ``row.userid``.
    The comparison results are stored, aligned with ``row.acquaintances``,
    under ``row["share building"]`` and ``row["share org unit"]``.

    The row is modified in place and returned. A userid missing from either
    mapping raises ``KeyError``.
    """
    own_building = user_buildings[row.userid]
    own_org_unit = user_org_units[row.userid]

    building_flags = []
    for other in row.acquaintances:
        building_flags.append(user_buildings[other] == own_building)
    row["share building"] = building_flags

    org_unit_flags = []
    for other in row.acquaintances:
        org_unit_flags.append(user_org_units[other] == own_org_unit)
    row["share org unit"] = org_unit_flags

    return row


# Row-wise apply; adds the "share building" and "share org unit" columns.
acquaintances = acquaintances.apply(get_same_building_and_org_unit, axis=1)

In [None]:
# Calculate the full (extended) similarity score
def get_extended_sim_score(row):
    """Blend the user similarity scores with the building / org-unit flags.

    Per acquaintance the result is
    ``round((similarity + 0.5 * shares_building + 0.5 * shares_org_unit) / 2, 2)``.
    When the row has no user similarity scores, the similarity term is left
    out and only the two flag terms are used.

    The result is stored as a plain list under
    ``row["extended user similarity scores"]``; the row is modified in place
    and returned.
    """
    building_term = 0.5 * np.array(row["share building"], dtype=bool)
    org_unit_term = 0.5 * np.array(row["share org unit"], dtype=bool)

    if len(row["user similarity scores"]) == 0:
        combined = building_term + org_unit_term
    else:
        # Same left-to-right summation as a single expression would use.
        combined = np.array(row["user similarity scores"]) + building_term + org_unit_term

    row["extended user similarity scores"] = np.round(combined / 2, 2).tolist()
    return row


# Row-wise apply; adds the "extended user similarity scores" column.
acquaintances = acquaintances.apply(get_extended_sim_score, axis=1)

In [None]:
acquaintances.head(3)

In [None]:
# Flatten every user's extended similarity scores into a single list.
all_user_similairty_values = [
    score
    for scores in acquaintances["extended user similarity scores"]
    for score in scores
]

In [None]:
max(all_user_similairty_values)

In [None]:
acquaintances.to_pickle("./data/final/acquaintances.pkl")