## Read First
🪧 This notebook is the same as `2_feature_engineering_users`, but it calculates the acquaintance scores by only considering channels that have more than 5 users and fewer than 75% of the max users.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

import pprint
import json

import dask
import dask.dataframe as dd
from dask.multiprocessing import get
from collections import Counter

import matplotlib.pyplot as plt

import networkx as nx

# Shared pretty-printer instance configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

In [None]:
# Load the pickled people, teams and channels frames from the local data directory.
# NOTE: unpickling can execute arbitrary code, so these files must come from a trusted source.
people = pd.read_pickle("./data/final/people.pkl")
teams = pd.read_pickle("./data/intermediate/teams.pkl")
channels = pd.read_pickle("./data/intermediate/channels_enriched_1.6.2022.pkl")

In [None]:
# Preview the first three rows of `people`.
people.head(3)

In [None]:
# Preview the first three rows of `teams`.
teams.head(3)

In [None]:
# Preview the first three rows of `channels`.
channels.head(3)

## Acquaintance analysis 

## User channel visibility
$S_c$ is the number of users in channel $c$ 

$V_{u_n, c} = \gamma * 2 \div S_c + (1-\gamma) * (\eta * M_{c,u_n} + (1-\eta) * MT_{c,u_n})$

## Likelihood of ego knowing its neighbours

$V_{u_e, u_n}$ is the likelihood of $u_e$ knowing $u_n$.

$B_{u_n, u_e}$ indicates if $u_n$ and $u_e$ work in the same building  
$O_{u_n, u_e}$ indicates if $u_n$ and $u_e$ belong to the same organisational unit  
$BS_{u_e}$ is the size of building $u_e$ belongs to  
$OS_{u_e}$ is the size of the organisational unit $u_e$ belongs to  
$C_{u_n,u_e}$ is the set of channels shared between $u_e$ and $u_n$  
$N_{u_n,u_e}$ is the number of channels shared between $u_e$ and $u_n$  
$N_{u_e}$ is the number of channels $u_e$ belongs to  

$V_{u_e, u_n} = \dfrac{1}{2} * \biggl(\epsilon * \dfrac{\sum_{c \in C_{u_n,u_e}} V_{u_n, c}}{N_{u_n,u_e}} + (1-\epsilon) * \dfrac{N_{u_n,u_e}}{N_{u_e}} + \dfrac{O_{u_n,u_e}}{OS_{u_e}} + \dfrac{B_{u_n,u_e}}{BS_{u_e}}\biggr)$

In [None]:
# For every user, map each channel in the "channels" column to the entry at the
# same position of the "channel message counts" column.
user_channel_dict = {
    person["userid"]: dict(zip(person["channels"], list(person["channel message counts"])))
    for _index, person in people.iterrows()
}

In [None]:
# Index the channel table by label once, then derive one label -> value
# lookup dict per column of interest.
channels_by_label = channels.set_index("label")

messages_per_channel = channels_by_label["calc message count"].to_dict()

active_members_per_channel = channels_by_label["active members"].to_dict()

channel_members = channels_by_label["channel members"].to_dict()

channel_member_counts = channels_by_label["member count"].to_dict()

In [None]:
# Per-user lookup tables: userid -> {channel -> factor}. The factor lists in
# `people` are read position-by-position against the user's channel list.
mention_factors = {}
message_factors = {}

for _index, member in people.iterrows():
    member_mentions = {}
    member_messages = {}

    for position, channel in enumerate(member.channels):
        member_mentions[channel] = member["channel mention factor"][position]
        member_messages[channel] = member["channel message factor"][position]

    mention_factors[member.userid] = member_mentions
    message_factors[member.userid] = member_messages

In [None]:
# Channels with more members than this are skipped when scoring.
# NOTE(review): the notebook intro describes the filter as "more than 5 and less than
# 75% of the max users", but only this fixed upper bound is applied here -- confirm.
MAX_CHANNEL_MEMBERS = 500

# Weight grid used for both k (mention factor vs. message factor) and
# j (channel-size term vs. activity term).
SCORE_WEIGHTS = [0, 0.5, 1]

# acquaintances[ego][neighbour] holds "channels_shared" plus one "score {k} {j}"
# entry per weight combination, each summed over all channels the two users share.
acquaintances = {}

for i, ego in people.iterrows():
    ego_acquaintances = acquaintances[ego.userid] = {}
    print(ego["userid"], end="\r")

    # Iterate over all channels of the ego
    for channel in ego["channels"]:
        channel_member_count = channel_member_counts[channel]
        if channel_member_count > MAX_CHANNEL_MEMBERS:
            continue

        # Everyone in the channel except the ego. Filtering (instead of list.remove)
        # does not raise when the ego is missing from the member list and also
        # drops duplicate occurrences of the ego.
        neighbours = [member for member in channel_members[channel] if member != ego.userid]

        # Iterate over neighbours
        for acquaintance in neighbours:
            entry = ego_acquaintances.get(acquaintance)
            if entry is None:
                entry = {"channels_shared": 0}
                for k in SCORE_WEIGHTS:
                    for j in SCORE_WEIGHTS:
                        entry[f"score {k} {j}"] = 0
                ego_acquaintances[acquaintance] = entry

            mention_factor = mention_factors[acquaintance][channel]
            message_factor = message_factors[acquaintance][channel]

            for k in SCORE_WEIGHTS:
                for j in SCORE_WEIGHTS:
                    # Accumulate the per-channel visibility: the later "sim score simple"
                    # step divides this total by "channels_shared" to obtain the mean, so
                    # overwriting it would keep only the last shared channel.
                    entry[f"score {k} {j}"] += j * 2 / channel_member_count + (1 - j) * (
                        k * mention_factor + (1 - k) * message_factor
                    )

            entry["channels_shared"] += 1


# The reported number is the count of egos (keys of `acquaintances`).
print(f"There are {len(acquaintances.keys())} acquaintances", end="\r")

In [None]:
# Sanity check: show the first acquaintance key recorded for "user_0".
list(acquaintances["user_0"].keys())[0]

In [None]:
# Sanity check: is "user_0" among the acquaintance keys of "user_52"?
"user_0" in acquaintances["user_52"]

In [None]:
# Calculate user to acquaintances channel similarity
for user in acquaintances.keys():
    print(user, end="\r")
    user_channel_count = len(people[people["userid"] == user].channels.values)
    for acquaintance in acquaintances[user]:
        acquaintances[user][acquaintance]["channel similarity score"] = (
            acquaintances[user][acquaintance]["channels_shared"] / user_channel_count
        )

print(f"Done calculating CSS for all.", end="\r")

In [None]:
# Blend, for every (k, j) channel score, the score divided by the number of shared
# channels with the channel similarity score, using weight l for the former.
weights = [0, 0.5, 1]

for user, user_entries in acquaintances.items():
    print(user, end="\r")
    for entry in user_entries.values():
        shared = entry["channels_shared"]
        similarity = entry["channel similarity score"]
        for k in weights:
            for j in weights:
                channel_score = entry[f"score {k} {j}"]
                for l in weights:
                    entry[f"sim score simple {k} {j} {l}"] = (
                        l * channel_score / shared + (1 - l) * similarity
                    )

In [None]:
# userid -> building and userid -> organisational unit lookup dicts.
people_by_userid = people.set_index("userid")
user_buildings = people_by_userid["building"].to_dict()
user_org_units = people_by_userid["organisational unit"].to_dict()

In [None]:
# Flag (1/0) whether each acquaintance shares the user's building and org unit.
for user, user_entries in acquaintances.items():
    print(user, end="\r")
    own_building = user_buildings[user]
    own_org_unit = user_org_units[user]
    for acquaintance, entry in user_entries.items():
        shares_building = own_building == user_buildings[acquaintance]
        shares_org_unit = own_org_unit == user_org_units[acquaintance]
        entry["same building"] = 1 if shares_building else 0
        entry["same org unit"] = 1 if shares_org_unit else 0

In [None]:
# Get org and building size
for user in acquaintances.keys():
    print(user, end="\r")
    acquaintances[user]["building size"] = people[people.userid == user][
        "building employee count"
    ].values[0]
    acquaintances[user]["org unit size"] = people[people.userid == user][
        "organisational unit employee count"
    ].values[0]

In [None]:
# Extend every simple similarity score with building / org-unit proximity:
# extended = 0.5 * simple + 0.5 * (0.5 * building_factor + 0.5 * org_unit_factor),
# where a factor is 2 / size when the pair shares that unit and its size is >= 2, else 0.
weights = [0, 0.5, 1]

for user, user_entries in acquaintances.items():
    print(user, end="\r")
    for entry in user_entries.values():
        # The user's dict also holds the non-dict "building size" / "org unit size" values.
        if type(entry) is not dict:
            continue

        building_factor = 0
        if entry["same building"] == 1 and user_entries["building size"] >= 2:
            building_factor = 2 / user_entries["building size"]

        org_unit_factor = 0
        if entry["same org unit"] == 1 and user_entries["org unit size"] >= 2:
            org_unit_factor = 2 / user_entries["org unit size"]

        # Identical for every (k, j, l) combination, so compute it once per pair.
        proximity_term = 0.5 * (0.5 * building_factor + 0.5 * org_unit_factor)

        for k in weights:
            for j in weights:
                for l in weights:
                    entry[f"sim score extended {k} {j} {l}"] = (
                        0.5 * entry[f"sim score simple {k} {j} {l}"] + proximity_term
                    )

In [None]:
# Flatten the nested dict into one record per user: parallel lists of acquaintance
# ids, shared-channel counts and rounded extended scores (one list per k/j/l combination).
size_keys = ("building size", "org unit size")
weights = [0, 0.5, 1]

acquaintances_list = []

for user, user_entries in acquaintances.items():
    print(user, end="\r")

    # Keep only real acquaintance ids, not the per-user size entries.
    user_acquaintances = [key for key in user_entries if key not in size_keys]

    user_dict = {"userid": user, "acquaintances": user_acquaintances}
    user_dict["channels shared"] = [
        user_entries[acquaintance]["channels_shared"] for acquaintance in user_acquaintances
    ]

    for k in weights:
        for j in weights:
            for l in weights:
                column = f"sim score extended {k} {j} {l}"
                user_dict[column] = [
                    round(user_entries[acquaintance][column], 3)
                    for acquaintance in user_acquaintances
                ]

    acquaintances_list.append(user_dict)



In [None]:
# Build a DataFrame with one row per user record collected in `acquaintances_list`.
acquaintances_df = pd.DataFrame(acquaintances_list)

In [None]:
# Preview the first ten rows of the result.
acquaintances_df.head(10)

In [None]:
# Persist the acquaintance table as a pickle file.
acquaintances_df.to_pickle("./data/final/acquaintances_cf.pkl")