In [None]:
import pandas as pd
import numpy as np
import pprint
import json

from collections import Counter
from math import log

import dask
import dask.dataframe as dd
from dask.multiprocessing import get

# Pretty-printer configured with a 4-space indent.
# NOTE(review): `pp` is not referenced in the cells shown here — confirm it is still needed.
pp = pprint.PrettyPrinter(indent=4)

In [None]:
# Load the three input tables from the intermediate data directory.
# NOTE(review): unpickling can execute arbitrary code — only read pickle files
# that come from a trusted source.
people = pd.read_pickle("./data/intermediate/people.pkl")
teams = pd.read_pickle("./data/intermediate/teams.pkl")
channels = pd.read_pickle("./data/intermediate/channels_enriched.pkl")

## Channel relevance score
Calculate how relevant each channel is for each user.
The idea is to go from user -> neighbour -> channel.

In [None]:
# Preview the first three rows of the channels table.
channels.head(3)

In [None]:
# Preview the first three rows of the people table.
people.head(3)

In [None]:
# Preview the first three rows of the teams table.
teams.head(3)

In [None]:
# Number of channels listed for each team (length of the team's "channels" entry).
# Series.map works on the single column directly, so no row-wise DataFrame.apply
# is needed and an empty `teams` frame is handled as well.
teams["channel count"] = teams["channels"].map(len)

In [None]:
# Persist the teams table, now including the "channel count" column.
teams.to_pickle("./data/intermediate/teams_enriched.pkl")

## Team membership factor / Team boost score 
$NT_{t,u}$ is the number of channels of team $t$ that user $u$ belongs to  
$NT_t$ is the number of all channels in team $t$  
$max(NT)$ is the largest number of channels in any team  

$TM_{u,c} = \dfrac{1}{2} * \biggl(\dfrac{NT_{t,u}}{NT_t} + \dfrac{NT_t}{max(NT)}\biggr)$

In [None]:
# Map every channel label to the label of the team that lists it.
# The mapping is built in one pass over the teams, so earlier entries are never
# re-copied. If several teams list the same channel, the team that appears last
# in `teams` wins.
channel_team_dict = {
    channel: team_label
    for team_label, team_channels in zip(teams["label"], teams["channels"])
    for channel in team_channels
}

In [None]:
# Largest "member count" found across all channels.
max_channel_members = max(channels["member count"])

In [None]:
# Largest "channel count" found across all teams (the max(NT) term of the team boost).
max_team_channels = max(teams["channel count"])

In [None]:
# Lookup: channel label -> "member count" of that channel.
member_count_per_channel = channels.set_index("label")["member count"].to_dict()

In [None]:
def get_team_boost(row):
    """Attach team-membership scores to one user row.

    Reads ``row["userid"]`` and ``row["channels"]`` (the user's channel labels)
    plus the notebook-level lookups ``channel_team_dict``, ``teams``,
    ``max_team_channels`` and ``member_count_per_channel``.

    Two entries are written to ``row``:

    * ``"team boost"``: ``{team label: share}``, where share is the number of
      the user's channels that belong to the team divided by the team's
      "channel count", rounded to 3 decimals. Channels without a known team
      do not contribute.
    * ``"team boost per channel"``: one value per entry of ``row["channels"]``,
      in the same order:
      ``(share + team channel count / max_team_channels) / 2`` rounded to
      3 decimals, or ``0`` when the channel has no known team or has at most
      one member.

    Returns:
        The same ``row`` object, with the two entries added.
    """
    print(row["userid"], end="\r")

    user_channels = row["channels"]

    # One label -> channel count mapping per call instead of a boolean-mask scan
    # of `teams` for every team and every channel. drop_duplicates keeps the
    # first row per label, matching a positional "first match" lookup.
    channel_count_per_team = (
        teams.drop_duplicates("label").set_index("label")["channel count"].to_dict()
    )

    user_teams = [channel_team_dict.get(channel, "no team") for channel in user_channels]

    # each team should have a score from 0 to 1 depending on the percentage of
    # all channels a user belongs to from that team
    team_boost = {
        team: round(joined / channel_count_per_team[team], 3)
        for team, joined in Counter(user_teams).most_common()
        if team in channel_count_per_team
    }

    row["team boost"] = team_boost

    boost_per_channel = []
    for channel in user_channels:
        scored = (
            channel in channel_team_dict
            and channel_team_dict[channel] in team_boost
            and member_count_per_channel[channel] > 1
        )
        if scored:
            team = channel_team_dict[channel]
            # Average of the user's share of the team and the team's relative size.
            score = round(
                (team_boost[team] + channel_count_per_team[team] / max_team_channels) / 2,
                3,
            )
        else:
            score = 0
        boost_per_channel.append(score)

    row["team boost per channel"] = boost_per_channel

    return row


# Apply get_team_boost to every user row: the frame is split into 24 dask
# partitions, computed with the "processes" scheduler, and collected back into
# a pandas DataFrame that replaces `people`.
people = (
    dd.from_pandas(people, npartitions=24)
    .map_partitions(lambda partition: partition.apply(get_team_boost, axis=1))
    .compute(scheduler="processes")
)

In [None]:
# Preview the people table after the team boost columns were added.
people.head(3)

In [None]:
# Largest per-channel team boost over all users; the leading 0 is the floor
# that is reported when no boost exceeds it.
max_boost = max(
    [0, *(boost for boosts in people["team boost per channel"] for boost in boosts)]
)

print(f"Max team boost per channel is {max_boost}")

In [None]:
# Per channel: total mentions received and number of users with at least one mention
channel_mentions = {}

for i, user in people.iterrows():
    print(f"user {i}", end="\r")

    joined_channels = user["channels"]
    if len(joined_channels) == 0:
        continue

    # "channel mention counts" is paired position-by-position with "channels".
    for channel_label, received in zip(joined_channels, user["channel mention counts"]):
        if received > 0:
            stats = channel_mentions.setdefault(
                channel_label, {"mention_count": 0, "popular_users": 0}
            )
            stats["mention_count"] += received
            stats["popular_users"] += 1

In [None]:
# Per channel: total messages posted and number of users with at least one message
channel_messages = {}

for i, user in people.iterrows():
    print(f"user {i}", end="\r")

    joined_channels = user["channels"]
    if len(joined_channels) == 0:
        continue

    # "channel message counts" is paired position-by-position with "channels".
    for channel_label, posted in zip(joined_channels, user["channel message counts"]):
        if posted > 0:
            stats = channel_messages.setdefault(
                channel_label, {"message_count": 0, "active_users": 0}
            )
            stats["message_count"] += posted
            stats["active_users"] += 1

In [None]:
def enrich_channels(row):
    """Add mention and message aggregates to one channel row.

    Looks the row's ``label`` up in the notebook-level ``channel_mentions`` and
    ``channel_messages`` dicts and writes four entries to ``row``:
    "popular members", "mention count", "calc message count" and
    "calc active user count". Each entry is 0 when the label has no record in
    the corresponding dict.

    Returns:
        The same ``row`` object with the four entries set.
    """
    label = row["label"]

    # Zero-valued stand-ins for channels without any recorded mention / message.
    no_mentions = {"popular_users": 0, "mention_count": 0}
    no_messages = {"message_count": 0, "active_users": 0}

    mention_stats = channel_mentions.get(label, no_mentions)
    message_stats = channel_messages.get(label, no_messages)

    row["popular members"] = mention_stats["popular_users"]
    row["mention count"] = mention_stats["mention_count"]
    row["calc message count"] = message_stats["message_count"]
    row["calc active user count"] = message_stats["active_users"]

    return row


# Apply enrich_channels to every channel row: the frame is split into 24 dask
# partitions, computed with the "processes" scheduler, and collected back into
# a pandas DataFrame that replaces `channels`.
channels = (
    dd.from_pandas(channels, npartitions=24)
    .map_partitions(lambda partition: partition.apply(enrich_channels, axis=1))
    .compute(scheduler="processes")
)

In [None]:
# Persist the channels table with the mention / message aggregate columns.
# NOTE(review): the file name embeds a fixed date (1.6.2022) — confirm downstream
# readers expect exactly this name.
channels.to_pickle("./data/intermediate/channels_enriched_1.6.2022.pkl")

## Mention factor / Mention boost

$NMT_{u,c}$ is the number of mentions $u$ received in $c$  
$NMT_c$ is the number of all mentions in $c$  
$P_c$ is the number of users that received at least one mention in $c$  
$S_c$ is the number of all users in $c$  

$MT_{u,c} = \dfrac{1}{2} * \biggl(\dfrac{NMT_{u,c}}{NMT_{c}} + 1 - \dfrac{P_{c}}{S_{c}}\biggr)$

In [None]:
# Lookup: channel label -> total "mention count" of that channel.
mentions_per_channel = channels.set_index("label")["mention count"].to_dict()

In [None]:
# Lookup: channel label -> "popular members" (users with at least one mention).
mentioned_users_per_channel = channels.set_index("label")["popular members"].to_dict()

In [None]:
# Lookup: channel label -> "member count" of that channel.
user_per_channel = channels.set_index("label")["member count"].to_dict()

In [None]:
def get_mention_boost(row):
    """Compute the mention factor for every channel of one user row.

    For each channel in ``row["channels"]`` the factor is
    ``(user mentions / all mentions + 1 - mentioned users / all users) / 2``,
    where the per-channel totals come from the notebook-level dicts
    ``mentions_per_channel``, ``mentioned_users_per_channel`` and
    ``user_per_channel``. A ratio whose denominator is 0 counts as 0.

    The factors, rounded to 3 decimals, are stored as a list under
    ``row["channel mention factor"]`` (same order as ``row["channels"]``).

    Returns:
        The same ``row`` object with the new entry set.
    """

    def per_channel(lookup):
        # Values of `lookup` for the user's channels, in channel order.
        return np.array([lookup[label] for label in row["channels"]], dtype=float)

    def safe_ratio(numerator, denominator):
        # Element-wise division that leaves 0 wherever the denominator is 0.
        return np.divide(
            numerator,
            denominator,
            out=np.zeros_like(numerator),
            where=denominator != 0,
        )

    own_mentions = np.array(row["channel mention counts"], dtype=float)

    mention_share = safe_ratio(own_mentions, per_channel(mentions_per_channel))
    mentioned_user_share = safe_ratio(
        per_channel(mentioned_users_per_channel), per_channel(user_per_channel)
    )

    channel_mention_factor = (mention_share + 1 - mentioned_user_share) / 2

    row["channel mention factor"] = np.round(channel_mention_factor, 3).tolist()

    return row


# Run get_mention_boost on every user row; adds the "channel mention factor" column.
people = people.apply(get_mention_boost, axis=1)

In [None]:
# Preview the people table after the mention factor was added.
people.head(3)

In [None]:
# Preview the enriched channels table.
channels.head(3)

## Message factor / Message boost
$NM_{u,c}$ is the number of messages $u$ posted in $c$  
$NM_c$ is the number of all messages in $c$  
$A_c$ is the number of users that wrote at least one message in $c$  
$S_c$ is the number of all users in $c$

$M_{u,c} = \dfrac{1}{2} * \biggl(\dfrac{NM_{u,c}}{NM_{c}} + 1 - \dfrac{A_{c}}{S_{c}}\biggr)$

In [None]:
# Lookup: channel label -> "calc message count" (messages summed over users).
messages_per_channel = channels.set_index("label")["calc message count"].to_dict()

In [None]:
# Lookup: channel label -> "calc active user count" (users with at least one message).
active_users_per_channel = channels.set_index("label")["calc active user count"].to_dict()

In [None]:
def get_message_boost(row):
    """Compute the message factor for every channel of one user row.

    For each channel in ``row["channels"]`` the factor is
    ``(user messages / all messages + 1 - active users / all users) / 2``,
    where the per-channel totals come from the notebook-level dicts
    ``messages_per_channel``, ``active_users_per_channel`` and
    ``user_per_channel``. A ratio whose denominator is 0 counts as 0.

    The factors, rounded to 3 decimals, are stored as a list under
    ``row["channel message factor"]`` (same order as ``row["channels"]``).

    Returns:
        The same ``row`` object with the new entry set.
    """
    channel_labels = row["channels"]

    # One float array per quantity, aligned with the order of the user's channels.
    posted = np.array(row["channel message counts"], dtype=float)
    totals, writers, members = (
        np.array([table[label] for label in channel_labels], dtype=float)
        for table in (messages_per_channel, active_users_per_channel, user_per_channel)
    )

    # Entries whose denominator is 0 keep the 0 pre-filled in `out`.
    share_of_messages = np.divide(
        posted, totals, out=np.zeros_like(posted), where=totals != 0
    )
    share_of_writers = np.divide(
        writers, members, out=np.zeros_like(writers), where=members != 0
    )

    channel_message_factor = (share_of_messages + 1 - share_of_writers) / 2

    row["channel message factor"] = np.round(channel_message_factor, 3).tolist()
    return row


# Run get_message_boost on every user row; adds the "channel message factor" column.
people = people.apply(get_message_boost, axis=1)

In [None]:
# Flatten every user's message factors into one list and show the largest one.
factors = [
    factor
    for user_factors in people["channel message factor"]
    for factor in user_factors
]

max(factors)

In [None]:
# Preview the people table after the message factor was added.
people.head(3)

In [None]:
# Persist the people table with the team boost, mention factor and message factor columns.
# NOTE(review): the file name embeds a fixed date (1.6.2022) — confirm downstream
# readers expect exactly this name.
people.to_pickle("./data/intermediate/people_enriched_1.6.2022.pkl")

## Channel relevance score

$C_{u,c} = \alpha * TM_{u,c} + (1-\alpha) * (\beta * M_{u,c} + (1-\beta) * MT_{u,c})$

In [None]:
def calculate_cr(row, j, k):
    """Blend the three per-channel scores of one user into a relevance score.

    Computes, element-wise over the user's channels,
    ``j * team boost + (1 - j) * (k * mention factor + (1 - k) * message factor)``
    from ``row["team boost per channel"]``, ``row["channel mention factor"]``
    and ``row["channel message factor"]``.

    NOTE(review): here ``k`` weights the *mention* factor, whereas the formula
    in the notebook's markdown puts beta on the *message* factor — confirm
    which of the two is intended.

    Args:
        row: Mapping / Series holding the three per-channel score lists.
        j: Weight of the team boost; ``1 - j`` goes to the combined activity term.
        k: Weight of the mention factor inside the activity term; ``1 - k``
            goes to the message factor.

    Returns:
        list: One relevance value per channel, rounded to 3 decimals.
    """
    team_part = np.array(row["team boost per channel"])
    mention_part = np.array(row["channel mention factor"])
    message_part = np.array(row["channel message factor"])

    activity_part = k * mention_part + (1 - k) * message_part
    relevance = j * team_part + (1 - j) * activity_part

    return np.round(relevance, 3).tolist()

In [None]:
def combine_mention_message(row):
    """Add one relevance list per (j, k) weight combination to a user row.

    For every pair of weights taken from ``[0, 0.25, 0.5, 0.75, 1]`` the result
    of ``calculate_cr(row, j, k)`` is stored under
    ``row["channel relevance {j} {k}"]`` — 25 entries in total.

    Returns:
        The same ``row`` object with the 25 entries set.
    """
    weights = [0, 0.25, 0.5, 0.75, 1]
    weight_pairs = [(j, k) for j in weights for k in weights]

    for j, k in weight_pairs:
        row[f"channel relevance {j} {k}"] = calculate_cr(row, j, k)

    return row


# Run combine_mention_message on every user row; adds the "channel relevance {j} {k}" columns.
people = people.apply(combine_mention_message, axis=1)

In [None]:
# Preview the people table after the relevance columns were added.
people.head(3)

## Normalise all values

In [None]:
# Running maximum per relevance column, starting at 0 for every (j, k) weight pair.
max_values = {
    f"channel relevance {j} {k}": 0
    for j in [0, 0.25, 0.5, 0.75, 1]
    for k in [0, 0.25, 0.5, 0.75, 1]
}

In [None]:
# Raise each column's running maximum to the largest relevance value seen in any user row.
relevance_weights = [0, 0.25, 0.5, 0.75, 1]
relevance_columns = [
    f"channel relevance {j} {k}" for j in relevance_weights for k in relevance_weights
]

for _, person in people.iterrows():
    for column in relevance_columns:
        for value in person[column]:
            if value > max_values[column]:
                max_values[column] = value

In [None]:
# Print the maximum of every relevance column, in (j, k) order.
for column in (
    f"channel relevance {j} {k}"
    for j in [0, 0.25, 0.5, 0.75, 1]
    for k in [0, 0.25, 0.5, 0.75, 1]
):
    print(max_values[column])

In [None]:
# Preview the people table before it is written to the final data directory.
people.head(3)

In [None]:
# Persist the people table to the final data directory.
# NOTE(review): the values are written as computed — no cell shown here rescales
# them by the maxima collected above; confirm that is intended.
people.to_pickle("./data/final/people.pkl")

In [None]:
# List the column names of the final people table.
people.columns.to_list()

In [None]:
# One record per (user, channel) score taken from the "channel relevance 1 1"
# column, followed by a histogram of those scores.
factor_df = [
    {"user": user_id, "score": score}
    for user_id, scores in zip(people["userid"], people["channel relevance 1 1"])
    for score in scores
]

pd.DataFrame(factor_df).hist(column="score")