In [None]:
import pandas as pd
import numpy as np

from cdhf import Data

In [None]:
# NOTE(review): `test` is not referenced anywhere else in the visible notebook, and this
# reads from ./data/final/ while the cells below write to ./data/intermediate/ — looks
# like a leftover debugging cell; confirm before removing.
# Security: pd.read_pickle can execute arbitrary code, so only load trusted files.
test = pd.read_pickle("./data/final/people.pkl")

In [None]:
# Open the raw Mattermost export through the cdhf `Data` helper.
# NOTE(review): presumably load_all() populates the users / team_members /
# channel_members / teams / channels attributes read by the cells below — confirm
# against the cdhf documentation.
data = Data("data/raw/mmdata.json")
data.load_all()

## Extract users to a separate dataframe

In [None]:
# One record per Mattermost user. Falsy building / unit values are replaced by the
# placeholder strings "no building" / "no unit" that later cells test against.
user_records = []
for userid in data.users.keys():
    user = data.users[userid]
    user_records.append(
        {
            "userid": userid,
            "building": user.building or "no building",
            "organisational unit": user.org_unit or "no unit",
        }
    )

mattermost = pd.DataFrame(user_records)

In [None]:
# Preview the first three rows of the freshly built user table.
mattermost.head(3)

## Extract teams for each user

In [None]:
# Give every known user empty membership lists up front, so users without any team
# still have an entry.
user_teams = {}
for userid in mattermost.userid.to_list():
    user_teams[userid] = {"teams": [], "leaving times": []}

# The two lists are filled in lock-step: entry k of "leaving times" belongs to
# entry k of "teams".
for membership in data.team_members:
    record = user_teams[membership.user_id]
    record["teams"].append(membership.team_id)
    # A falsy delete_at is stored as 0.
    record["leaving times"].append(membership.delete_at or 0)

In [None]:
def add_team_info(row):
    """Attach the team ids and team leaving times from ``user_teams`` to a user row.

    The row is looked up by its ``userid`` and returned with two extra entries,
    "teams" and "team leaving times".
    """
    membership = user_teams[row.userid]
    row["teams"] = membership["teams"]
    row["team leaving times"] = membership["leaving times"]
    return row


# Row-wise apply: every user row gains the two team columns.
mattermost = mattermost.apply(add_team_info, axis=1)

In [None]:
# Preview the table now that the "teams" and "team leaving times" columns exist.
mattermost.head(3)

## Extract channels for each user

In [None]:
# Give every known user empty per-channel lists up front.
user_channels = {}
for userid in mattermost.userid.to_list():
    user_channels[userid] = {"channels": [], "mention counts": [], "message counts": []}

# The three lists are filled in lock-step, one position per channel membership.
for membership in data.channel_members:
    record = user_channels[membership.user_id]
    record["channels"].append(membership.channel_id)
    record["mention counts"].append(membership.mention_count)
    record["message counts"].append(membership.msg_count)

In [None]:
def add_channel_info(row):
    """Attach the channel ids and per-channel counts from ``user_channels`` to a user row.

    The row is looked up by its ``userid`` and returned with three extra entries:
    "channels", "channel mention counts" and "channel message counts".
    """
    membership = user_channels[row.userid]
    row["channels"] = membership["channels"]
    row["channel mention counts"] = membership["mention counts"]
    row["channel message counts"] = membership["message counts"]
    return row


# Row-wise apply: every user row gains the three channel columns.
mattermost = mattermost.apply(add_channel_info, axis=1)

In [None]:
# Preview the table now that the channel columns exist.
mattermost.head(3)

## Extract building user counts

Some employees who left CERN were never removed from the CERN system, so some buildings have more employees assigned to them than would realistically work there.

In [None]:
# Number of members recorded for each building in the export.
building_user_count = {
    building: len(data.building_members[building])
    for building in data.building_members.keys()
}

In [None]:
def add_building_employee_count(row):
    """Add "building employee count" to a user row.

    Rows carrying the "no building" placeholder get 0; all others get the member
    count stored in ``building_user_count`` for their building.
    """
    if row.building == "no building":
        headcount = 0
    else:
        headcount = building_user_count[row.building]

    row["building employee count"] = headcount
    return row


# Row-wise apply: every user row gains the building head count.
mattermost = mattermost.apply(add_building_employee_count, axis=1)

In [None]:
# Preview the table now that "building employee count" exists.
mattermost.head(3)

## Extract organisational unit user count

In [None]:
# Number of members recorded for each organisational unit in the export.
organisational_unit_user_count = {
    unit: len(data.org_unit_members[unit])
    for unit in data.org_unit_members.keys()
}

In [None]:
def add_organisational_unit_employee_count(row):
    """Add "organisational unit employee count" to a user row.

    Rows carrying the "no unit" placeholder get 0; all others get the member count
    stored in ``organisational_unit_user_count`` for their unit.
    """
    unit = row["organisational unit"]

    if unit == "no unit":
        headcount = 0
    else:
        headcount = organisational_unit_user_count[unit]

    row["organisational unit employee count"] = headcount
    return row


# Row-wise apply: every user row gains the unit head count.
mattermost = mattermost.apply(add_organisational_unit_employee_count, axis=1)

In [None]:
# Preview the table now that "organisational unit employee count" exists.
mattermost.head(3)

## Extract employee type (internal or external)

In [None]:
def extract_employee_type(row):
    """Label a user row as "internal" or "external" in a new "employee type" entry.

    A row counts as external when it carries the "no building" placeholder or the
    "no unit" placeholder; otherwise it is internal.
    """
    is_external = row.building == "no building" or row["organisational unit"] == "no unit"
    row["employee type"] = "external" if is_external else "internal"
    return row


# Run extract_employee_type on every row (axis=1) to add the "employee type" column.
mattermost = mattermost.apply(extract_employee_type, axis=1)

In [None]:
# Preview the table now that "employee type" exists.
mattermost.head(3)

## Change ids format 

Change user ids to format `user_{num}`  
Change building ids to format `building_{num}`  
Change organisational unit ids to format `unit_{num}`  
Change team ids to format `team_{num}`  
Change channel ids to format `channel_{num}`

In [None]:
def transform_userid(row):
    """Rewrite the row's ``userid`` value as the string ``user_{value}`` and return the row."""
    current = row["userid"]
    row["userid"] = f"user_{current}"
    return row


def get_id_dictionary(row):
    """Return the pair ``[original userid, "user_{index}"]`` for one row.

    The row must provide both a "userid" and an "index" entry.
    """
    original_id = row["userid"]
    relabelled_id = f"user_{row['index']}"
    return [original_id, relabelled_id]


# Turn the row index into an "index" column; its values become the numeric part of
# the new user ids.
# NOTE(review): this cell is not idempotent — on a second run the "original id" side
# of the pairs below would already be a user_{n} label. Run it once per kernel.
mattermost = mattermost.reset_index()
# One [original id, "user_{n}"] pair per row (see get_id_dictionary).
user_id_dictionary = mattermost.apply(get_id_dictionary, axis=1)
# Discard the original ids and let the former index take over the "userid" name.
mattermost = mattermost.drop(columns=["userid"]).rename(columns={"index": "userid"})
# Prefix the numeric ids so they read "user_{n}".
mattermost = mattermost.apply(transform_userid, axis=1)

In [None]:
# Reverse lookup: "user_{n}" label (element[1]) -> original user id (element[0]).
user_id_dict = {element[1]: element[0] for element in user_id_dictionary.to_list()}

In [None]:
# Preview the table after the user ids were rewritten to the user_{n} format.
mattermost.head(3)

In [None]:
# Map every distinct building value to a positional "building_{i}" label. The
# "no building" placeholder maps to itself but still consumes an enumerate index,
# so the numbering can contain a gap.
building_string_ids = {
    building: "no building" if building == "no building" else f"building_{i}"
    for i, building in enumerate(mattermost.building.unique())
}


def transform_building(row):
    """Swap the row's building value for its label from ``building_string_ids``."""
    row["building"] = building_string_ids[row["building"]]
    return row


# Row-wise apply: relabel the building of every user.
mattermost = mattermost.apply(transform_building, axis=1)

In [None]:
# Preview the table after the buildings were relabelled.
mattermost.head(3)

In [None]:
# Map every distinct unit value to a positional "unit_{i}" label. The "no unit"
# placeholder maps to itself but still consumes an enumerate index, so the
# numbering can contain a gap.
organisational_unit_string_ids = {
    unit: f"unit_{i}" if unit != "no unit" else "no unit"
    for i, unit in enumerate(mattermost["organisational unit"].unique())
}


def transform_organisational_unit(row):
    """Swap the row's organisational unit for its label from ``organisational_unit_string_ids``."""
    original_unit = row["organisational unit"]
    row["organisational unit"] = organisational_unit_string_ids[original_unit]
    return row


# Row-wise apply: relabel the organisational unit of every user.
mattermost = mattermost.apply(transform_organisational_unit, axis=1)

In [None]:
# Preview the table after the organisational units were relabelled.
mattermost.head(3)

In [None]:
# Per-team metadata keyed by the original team id:
#   "label"        - positional "team_{i}" label,
#   "channels"     - the team's channel objects (relabelled by a later cell),
#   "restrictions" - human-readable restriction flags.
team_details = {}

for i, team in enumerate(data.teams):
    # Flags are tested by truthiness rather than with `== True` (PEP 8).
    # NOTE(review): assumes invite_only / email_domain_restricted are booleans — confirm.
    restrictions = []
    if team.invite_only:
        restrictions.append("Invite only")
    if team.email_domain_restricted:
        restrictions.append("Email domain restricted")

    team_details[team.team_id] = {
        "label": f"team_{i}",
        "channels": team.channels,
        "restrictions": restrictions,
    }

In [None]:
# Per-channel metadata keyed by the original channel id. Every channel listed in
# data.channels is recorded with channel type "public".
channel_details = {}

# Number of channels labelled so far; doubles as the next free label number, which
# the private-channel cell further down continues from.
channel_id_counter = 0

for channel in data.channels:
    channel_details[channel.channel_id] = {
        "label": f"channel_{channel_id_counter}",
        "creator": channel.creator_id,
        "message count": channel.total_msg_count,
        "post count": channel.post_count,
        "reaction count": channel.reactions_count,
        "member count": len(channel.channel_members),
        "channel type": "public",
    }
    channel_id_counter += 1

print(f"There are {channel_id_counter} channels")

In [None]:
# Replace each team's channel objects by the matching "channel_{n}" labels.
# NOTE(review): not idempotent — after one run "channels" holds label strings, which
# have no channel_id attribute.
for details in team_details.values():
    relabelled = []
    for channel in details["channels"]:
        relabelled.append(channel_details[channel.channel_id]["label"])
    details["channels"] = relabelled

In [None]:
# Forward lookup: original user id -> "user_{n}" label, built by inverting the
# reverse lookup `user_id_dict`. Deriving it from `user_id_dict` instead of iterating
# `user_id_dictionary` itself keeps this cell safe to re-run: once the name is
# rebound to a dict, iterating it would yield the key strings and index into their
# characters, silently corrupting the mapping.
user_id_dictionary = {old_id: new_id for new_id, old_id in user_id_dict.items()}

In [None]:
# Replace each channel's creator id by the creator's "user_{n}" label; creators that
# are not in the user table become "unknown creator".
# The original creator id is read before anything is overwritten — writing the
# placeholder first would make the lookup below always miss.
known_user_labels = set(user_id_dictionary.values())

for details in channel_details.values():
    creator = details["creator"]

    if creator in user_id_dictionary:
        details["creator"] = user_id_dictionary[creator]
    elif creator not in known_user_labels:
        # Values that already are a user_{n} label are left alone so that re-running
        # this cell does not wipe creators resolved on an earlier run.
        details["creator"] = "unknown creator"

In [None]:
def change_team_ids(row):
    """Replace every original team id in the row's "teams" list by its label from ``team_details``."""
    labels = []
    for team in row["teams"]:
        labels.append(team_details[team]["label"])

    row["teams"] = labels
    return row


# Row-wise apply: relabel the team lists of every user.
mattermost = mattermost.apply(change_team_ids, axis=1)

In [None]:
# Preview the table after the team ids were relabelled.
mattermost.head(3)

## Get private channels and get members of private channels

In [None]:
# Every channel id that appears in at least one user's membership list,
# de-duplicated and sorted by np.unique. The "channels" column is read directly
# instead of through a row-wise apply, which also behaves correctly on an empty frame.
all_channels = np.unique(
    np.array(
        [
            channel
            for member_channels in mattermost["channels"]
            for channel in member_channels
        ]
    )
).tolist()

# Channels known from data.channels are the public ones; any channel a user belongs
# to that is not among them is treated as private. Counting the private channels
# explicitly (rather than subtracting the two totals) stays correct when a public
# channel has no member in the user table.
public_channel_ids = {
    channel_id
    for channel_id, details in channel_details.items()
    if details["channel type"] == "public"
}
private_channel_count = sum(1 for channel in all_channels if channel not in public_channel_ids)
total_channel_count = len(public_channel_ids) + private_channel_count

print(
    f"There are {total_channel_count} channels in mattermost from which {len(public_channel_ids)} are public and {private_channel_count} are private."
)

In [None]:
# Register every channel that users belong to but that is missing from
# `channel_details` as a private channel. Nothing but membership is known about
# these channels, so all counts start at 0 and the creator is unknown.
# `all_channels` (the sorted, de-duplicated list built in the previous cell) is
# reused instead of flattening the membership lists a second time.
for channel in all_channels:
    if channel not in channel_details:
        channel_details[channel] = {
            # Labels continue the numbering used for the public channels.
            "label": f"channel_{channel_id_counter}",
            "creator": "unknown creator",
            "message count": 0,
            "post count": 0,
            "reaction count": 0,
            "member count": 0,
            "channel type": "private",
        }
        channel_id_counter = channel_id_counter + 1

## Count members of private channels

In [None]:
# Count the members of every private channel from the users' membership lists.
# The counts are reset first so that re-running this cell does not double them.
for details in channel_details.values():
    if details["channel type"] == "private":
        details["member count"] = 0

for member_channels in mattermost["channels"]:
    for channel in member_channels:
        details = channel_details[channel]
        if details["channel type"] == "private":
            details["member count"] += 1

## Modify channel ids 

In [None]:
def change_channel_ids(row):
    """Replace every original channel id in the row's "channels" list by its label from ``channel_details``."""
    labels = []
    for channel in row["channels"]:
        labels.append(channel_details[channel]["label"])

    row["channels"] = labels
    return row


# Row-wise apply: relabel the channel lists of every user.
mattermost = mattermost.apply(change_channel_ids, axis=1)

In [None]:
# Preview the table after the channel ids were relabelled.
mattermost.head(3)

## Save to files

In [None]:
# Write the transformed user table to the intermediate data directory.
mattermost.to_pickle("./data/intermediate/people.pkl")

In [None]:
# One record per channel (public and private); the original channel ids, which are
# the dict keys, are not part of the saved table.
channel_list = list(channel_details.values())
channels_frame = pd.DataFrame(channel_list)
channels_frame.to_pickle("./data/intermediate/channels.pkl")

In [None]:
# One record per team; the original team ids, which are the dict keys, are not part
# of the saved table.
team_list = list(team_details.values())
teams_frame = pd.DataFrame(team_list)
teams_frame.to_pickle("./data/intermediate/teams.pkl")

## Exporting ID dicts
These mappings are needed later to translate the new IDs back to the original ones.

In [None]:
# Two-column table: "user_{n}" label -> original user id.
user_id_rows = []
for new_id, old_id in user_id_dict.items():
    user_id_rows.append({"new_id": new_id, "old_id": old_id})

pd.DataFrame(user_id_rows).to_pickle("./data/intermediate/user_ids.pkl")

In [None]:
# Two-column table: "channel_{n}" label -> original channel id.
# Only channels of type "public" are exported; private channels are left out.
public_channel_id_rows = [
    {"new_id": details["label"], "old_id": channel_id}
    for channel_id, details in channel_details.items()
    if details["channel type"] == "public"
]

pd.DataFrame(public_channel_id_rows).to_pickle("./data/intermediate/channel_ids.pkl")

In [None]:
# Final summary of what was exported. The building / unit totals count distinct
# column values, so the "no building" / "no unit" placeholders are included.
channel_total = len(channel_list)
team_total = len(team_list)
user_total = len(mattermost.userid.to_list())
building_total = len(mattermost.building.unique())
unit_total = len(mattermost["organisational unit"].unique())

print(f"There are {channel_total} channels.")
print(f"There are {team_total} teams.")
print(f"There are {user_total} users")
print(f"There are {building_total} buildings")
print(f"There are {unit_total} organisational units")