In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.multiprocessing import get

In [None]:
# Load the acquaintances and people tables from the final data directory.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
acquaintances = pd.read_pickle("./data/final/acquaintances_cf.pkl")
people = pd.read_pickle("./data/final/people.pkl")

In [None]:
# Load the id lookup tables. Both are read below via their `old_id` and
# `new_id` columns to build the new-id -> old-id dictionaries.
channel_ids = pd.read_pickle("./data/intermediate/channel_ids.pkl")
user_ids = pd.read_pickle("./data/intermediate/user_ids.pkl")

In [None]:
# Show the first row of `people` to inspect its columns.
people.head(1)

In [None]:
# Show the first row of `acquaintances` to inspect its columns.
acquaintances.head(1)

In [None]:
# Show the first row of the user id lookup table.
user_ids.head(1)

In [None]:
# Show the first row of the channel id lookup table.
channel_ids.head(1)

#### Generate id dicts

In [None]:
# Lookup dictionary: `new_id` -> `old_id` for users.
user_ids_dict = user_ids.set_index("new_id")["old_id"].to_dict()

In [None]:
# Lookup dictionary: `new_id` -> `old_id` for channels.
channel_ids_dict = channel_ids.set_index("new_id")["old_id"].to_dict()

#### Convert user ids to original ids

In [None]:
def convert_userids(row):
    """Replace a row's ``userid`` with the id it maps to in ``user_ids_dict``.

    The current ``row["userid"]`` is used as the key into the module-level
    ``user_ids_dict``; the looked-up value is written back into the same
    field. The row is modified in place and also returned, which makes the
    function usable with ``DataFrame.apply(..., axis=1)``.

    Raises:
        KeyError: if ``row["userid"]`` is not a key of ``user_ids_dict``.
    """
    original_id = user_ids_dict[row["userid"]]
    row["userid"] = original_id
    return row


# Run the user-id conversion on every row of both tables.
# NOTE(review): this cell overwrites `people` and `acquaintances`, so running it
# a second time would look up the already-converted ids in `user_ids_dict`
# again — TODO confirm it is only executed once per kernel session.
people = people.apply(convert_userids, axis=1)
acquaintances = acquaintances.apply(convert_userids, axis=1)

In [None]:
# Check the first row of `people` after the user-id conversion.
people.head(1)

In [None]:
# Check the first row of `acquaintances` after the user-id conversion.
acquaintances.head(1)

In [None]:
# List the column names of `people`.
people.columns

In [None]:
def get_masked_array(input_array, mask):
    """Return the entries of ``input_array`` whose ``mask`` entry is True.

    Args:
        input_array: sequence of values to filter.
        mask: NumPy boolean array; True marks the entries to keep. It is
            inverted with ``~`` because ``numpy.ma`` treats True as "hidden".

    Returns:
        A plain Python list with the kept entries, in their original order.
    """
    hidden = ~mask
    masked = np.ma.MaskedArray(input_array, mask=hidden)
    return masked.compressed().tolist()


def convert_channelids(row):
    """Translate a row's channel ids and drop channels that have no mapping.

    Channels that are not keys of the module-level ``channel_ids_dict`` are
    removed; the remaining ones are replaced by their mapped ids. The same
    keep/drop mask is applied to each of the nine
    ``"channel relevance {j} {k}"`` fields so they stay aligned with
    ``row["channels"]``. Rows with no channels are returned untouched.

    The row's ``userid`` is printed (carriage-return terminated) as a progress
    indicator. The row is modified in place and returned.
    """
    # Progress indicator: keeps overwriting the same console line.
    print(row["userid"], end="\r")

    old_channels = row["channels"]
    if len(old_channels) == 0:
        return row

    # True where the channel has an entry in the lookup dictionary.
    known = np.array([channel_id in channel_ids_dict for channel_id in old_channels])

    kept_channels = get_masked_array(old_channels, known)
    row["channels"] = [channel_ids_dict[channel_id] for channel_id in kept_channels]

    relevance_columns = [
        f"channel relevance {j} {k}" for j in [0,0.5,1] for k in [0,0.5,1]
    ]
    for column in relevance_columns:
        row[column] = get_masked_array(row[column], known)

    return row


# Run the channel-id conversion on every row of `people`
# (the function prints the current userid as progress).
people = people.apply(convert_channelids, axis=1)

In [None]:
# Check the first row of `people` after the channel-id conversion.
people.head(1)

In [None]:
# Flatten the per-user channel lists into long-format records.
# df_values maps each "channel relevance {j} {k}" column name to a list of
# {"userid", "channelid", "score"} dicts, one per (user, channel) pair.
relevance_columns = [
    f"channel relevance {j} {k}" for j in [0,0.5,1] for k in [0,0.5,1]
]
df_values = {column: [] for column in relevance_columns}

for i, row in people.iterrows():
    print(f"Now processing row {i}", end="\r")

    for c, channel in enumerate(row.channels):
        # The score lists are indexed by the channel's position in the row.
        for column in relevance_columns:
            df_values[column].append({
                "userid": row.userid,
                "channelid": channel,
                "score": row[column][c],
            })

In [None]:
# Histogram of the scores stored under the "channel relevance 0.5 0.5" key.
pd.DataFrame(df_values["channel relevance 0.5 0.5"]).hist(column="score")

In [None]:
# Persist every relevance variant as its own user-channel pickle.
for j, k in [(j, k) for j in [0,0.5,1] for k in [0,0.5,1]]:
    print(f"j: {j}, k: {k}", end="\r")
    records = pd.DataFrame(df_values[f"channel relevance {j} {k}"])
    records.to_pickle(f"./data/final/user-channel-{j}-{k}.pkl")
    # Terminate the progress line so the next one starts on a fresh line.
    print("")

In [None]:
# Show the first row of `acquaintances` before flattening it.
acquaintances.head(1)

In [None]:
# Flatten the per-user acquaintance lists into long-format records.
# df_values maps each "sim score extended {l} {j} {k}" column name to a list of
# {"userid", "neighbourid", "score"} dicts, one per (user, acquaintance) pair.
# Keys are created with j outermost and l innermost, as before.
score_columns = [
    f"sim score extended {l} {j} {k}"
    for j in [0,0.5,1]
    for k in [0,0.5,1]
    for l in [0,0.5,1]
]
df_values = {column: [] for column in score_columns}

for i, row in acquaintances.iterrows():
    print(f"Now processing row {i}", end="\r")

    if len(row.acquaintances) == 0:
        continue

    for a, acquaintance in enumerate(row.acquaintances):
        # The neighbour id does not depend on the score column, so it is
        # looked up once here rather than once per column.
        neighbour_id = user_ids_dict[acquaintance]

        # The score lists are indexed by the acquaintance's position in the row.
        for column in score_columns:
            df_values[column].append(
                {
                    "userid": row.userid,
                    "neighbourid": neighbour_id,
                    "score": row[column][a],
                }
            )

In [None]:
# Write one user-user pickle per (j, k, l) combination.
for j, k, l in [(j, k, l) for j in [0,0.5,1] for k in [0,0.5,1] for l in [0,0.5,1]]:
    print(f"j: {j}, k: {k}, l: {l}")
    pairs = pd.DataFrame(df_values[f"sim score extended {l} {j} {k}"])
    pairs.to_pickle(f'./data/final/user-user-{j}-{k}-{l}.pkl')

### Fancy channel relevance (ego - neighbors - channels)

In [None]:
# A handful of (j, k, l) combinations.
# NOTE(review): `user_user_conf` is not referenced by any later visible cell —
# confirm whether it is still needed.
user_user_conf = [
    {"j": 0, "k": 1, "l": 0.5},
    {"j": 0, "k": 1, "l": 0},
    {"j": 0.5, "k": 1, "l": 1},
    {"j": 0, "k": 0, "l": 0},
    {"j": 0, "k": 0, "l": 1}
]


# The f-string placeholders are literals, so this always reads
# "ego-channel-0-1-with-user-conf-0-1-0.5.pkl". Presumably the last three
# values correspond to the first `user_user_conf` entry — TODO confirm.
# NOTE(review): no visible cell writes this file; it must already exist on disk.
pd.read_pickle(f"./data/final/ego-channel-{0}-{1}-with-user-conf-{0}-{1}-{0.5}.pkl")

    

In [None]:
# Load the user-channel scores from "user-channel-0-0.pkl" (both placeholders
# are the literal 0) and preview the first five rows.
userchannel = pd.read_pickle(f"./data/final/user-channel-{0}-{0}.pkl")
userchannel.head(5)

In [None]:
# Spot-check: show all channel scores for one hard-coded userid.
userchannel[userchannel.userid == "50f294244cd7b76fbeb44959175a96a5"]