In [None]:
import os
import random
import pandas as pd

from cdhf.data import Data
from tqdm.notebook import tqdm as log_progress
from utils.helpers import execute, save_pckl, create_init_dataframe, collaborative_filtering_cluster
from utils.power_functions import PowerFuncScore

# Silence pandas' chained-assignment warning (the pandas default is 'warn').
pd.set_option("mode.chained_assignment", None)

In [None]:
# Seed the stdlib RNG so the sampled value below is the same on every run.
random.seed(42)
# A one-element list holding an integer drawn from range(10, 300); it is passed
# to the evaluation helpers in the cells below.
random_sample = random.sample(range(10, 300), k=1)

# Build the dataset wrapper from the JSON input and load its contents.
data = Data("../input/mmdata.json")
data.load_all()

In [None]:
# Base frame built from the loaded data, with its "score" column removed.
df = create_init_dataframe(data).drop("score", axis=1)

## Process All Clusters with Complex Metrics

In [None]:


# Evaluate every (feature file, cluster file) pair.
# For each cluster row, the base frame `df` is restricted to the cluster's users,
# outer-merged with the feature frame, re-scored with the power function and
# passed to `execute`. Results are stored as
# feature_evals[feature_file][cluster_file] -> DataFrame and then pickled.
clustering_path = "Processed/Clustering"
cluser_plks = [f for f in os.listdir(clustering_path) if os.path.isfile(os.path.join(clustering_path, f))]
# Only cluster files with these two name prefixes are evaluated.
cluser_plks = [cpks for cpks in cluser_plks if cpks.startswith(('clusters-0.5', 'clusters-1-'))]

feature_path = "Processed/UserChannel"
feature_files = [f for f in os.listdir(feature_path) if os.path.isfile(os.path.join(feature_path, f))]
power_function = PowerFuncScore()

# Columns handed to `execute` for every merged frame.
eval_columns = ["user_id", "channel_id", "score", "u_id", "c_id"]

feature_evals = {}
for feature_file in log_progress(feature_files):
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    feature_df = pd.read_pickle(os.path.join(feature_path, feature_file)).rename(
        columns={"userid": "user_id", "channelid": "channel_id"}
    )
    cluster_evals = {}
    for cluser_plk in log_progress(cluser_plks):
        df_cluster = pd.read_pickle(os.path.join(clustering_path, cluser_plk))
        # Collect per-cluster results and concatenate once after the loop;
        # growing a DataFrame with pd.concat on every iteration is quadratic.
        cluster_frames = []
        for user_ids in log_progress(df_cluster["nodes"], total=df_cluster.shape[0]):
            cluster_user_df = df[df["user_id"].isin(user_ids)]
            rec_df = pd.merge(cluster_user_df, feature_df, on=['user_id', 'channel_id'], how="outer").fillna(0)
            if rec_df.empty:
                continue

            rec_df["score"] = power_function.calculate(rec_df["msg_count"], rec_df["score"])
            cluster_frames.append(execute(rec_df[eval_columns], random_sample))

        # Reverse so the newest result comes first, matching the row order the
        # previous prepend-style concatenation produced. An empty DataFrame is
        # kept as the result when no cluster yielded any rows.
        cluster_evals[cluser_plk] = pd.concat(cluster_frames[::-1]) if cluster_frames else pd.DataFrame()
    feature_evals[feature_file] = cluster_evals


# Persist one pickle per (feature file, cluster file) pair.
# NOTE(review): "Evaulation" is kept exactly as spelled because it is the output
# location; confirm nothing else reads it before correcting the spelling.
for f_key in feature_evals:
    for c_key in feature_evals[f_key]:
        path = f"Evaulation/Complex/F/{f_key}/{c_key}"
        save_pckl(path, feature_evals[f_key][c_key])

In [None]:

# Write every evaluation frame held in `feature_evals` to
# Evaulation/Complex/F/<feature file>/<cluster file>.
for f_key, per_cluster in feature_evals.items():
    for c_key, eval_frame in per_cluster.items():
        path = f"Evaulation/Complex/F/{f_key}/{c_key}"
        save_pckl(path, eval_frame)

## Process All User-Channels with Complex Metrics

In [None]:
# Evaluate each user-channel feature file against the full base frame `df`
# (no clustering): merge, re-score with the power function, run `execute`,
# and pickle the result under Evaulation/Complex/P/<feature file>.
df_evals = pd.DataFrame()  # not read in this cell; kept in case another cell uses it

feature_evals = {}

feature_path = "Processed/UserChannel"
feature_files = [f for f in os.listdir(feature_path) if os.path.isfile(os.path.join(feature_path, f))]
power_function = PowerFuncScore()

for feature_file in log_progress(feature_files):
    try:
        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
        feature_df = pd.read_pickle(os.path.join(feature_path, feature_file)).rename(
            columns={"userid": "user_id", "channelid": "channel_id"}
        )

        rec_df = pd.merge(df, feature_df, on=['user_id', 'channel_id'], how="outer").fillna(0)
        rec_df["score"] = power_function.calculate(rec_df["msg_count"], rec_df["score"])
        feature_evals[feature_file] = execute(rec_df[["user_id", "channel_id", "score", "u_id", "c_id"]], random_sample)
        # Save immediately so finished files survive a failure on a later one.
        path = f"Evaulation/Complex/P/{feature_file}"
        save_pckl(path, feature_evals[feature_file])
    except Exception as exc:
        # Best-effort: report the failure with its cause and move on to the next
        # file. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop the run.
        display(f"failed for {feature_file}: {exc!r}")

# Second save pass over everything that was evaluated. It rewrites the files
# saved above and retries any entry whose save failed inside the try block.
for key in feature_evals:
    path = f"Evaulation/Complex/P/{key}"
    save_pckl(path, feature_evals[key])

## Process All Generated Clusters

In [None]:
# Run collaborative filtering on every cluster of every selected cluster file
# and pickle the combined result per file under Evaulation/Clustering/<file>.
cluster_evals = {}

clustering_path = "Processed/Clustering"
cluser_plks = [f for f in os.listdir(clustering_path) if os.path.isfile(os.path.join(clustering_path, f))]
# Only cluster files with these two name prefixes are evaluated.
cluser_plks = [cpks for cpks in cluser_plks if cpks.startswith(('clusters-0.5', 'clusters-1-'))]

for cluser_plk in log_progress(cluser_plks):
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    df_cluster = pd.read_pickle(os.path.join(clustering_path, cluser_plk))
    result = df_cluster['nodes'].apply(collaborative_filtering_cluster, args=(df, random_sample, ))
    # Drop clusters for which the helper returned None, then concatenate once
    # instead of growing the frame with pd.concat on every iteration (quadratic).
    cluster_frames = [val for _ix, val in result.items() if val is not None]
    # Reverse so the newest result comes first, matching the row order the
    # previous prepend-style concatenation produced. An empty DataFrame is kept
    # when no cluster produced a result.
    cluster_evals[cluser_plk] = pd.concat(cluster_frames[::-1]) if cluster_frames else pd.DataFrame()

# NOTE(review): "Evaulation" is kept exactly as spelled because it is the output
# location; confirm nothing else reads it before correcting the spelling.
for key in cluster_evals:
    path = f"Evaulation/Clustering/{key}"
    save_pckl(path, cluster_evals[key])