In [None]:
import os
import math
import random
import implicit
import pandas as pd
import numpy as np
import scipy.sparse as sparse

from cdhf.data import Data
from implicit import evaluation
from tqdm.notebook import tqdm as log_progress

In [None]:
# Turn off pandas' chained-assignment warning (pandas default is 'warn').
pd.set_option("mode.chained_assignment", None)

# Seed the stdlib RNG so the one-element list of split seeds below is
# reproducible; `execute` iterates over `random_sample`.
random.seed(42)
random_sample = random.sample(range(10, 300), 1)

# Build the data accessor from the JSON dump and load its contents.
data = Data("../input/mmdata.json")
data.load_all()

In [None]:
# One row per channel-member object, using the object's attributes as columns.
df = pd.DataFrame.from_records([vars(cm) for cm in data.channel_members])
# Key every row by "<channel_id>-<user_id>".
df = df.assign(index=df["channel_id"] + "-" + df["user_id"]).set_index("index")

# Keep only channels that have more than 5 membership rows.
df_grouped_users = df.groupby(["channel_id"]).count()
allowed_channels = df_grouped_users[df_grouped_users["user_id"] > 5].index.array
df = df[df["channel_id"].isin(allowed_channels)]

# Integer category codes for users and channels, computed after filtering.
df = df.assign(
    u_id=df["user_id"].astype("category").cat.codes,
    c_id=df["channel_id"].astype("category").cat.codes,
)

In [None]:
class MeasureOfConfidence():
    """Base interface for confidence measures; every hook here returns None."""

    def calculate(self):
        """Placeholder; subclasses in this file redefine it as ``calculate(r_ui, alpha)``."""
        return None

    def name(self):
        """Placeholder for a human-readable label of the measure."""
        return None

    def preprocess_cui(self, cui, df):
        """Placeholder hook taking ``cui`` and ``df``; does nothing."""
        return None

class BinaryMeasureOfConfidence(MeasureOfConfidence):
    """Confidence measure that ignores its inputs and always yields 1."""

    def calculate(self, r_ui, alpha):
        """Return the constant 1 regardless of ``r_ui`` and ``alpha``."""
        constant_confidence = 1
        return constant_confidence

    def name(self):
        """Return the label used for this measure in result tables."""
        return "Binary Measure < 1 >"

class SimpleMeasureOfConfidence(MeasureOfConfidence):
    """Linear confidence measure: ``1 + alpha * r_ui``."""

    def calculate(self, r_ui, alpha):
        """Return ``1 + alpha * r_ui`` for the raw value ``r_ui``."""
        scaled = alpha * r_ui
        return 1 + scaled

    def name(self):
        """Return the label used for this measure in result tables."""
        return "Simple Measure < 1 + alpha*r_ui >"

class LogMeasureOfConfidence(MeasureOfConfidence):
    """Logarithmic confidence measure: ``1 + alpha * log(1 + r_ui / 0.0001)``."""

    def calculate(self, r_ui, alpha):
        """Return ``1 + alpha * log(1 + r_ui / 0.0001)`` using the natural log."""
        # 0.0001 is the epsilon mentioned in the label returned by `name`.
        ratio = r_ui / 0.0001
        log_term = math.log(1 + ratio)
        return 1 + alpha * log_term

    def name(self):
        """Return the label used for this measure in result tables."""
        return "Log Measure < 1 + alpha*log(1 + r_ui/epsilon) >"

class EvalResults():
    """Plain record of one evaluation run: what was run and what it scored."""

    def __init__(self, name, confidence, metrics, factors, alpha, iterations):
        """Store the six constructor arguments as same-named attributes."""
        self.name, self.confidence, self.metrics = name, confidence, metrics
        self.factors, self.alpha, self.iterations = factors, alpha, iterations

In [None]:
class BasePowerFunction():
    """Base interface for score-combining functions.

    Subclasses in this file override both methods as ``name(self)`` and
    ``calculate(self, s_l, s_r)``. The base versions are no-ops that return
    None; they accept ``self`` so calling them on an instance does not raise
    ``TypeError``.
    """

    def name(self):
        """Placeholder for a human-readable label; returns None."""
        return None

    def calculate(self, s_l=None, s_r=None):
        """Placeholder for combining the two scores ``s_l`` and ``s_r``; returns None."""
        return None


class SimpleLScore(BasePowerFunction):
    """Score function that passes the left-hand score through unchanged."""

    def name(self):
        """Return the label for this score function."""
        return "Simple Score MSG Count"

    def calculate(self, s_l, s_r):
        """Return ``s_l``; ``s_r`` is ignored."""
        left_score = s_l
        return left_score

class SimpleRScore(BasePowerFunction):
    """Score function that passes the right-hand score through unchanged."""

    def name(self):
        """Return the label for this score function."""
        return "Simple Score Feature"

    def calculate(self, s_l, s_r):
        """Return ``s_r``; ``s_l`` is ignored."""
        right_score = s_r
        return right_score

class PowerFuncScore(BasePowerFunction):
    """Score function raising the left score to the power ``1 + s_r``."""

    def name(self):
        """Return the label for this score function."""
        return "MSG COUNT To The Power of Score Feature"

    def calculate(self, s_l, s_r):
        """Return ``s_l.pow(1 + s_r)``; ``s_l`` must provide a ``pow`` method."""
        exponent = 1 + s_r
        return s_l.pow(exponent)



# One instance of each score function defined above, in a fixed order.
power_functions = [
    score_cls() for score_cls in (SimpleLScore, SimpleRScore, PowerFuncScore)
]

In [None]:

def calculateSparsity(df):
    """Summarise how sparse a user/channel interaction table is.

    Args:
        df: DataFrame with ``u_id``, ``c_id`` and ``score`` columns, one row
            per interaction.

    Returns:
        Tuple ``(sparsity, uss, iss, n_users, n_channels)``:
        ``sparsity`` is ``(1 - rows / (users * channels)) * 100``;
        ``uss`` is the mean over users of ``(1 - count / max_count) * 100``
        where ``count`` is the user's number of ``c_id`` rows;
        ``iss`` is the same per channel using ``u_id`` rows;
        ``n_users`` / ``n_channels`` are the numbers of distinct codes.

    Raises:
        ZeroDivisionError: if ``df`` has no rows.
    """
    n_users = len(df.u_id.unique())
    n_channels = len(df.c_id.unique())
    n_interactions = len(df.score)

    sparsity = (1 - (n_interactions / (n_users * n_channels))) * 100

    # Rows per user, compared against the user with the most rows.
    channels_per_user = df.groupby(["u_id"]).count()["c_id"]
    uss = (1 - (channels_per_user / channels_per_user.max())) * 100

    # Rows per channel, compared against the channel with the most rows.
    users_per_channel = df.groupby(["c_id"]).count()["u_id"]
    iss = (1 - (users_per_channel / users_per_channel.max())) * 100

    return sparsity, uss.mean(), iss.mean(), n_users, n_channels


In [None]:

def execute(df):
    """Fit and score several implicit-feedback recommenders on one interaction table.

    For every combination of alpha, confidence measure, split seed (taken from
    the module-level ``random_sample``), factor count and iteration count, a
    sparse matrix of confidence values is built from ``df['score']`` with
    ``c_id`` as row index and ``u_id`` as column index. The matrix is split with
    ``evaluation.train_test_split`` and used to fit four ``implicit`` models,
    each scored with ``ranking_metrics_at_k`` at K=10. A model that raises
    during fitting or evaluation is reported with ``print`` and skipped.

    Args:
        df: DataFrame with ``score``, ``u_id`` and ``c_id`` columns; it is also
            passed unchanged to ``calculateSparsity``.

    Returns:
        DataFrame with one row per successfully evaluated model, sorted by
        AUC and precision (descending), then iterations and factors (ascending).
    """
    sparsity, USS, ISS, users, channels = calculateSparsity(df)
    confidence_measures = [SimpleMeasureOfConfidence(), BinaryMeasureOfConfidence(), LogMeasureOfConfidence()]
    alpha_values = [40]
    factor_options = [150]
    iteration_options = [25]
    regularization = 0.1

    # Matrix coordinates do not depend on the confidence measure.
    # NOTE(review): rows are channels and columns are users; recent implicit
    # releases document `fit` as taking a user x item matrix - confirm that
    # this orientation is intended.
    row_ids = df['c_id'].astype(int)
    col_ids = df['u_id'].astype(int)

    evaluation_results = []

    for alpha_val in alpha_values:
        for confidence_measure in confidence_measures:
            # Series.apply returns a new Series, so the result has to be kept;
            # otherwise every measure would train on the same raw scores.
            c_ui = (
                df['score']
                .astype(float)
                .apply(confidence_measure.calculate, args=(alpha_val,))
                .astype(float)
            )

            sparse_item_user = sparse.csr_matrix((c_ui, (row_ids, col_ids)))
            for split_seed in random_sample:
                # NOTE(review): the second positional argument appears to be
                # implicit's `train_percentage`, i.e. 25% train / 75% test -
                # confirm this is the intended split.
                data_train, data_test = evaluation.train_test_split(sparse_item_user, 0.25, split_seed)

                for factors in factor_options:
                    for iterations in iteration_options:
                        models = [
                            implicit.als.AlternatingLeastSquares(num_threads=4, factors=factors, regularization=regularization, iterations=iterations),
                            implicit.cpu.bpr.BayesianPersonalizedRanking(num_threads=4, factors=factors, iterations=iterations),
                            implicit.cpu.lmf.LogisticMatrixFactorization(num_threads=4),
                            implicit.nearest_neighbours.ItemItemRecommender(num_threads=4, K=20),
                        ]

                        for model in models:
                            try:
                                model.fit(data_train, show_progress=False)
                                ranking = implicit.evaluation.ranking_metrics_at_k(model, data_train, data_test, K=10, show_progress=False)
                            except Exception as e:
                                # Best effort: report the failing model and move on.
                                # KeyboardInterrupt/SystemExit are deliberately not caught.
                                print(f'Failed when processing {model} with {e}:{type(e)}')
                                continue
                            evaluation_results.append(
                                EvalResults(
                                    name=model.__class__,
                                    confidence=confidence_measure.name(),
                                    metrics=ranking,
                                    factors=factors,
                                    alpha=alpha_val,
                                    iterations=iterations,
                                )
                            )

    columns = ["name", 'AUC', 'precision', 'factors', 'iterations', 'alpha', 'confidence_measure', "sparsity", "USS", "ISS", "users", "channels", 'ndcg', 'map']
    records = []
    for er in evaluation_results:
        # First paragraph of the model class docstring is the display name;
        # fall back to the class name when there is no docstring.
        doc = er.name.__doc__
        model_label = doc.split("\n\n")[0] if doc else er.name.__name__
        records.append([
            model_label,
            er.metrics["auc"],
            er.metrics["precision"],
            er.factors,
            er.iterations,
            er.alpha,
            er.confidence,
            sparsity,
            USS,
            ISS,
            users,
            channels,
            er.metrics["ndcg"],
            er.metrics["map"],
        ])

    df_evals = pd.DataFrame.from_records(records, columns=columns)
    df_evals = df_evals.sort_values(['AUC', 'precision', 'iterations', 'factors'], ascending=[False, False, True, True])

    return df_evals


In [None]:
# Evaluate the recommenders separately for every organisational unit.
# Results are collected in a list and concatenated once, instead of growing a
# DataFrame inside the loop starting from an empty frame.
org_eval_frames = []
for org_unit in log_progress(data.org_unit_members):
    org_unit_memebers = data.org_unit_members[org_unit]
    # Copy the filtered rows so adding `score` never writes into a slice of `df`.
    org_df = df[df["user_id"].isin(org_unit_memebers)].copy()
    if org_df.empty:
        continue
    org_df["score"] = org_df["msg_count"]
    org_eval_frames.append(execute(org_df[["user_id", "channel_id", "score", "u_id", "c_id"]]))

# Most recently evaluated unit first; empty frame when no unit had any rows.
df_org_evals = pd.concat(org_eval_frames[::-1]) if org_eval_frames else pd.DataFrame()
    

In [None]:
# Persist the per-org-unit evaluation table (path is relative to the working directory).
df_org_evals.to_pickle("OrgUnitClustering.pkl")

In [None]:
# Evaluate the recommenders separately for every team.
# Results are collected in a list and concatenated once, instead of growing a
# DataFrame inside the loop starting from an empty frame.
team_eval_frames = []
for team in log_progress(data.teams):
    team_members = [m.user_id for m in team.team_members]
    # Copy the filtered rows so adding `score` never writes into a slice of `df`.
    team_df = df[df["user_id"].isin(team_members)].copy()
    if team_df.empty:
        continue
    team_df["score"] = team_df["msg_count"]
    team_eval_frames.append(execute(team_df[["user_id", "channel_id", "score", "u_id", "c_id"]]))

# Most recently evaluated team first; empty frame when no team had any rows.
df_teams_evals = pd.concat(team_eval_frames[::-1]) if team_eval_frames else pd.DataFrame()

# Persist the per-team evaluation table (path is relative to the working directory).
df_teams_evals.to_pickle("TeamsClustering.pkl")

In [None]:
# NOTE(review): this writes the same frame to the same file as the last line
# of the previous cell; this cell looks redundant.
df_teams_evals.to_pickle("TeamsClustering.pkl")

In [None]:
def collaborative_filtering_cluster(user_ids, df):
    """Run ``execute`` on the rows of ``df`` that belong to one cluster of users.

    Args:
        user_ids: Collection of ``user_id`` values forming the cluster.
        df: DataFrame with ``user_id``, ``channel_id``, ``msg_count``,
            ``u_id`` and ``c_id`` columns; it is not modified.

    Returns:
        The DataFrame returned by ``execute``, or None when no row of ``df``
        matches ``user_ids``.
    """
    in_cluster = df["user_id"].isin(user_ids)
    cluster_df = df[in_cluster].copy()
    if cluster_df.empty:
        return None

    # The message count is used directly as the interaction score.
    cluster_df = cluster_df.assign(score=cluster_df["msg_count"])
    wanted_columns = ["user_id", "channel_id", "score", "u_id", "c_id"]
    return execute(cluster_df[wanted_columns])

In [None]:
# Evaluation table per clustering file, keyed by the file name.
cluster_evals = {}

clustering_path = "Processed/Clustering"
# Regular files in the clustering directory whose names start with one of the
# two prefixes of interest.
cluser_plks = [
    f for f in os.listdir(clustering_path)
    if os.path.isfile(os.path.join(clustering_path, f))
    and f.startswith(('clusters-0.5', 'clusters-1-'))
]

for cluser_plk in log_progress(cluser_plks):
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted run of this project.
    df_cluster = pd.read_pickle(os.path.join(clustering_path, cluser_plk))

    # Evaluate every cluster (one entry of the 'nodes' column each) and keep
    # the non-empty results; they are concatenated once after the loop.
    cluster_frames = []
    for node_ids in df_cluster['nodes']:
        cluster_result = collaborative_filtering_cluster(node_ids, df)
        if cluster_result is not None:
            cluster_frames.append(cluster_result)

    # Most recently evaluated cluster first; empty frame when nothing matched.
    cluster_evals[cluser_plk] = pd.concat(cluster_frames[::-1]) if cluster_frames else pd.DataFrame()

# NOTE(review): `key` is the source file name, so a name that already ends in
# ".pkl" produces "<name>.pkl.pkl" in the working directory - confirm intended.
for key in cluster_evals:
    cluster_evals[key].to_pickle(f"{key}.pkl")