In [191]:
import pandas
import pandas as pd
import numpy as np
import sys
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tqdm import tqdm

deepctr_path = '/root/linghui/rec-fairness/'
sys.path.append(deepctr_path)
import deepctr
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names

In [192]:
import sys
import tensorflow as tf
deepctr_path = '/root/linghui/rec-fairness/'
sys.path.append(deepctr_path)
import deepctr
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names

def get_feat_dict(data):
    if data == 'ml-1m':
        feat_dict = {'movie_id': 3706,
                 'user_id': 6040,
                 'gender': 2,
                 'age': 7,
                 'occupation': 21,
                 'zipcode': 3439}
    return feat_dict

def get_saved_model_details(model_params, data_params):
    model_name = model_params['model_name']
    cate = model_params['cate']
    if model_name == 'DeepFM':
        if cate == 'all-feat':
            check_path = '/data/linghui/saved_model/deepfm-ml-1m/deepfm-all-feature/deepfm-ml-1m.ckpt'
            selected_feat = ["movie_id", "user_id", "gender", "age", "occupation", "zipcode"]
        if cate == 'del-sf':
            check_path = '/data/linghui/saved_model/deepfm-ml-1m/deepfm-del-sf/deepfm-ml-1m-del-sf.ckpt'
            selected_feat = ["movie_id", "user_id"]
        if cate == 'liuyi-all-feats':
            check_path = '/data/linghui/saved_model/deepfm-ml-1m/deepfm-liuyi/deepfm-ml-1m-liuyi.ckpt'
            selected_feat = ["movie_id", "user_id", "gender", "age", "occupation", "zipcode"]
    return check_path , selected_feat
            

def get_model(model_params,data_params):
    dataset = data_params['dataset']

    model_name = model_params['model_name']
    cate = model_params['cate']
    embedding_dim = model_params['embedding_dim']
    check_path, selected_feature = get_saved_model_details(model_params,data_params)
    
    feat_dict = get_feat_dict(dataset)
    fixlen_feature_columns = [SparseFeat(feat, feat_dict[feat], embedding_dim=embedding_dim)
                                      for feat in selected_feature]
    linear_feature_columns = fixlen_feature_columns 
    dnn_feature_columns = fixlen_feature_columns 
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    

    if model_name == 'DeepFM':
        model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.summary()
    model.load_weights(check_path)
    model.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy',
                  metrics=['AUC', 'Precision', 'Recall'])
    return model


def create_single_rule(key,value):
    return{key:value}

def get_single_sparse_rule(df,features):
    single_rule = []
    feat_value = df[features].value_counts()
    feat_value = feat_value.index
    sparse_feat_value = feat_value.values.astype('int32')
    for value in sparse_feat_value:
        rule = create_single_rule(features,value)
        single_rule.append(rule)
    return single_rule

def get_single_dense_rule(df,feature,k):
    min_value = df[feature].min()
    max_value = df[feature].max()
    single_rule = []
    d = math.ceil((max_value - min_value) / k)
    for i in range(k):
        x = min_value + i * d
        y = x + d
        single_rule.append({feature:[x,y]})
    return single_rule

def merge_two_dicts(x, y):
    z = x.copy()
    z.update(y)
    return z

def combine_rule(rule_dict_1,rule_dict_2):
    len1 = len(rule_dict_1)
    len2 = len(rule_dict_2)
    all_rule = []
    for i in range(len1):
        rule1 = rule_dict_1[i]
        for j in range(len2):
            rule2 = rule_dict_2[j]
            merge_rule = merge_two_dicts(rule1,rule2)
            all_rule.append(merge_rule)
    return all_rule

def creat_sparse_rule_query(rule):
    result = ''
    key = rule.keys()
    length = len(key)
    count = 0
    for i in key:
        count = count + 1
        result = result + '( ' + i +' == ' + str(rule[i]) + ')'
        # elif i[0] == 'I':
        #     result = result + '(' + str(rule[i][0]) + ' <= ' + i + ' <= ' +  str(rule[i][1]) + ')'
        if count < length:
            result = result + ' & '
    return result

def get_all_rule_dict(data,sparse_feats,dense_feats,dense_num):
    single_rule_dict = []
    all_rule_dict = []
    if len(sparse_feats) > 0:
        for feature in sparse_feats:
            temp_rule = []
            single_rule = get_single_sparse_rule(data,feature)  
            single_rule_dict = single_rule_dict + single_rule
            if len(all_rule_dict) > 0: 
                temp_rule = combine_rule(all_rule_dict,single_rule)
            all_rule_dict = all_rule_dict + single_rule + temp_rule
    if len(dense_feats) > 0:
        for feature in dense_feats:
            temp_rule = []
            single_rule = get_single_dense_rule(data,feature,dense_num)  
            single_rule_dict = single_rule_dict + single_rule
            if len(all_rule_dict) > 0: 
                temp_rule = combine_rule(all_rule_dict,single_rule)
            all_rule_dict = all_rule_dict + single_rule + temp_rule 
    return all_rule_dict

def get_evaluate_data(data,rule):
    result = creat_sparse_rule_query(rule)
    select_group = data.query(result)
    unselect_group = data.drop(select_group.index)

    return select_group,unselect_group

def cut_rule(length_all,length_group,theta):
    x = length_group / length_all
    if (x >= theta) & (x < 1-theta):
        return 1
    else:
        return 0

def get_test_data(x_file,y_file):
    test_x = np.loadtxt(x_file)
    test_y = np.loadtxt(y_file)

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]
    features = sparse_features + dense_features

    test_x = pd.DataFrame(test_x,columns=features)
    test_y = pd.DataFrame(test_y,columns=['label'])

    test = pd.concat([test_x, test_y], axis=1)
    return test

In [193]:
saved_model_path = '/data/linghui/saved_model/deepfm-ml-1m/'

saved_data_path = '/data/linghui/ml-1m/processed_data/'

test_data_path = saved_data_path + 'test_data_liuyi.csv'

#load data
test_data = pd.read_csv(test_data_path,index_col=0)

In [194]:
test_data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zipcode
0,0,47,1,978824351,Pocahontas (1995),Animation|Children's|Musical|Romance,0,0,10,1588
1,1,1737,1,978300174,Everest (1998),Documentary,1,6,16,2248
2,2,1900,1,978298504,Nightmare on Elm Street Part 2: Freddy's Reven...,Horror,1,2,15,1863
3,3,1774,1,978294282,Slappy and the Stinkers (1998),Children's|Comedy,1,4,7,140
4,4,279,1,978246585,Nell (1994),Drama,1,2,20,1938
...,...,...,...,...,...,...,...,...,...,...
301995,6039,2993,0,0,"Longest Day, The (1962)",Action|Drama|War,1,2,6,466
301996,6039,2166,0,0,One Man's Hero (1999),Drama|War,1,2,6,466
301997,6039,286,0,0,Only You (1994),Comedy|Romance,1,2,6,466
301998,6039,70,0,0,Fair Game (1995),Action,1,2,6,466


In [None]:
from function_set import *

In [20]:
# model_name: DeepFM / 
# cate : all-feat / del-sf

model_params = {'model_name': 'DeepFM',
                'cate': 'liuyi-all-feats',
                'embedding_dim': 4,
                'batch_size': 256 }

data_params = {'dataset': 'ml-1m'}

check_path ,sparse_features = get_saved_model_details(model_params, data_params)

In [21]:
check_path

'/data/linghui/saved_model/deepfm-ml-1m/deepfm-liuyi/deepfm-ml-1m-liuyi.ckpt'

In [22]:
model = get_model(model_params,data_params)

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
age (InputLayer)                [(None, 1)]          0                                            
_______________________________________________________________________________________

In [23]:
test_input = {name: test_data[name].values for name in sparse_features}
label = test_data['rating'].values

In [25]:
label

array([1, 1, 1, ..., 0, 0, 0])

In [26]:
y_pre = model.predict(test_input)

In [32]:
select_feats = ['user_id',"movie_id","rating","gender","age","occupation","zipcode"]

evaluate_data = test_data.loc[:,select_feats]
evaluate_data['predict'] = y_pre

In [195]:
evaluate_data

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
0,0,47,1,0,0,10,1588,0.939831
1,1,1737,1,1,6,16,2248,0.823337
2,2,1900,1,1,2,15,1863,0.927760
3,3,1774,1,1,4,7,140,0.912610
4,4,279,1,1,2,20,1938,0.555670
...,...,...,...,...,...,...,...,...
301995,6039,2993,0,1,2,6,466,0.171150
301996,6039,2166,0,1,2,6,466,0.419440
301997,6039,286,0,1,2,6,466,0.032339
301998,6039,70,0,1,2,6,466,0.333478


In [196]:
num = len(evaluate_data.user_id.value_counts())
rec_result = pd.DataFrame(columns=['user_id',"movie_id","rating","gender","age","occupation","zipcode"])
rec_num = 5
for i in range(num):
    user_select_group = evaluate_data.loc[evaluate_data['user_id'] == i]
    user_select_group = user_select_group.sort_values(['predict'],ascending=False)
    user_select_group = user_select_group.head(rec_num)
    rec_result = pd.concat([rec_result, user_select_group])
rec_result

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
6079,0,33,0,0,0,10,1588,0.972540
0,0,47,1,0,0,10,1588,0.939831
6059,0,1294,0,0,0,10,1588,0.909157
6076,0,1371,0,0,0,10,1588,0.879133
6045,0,1832,0,0,0,10,1588,0.765513
...,...,...,...,...,...,...,...,...
6039,6039,1131,1,1,2,6,466,0.956136
301985,6039,1883,0,1,2,6,466,0.943117
301982,6039,1200,0,1,2,6,466,0.906736
301986,6039,2099,0,1,2,6,466,0.887011


In [47]:
saved_data_path = '/data/linghui/ml-1m/processed_data/'
rec_result.to_csv(saved_data_path + 'rec_result.csv')

In [128]:
rec_result

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
6079,0,33,0,0,0,10,1588,0.972540
0,0,47,1,0,0,10,1588,0.939831
6059,0,1294,0,0,0,10,1588,0.909157
6076,0,1371,0,0,0,10,1588,0.879133
6045,0,1832,0,0,0,10,1588,0.765513
...,...,...,...,...,...,...,...,...
6039,6039,1131,1,1,2,6,466,0.956136
301985,6039,1883,0,1,2,6,466,0.943117
301982,6039,1200,0,1,2,6,466,0.906736
301986,6039,2099,0,1,2,6,466,0.887011


In [49]:
batch_size = model_params['batch_size']
test_auc = model.evaluate(test_input, label, batch_size=batch_size)[1]



In [50]:
sparse_feats = ["gender","age","occupation"]
dense_feats = []
dense_num = 10
all_rule_dict = get_all_rule_dict(rec_result,sparse_feats,dense_feats,dense_num)

In [262]:
len(all_rule_dict)

527

In [264]:
rule_test = all_rule_dict[0]
rule_test

{'gender': 1}

In [265]:
select_group,unselect_group = get_evaluate_data(rec_result,rule_test)

In [211]:
user_id = unselect_group['user_id'].unique()
len(user_id)

6040

In [189]:
import math
def cal_ndcg(data):
    user_id = data['user_id'].unique()
    ndcg_list = []
    for i in user_id:
        user_select_group = data.loc[data['user_id'] == i]
        user_select_group = user_select_group.reset_index(drop = True)
        rec_pos = user_select_group.loc[user_select_group['rating'] == 1]
        if len(rec_pos) == 0:
            ndcg = 0
        else:
            pos_index = rec_pos.index[0] + 1  
            pre_score = rec_pos.iloc[0].loc['predict']
            ndcg = (pow(2,pre_score)-1) / (math.log(pos_index+1,2)) 
        ndcg_list.append(ndcg)
    return np.mean(ndcg_list)

def cal_mean_pos_prescore(data):
    user_id = data['user_id'].unique()
    pre_score_list = []
    for i in user_id:
        user_select_group = data.loc[data['user_id'] == i]
        rec_pos = user_select_group.loc[user_select_group['rating'] == 1]
        if len(rec_pos) == 0:
            pre_score = 0
        else:
            pre_score = rec_pos.iloc[0].loc['predict']
        pre_score_list.append(pre_score)
    return np.mean(pre_score_list)

def cal_arhr(data):
    user_id = data['user_id'].unique()
    arhr_list = []
    for i in user_id:
        user_select_group = data.loc[data['user_id'] == i]
        user_select_group = user_select_group.reset_index(drop = True)
        rec_pos = user_select_group.loc[user_select_group['rating'] == 1]
        if len(rec_pos) == 0:
            arhr = 0
        else:
            pos_index = rec_pos.index[0] + 1  
            arhr = 1/pos_index
        arhr_list.append(arhr)
    return np.mean(arhr_list)

def cal_hitrate(data):
    user_id = data['user_id'].unique()
    hit_count = 0
    for i in user_id:
        user_select_group = data.loc[data['user_id'] == i]
        rec_pos = user_select_group.loc[user_select_group['rating'] == 1]
        if len(rec_pos) != 0:
            hit_count = hit_count + 1
    hit_rate = hit_count / len(user_id)
    return hit_rate

In [147]:
select_group,unselect_group = get_evaluate_data(rec_result,rule_test)
select_group

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
6113,1,525,0,1,6,16,2248,0.900945
6111,1,1215,0,1,6,16,2248,0.874945
6094,1,327,0,1,6,16,2248,0.859985
6110,1,3029,0,1,6,16,2248,0.838863
1,1,1737,1,1,6,16,2248,0.823337
...,...,...,...,...,...,...,...,...
6039,6039,1131,1,1,2,6,466,0.956136
301985,6039,1883,0,1,2,6,466,0.943117
301982,6039,1200,0,1,2,6,466,0.906736
301986,6039,2099,0,1,2,6,466,0.887011


In [158]:
l = cal_arhr(select_group)
l_un = cal_arhr(unselect_group)
l - l_un

-0.010924834306738573

In [157]:
l = cal_hitrate(select_group)
l_un = cal_hitrate(unselect_group)
l - l_un

-0.01769598492450153

In [124]:
l_un = cal_mean_pos_prescore(unselect_group)
np.mean(l_un)

0.7053264874352291

In [184]:
ug = rec_result.loc[rec_result['user_id'] == 7]
# ug = ug.reset_index(drop = True)
# rc = ug.loc[ug['rating'] == 1]
# pos_index = rc.index[0]
# pos_index
ug

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
6424,7,3512,0,1,2,12,488,0.855682
7,7,3033,1,1,2,12,488,0.775204
6403,7,3074,0,1,2,12,488,0.692529
6416,7,503,0,1,2,12,488,0.64773
6411,7,984,0,1,2,12,488,0.528795


In [201]:
length_group = len(select_group['user_id'].unique())
length_all

6040

In [229]:
rec_result

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
6079,0,33,0,0,0,10,1588,0.972540
0,0,47,1,0,0,10,1588,0.939831
6059,0,1294,0,0,0,10,1588,0.909157
6076,0,1371,0,0,0,10,1588,0.879133
6045,0,1832,0,0,0,10,1588,0.765513
...,...,...,...,...,...,...,...,...
6039,6039,1131,1,1,2,6,466,0.956136
301985,6039,1883,0,1,2,6,466,0.943117
301982,6039,1200,0,1,2,6,466,0.906736
301986,6039,2099,0,1,2,6,466,0.887011


In [230]:
import time

time_start=time.time()
hit_rate = []
pos_score = []
ar_hit_rate = []
rule_dict = []
theta = 0.1
length_all = len(evaluate_data['user_id'].unique())

for rule in all_rule_dict:
    #get selected group
    select_group,unselect_group = get_evaluate_data(final_result,rule)
    length_group = len(select_group['user_id'].unique())
    if cut_rule(length_all,length_group,theta) == 1:
        rule_dict.append(rule)
        #calculate metrics
        arhr_se = cal_arhr(select_group)
        arhr_unse = cal_arhr(unselect_group)
        ar_hit_rate.append(arhr_se - arhr_unse)

        hr_se = cal_hitrate(select_group)
        hr_unse = cal_hitrate(unselect_group)
        hit_rate.append(hr_se - hr_unse)
time_end_1=time.time()
print('time cost:',time_end_1-time_start,'s')

for rule in all_rule_dict:
    #get selected group
    select_group,unselect_group = get_evaluate_data(final_result,rule)
    length_group = len(select_group['user_id'].unique())
    if cut_rule(length_all,length_group,theta) == 1:
        #calculate metrics
        pos_se = cal_mean_pos_prescore(select_group)
        pos_unse = cal_mean_pos_prescore(unselect_group)
        pos_score.append(pos_se - pos_unse)  
        
time_end_2=time.time()
print('time cost:',time_end_2-time_end_1,'s')

time cost: 305.8480739593506 s
time cost: 88.93714308738708 s


In [391]:
max_arhr = max(ar_hit_rate)
rule = rule_dict[ar_hit_rate.index(max_arhr)]
max_arhr,rule

(0.03163580975228952, {'age': 1})

In [390]:
max_hr = max(hit_rate)
rule = rule_dict[hit_rate.index(max_hr)]
max_hr,rule

(0.05091862582830564, {'gender': 1, 'age': 1})

In [389]:
max_pos = max(pos_score)
rule = rule_dict[pos_score.index(max_pos)]
max_pos,rule

(0.018379663507690713, {'gender': 0})

In [255]:
time_start=time.time()

ndcg_list = []

length_all = len(evaluate_data['user_id'].unique())
for rule in all_rule_dict:
    #get selected group
    select_group,unselect_group = get_evaluate_data(final_result,rule)
    length_group = len(select_group['user_id'].unique())
    if cut_rule(length_all,length_group,theta) == 1:
        #calculate metrics
        ndcg_se =cal_ndcg(select_group)
        ndcg_unse = cal_ndcg(unselect_group)
        ndcg_list.append(ndcg_se - ndcg_unse)  
        
time_end_2=time.time()
print('time cost:',time_end_2-time_start,'s')

time cost: 92.52309250831604 s


In [259]:
rule_dict

[{'gender': 1},
 {'gender': 0},
 {'age': 2},
 {'age': 3},
 {'age': 1},
 {'gender': 1, 'age': 2},
 {'gender': 1, 'age': 3},
 {'gender': 1, 'age': 1},
 {'occupation': 4},
 {'occupation': 0},
 {'occupation': 7}]

In [260]:
ndcg_list

[-0.018813012630456982,
 0.018813012630456982,
 -0.0002811683140094834,
 0.0005422315724805671,
 0.00843493954227792,
 -0.0074702001450400735,
 -0.005157569547623364,
 0.007474102000129101,
 -0.020259903497833354,
 -0.011900602429696372,
 0.008046935787265785]

In [388]:
max_ndcg = max(ndcg_list)
rule = rule_dict[ndcg_list.index(max_ndcg)]
max_ndcg,rule

(0.018813012630456982, {'gender': 0})

In [258]:
time_start=time.time()


length_all = len(evaluate_data['user_id'].unique())
for rule in all_rule_dict:
    #get selected group
    select_group,unselect_group = get_evaluate_data(evaluate_data,rule)
    length_group = len(select_group['user_id'].unique())
    if cut_rule(length_all,length_group,theta) == 1:
        rule_dict.append(rule)
        
time_end_2=time.time()
print('time cost:',time_end_2-time_start,'s')

time cost: 14.915152549743652 s


In [266]:
select_group

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
6113,1,525,0,1,6,16,2248,0.900945
6111,1,1215,0,1,6,16,2248,0.874945
6094,1,327,0,1,6,16,2248,0.859985
6110,1,3029,0,1,6,16,2248,0.838863
1,1,1737,1,1,6,16,2248,0.823337
...,...,...,...,...,...,...,...,...
6039,6039,1131,1,1,2,6,466,0.956136
301985,6039,1883,0,1,2,6,466,0.943117
301982,6039,1200,0,1,2,6,466,0.906736
301986,6039,2099,0,1,2,6,466,0.887011


In [None]:
def cal_hitrate(data):
    user_id = data['user_id'].unique()
    hit_count = 0
    for i in user_id:
        user_select_group = data.loc[data['user_id'] == i]
        rec_pos = user_select_group.loc[user_select_group['rating'] == 1]
        if len(rec_pos) != 0:
            hit_count = hit_count + 1
    hit_rate = hit_count / len(user_id)
    return hit_rate

movie_id = select_group['movie_id'].unique()

In [293]:
movie_id = select_group['movie_id'].unique()
movie_id

array([525, 1215, 327, ..., 2409, 3470, 3044], dtype=object)

In [374]:
a = select_group.loc[select_group['movie_id'] == 1].head(5)
a

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
301575,6031,1,0,1,4,7,1855,0
300811,6015,1,0,1,4,1,1289,1
295081,5898,1,0,1,3,17,1059,0
294740,5891,1,0,1,4,2,455,0
292333,5842,1,0,1,3,1,2592,1


In [375]:
b = unselect_group.loc[unselect_group['movie_id'] == 1].head(5)
b

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
298727,5973,1,0,0,2,1,1143,0
296097,5919,1,0,0,2,1,252,0
276712,5523,1,0,0,0,10,961,1
5350,5350,1,1,0,4,1,2655,1
253087,5041,1,0,0,1,2,1903,0


In [320]:
b_index = list(b.index)
b_index

[6113, 6111, 6094, 6110, 1]

In [345]:
a = a.drop(index= b.index)
a['predict'] = 0

In [346]:
b['predict'] = 1
b
a = pd.concat([b,a])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  b['predict'] = 1


In [354]:
temp_data = evaluate_data.drop(index= rec_result.index)
temp_data['predict'] = 0
rec_result['predict'] = 1
final_result = pd.concat([temp_data,rec_result])
final_result =  final_result.sort_values(['user_id','predict'],ascending=False)

In [355]:
final_result

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
6039,6039,1131,1,1,2,6,466,1
301985,6039,1883,0,1,2,6,466,1
301982,6039,1200,0,1,2,6,466,1
301986,6039,2099,0,1,2,6,466,1
301988,6039,2882,0,1,2,6,466,1
...,...,...,...,...,...,...,...,...
6084,0,2767,0,0,0,10,1588,0
6085,0,2201,0,0,0,10,1588,0
6086,0,1111,0,0,0,10,1588,0
6087,0,1460,0,0,0,10,1588,0


In [356]:
select_group,unselect_group = get_evaluate_data(final_result,rule_test)
select_group

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zipcode,predict
6039,6039,1131,1,1,2,6,466,1
301985,6039,1883,0,1,2,6,466,1
301982,6039,1200,0,1,2,6,466,1
301986,6039,2099,0,1,2,6,466,1
301988,6039,2882,0,1,2,6,466,1
...,...,...,...,...,...,...,...,...
6133,1,2769,0,1,6,16,2248,0
6134,1,2992,0,1,6,16,2248,0
6135,1,359,0,1,6,16,2248,0
6136,1,666,0,1,6,16,2248,0


In [378]:
def cal_val_unfairness(select_group,unselect_group):
    movie_id = select_group['movie_id'].unique()
    val_unfairness_list = []
    for i in movie_id:
        user_select_group = select_group.loc[select_group['movie_id'] == i]
        if len(user_select_group) == 0:
            val_uf_se = 0
        else:
            rating_numpy = user_select_group['rating'].to_numpy()
            pre_numpy = user_select_group['predict'].to_numpy()
            val_uf_se = np.mean( pre_numpy - rating_numpy)
        user_unselect_group = unselect_group.loc[unselect_group['movie_id'] == i]
        if len(user_unselect_group) == 0:
            val_uf_unse = 0
        else:
            rating_numpy_unse = user_unselect_group['rating'].to_numpy()
            pre_numpy_unse = user_unselect_group['predict'].to_numpy()
            val_uf_unse = np.mean(pre_numpy_unse - rating_numpy_unse)
        val_unfairness_list.append(val_uf_se - val_uf_unse)
    return(np.mean(val_unfairness_list))
        


In [379]:
x = cal_val_unfairness(a,b)

In [382]:
value_unfairness_list = []
for rule in all_rule_dict:
    #get selected group
    select_group,unselect_group = get_evaluate_data(final_result,rule)
    length_group = len(select_group['user_id'].unique())
    if cut_rule(length_all,length_group,theta) == 1:
        #calculate metrics
        value = cal_val_unfairness(select_group,unselect_group)
        value_unfairness_list.append(value)

In [387]:
max_val = max(value_unfairness_list)
rule = rule_dict[value_unfairness_list.index(max_val)]
max_val,rule

(0.003917036752756318, {'gender': 1, 'age': 2})

In [386]:
value_unfairness_list

[0.0019920039228254805,
 -0.001992003922825481,
 0.0029545308230729908,
 -0.00042271724421300716,
 0.0014951111646339295,
 0.003917036752756318,
 0.0007309179459581483,
 0.002166523111922714,
 0.0032744863854328064,
 0.0019684966756195717,
 -0.0008708950211761689]