In [9]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import os
import pandas as pd
import numpy as np
import pkg_resources
import seaborn as sns
import time
import scipy.stats as stats

from sklearn import metrics
from sklearn import model_selection


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras.models import Model
from keras.models import load_model

## Load and pre-process the data set

In [10]:
train = pd.read_csv('./Data/train.csv')
print('loaded %d records' % len(train))

# Make sure all comment_text values are strings
train['comment_text'] = train['comment_text'].astype(str) 

# List all identities
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Convert taget and identity columns to booleans
def convert_to_bool(df, col_name):
    df[col_name] = np.where(df[col_name] >= 0.5, True, False)
    
def convert_dataframe_to_bool(df):
    bool_df = df.copy()
    for col in ['target'] + identity_columns:
        convert_to_bool(bool_df, col)
    return bool_df

train = convert_dataframe_to_bool(train)

loaded 1804874 records


## Split the data into 80% train and 20% validate sets

In [11]:
train_df, validate_df = model_selection.train_test_split(train, test_size=0.2)
print('%d train comments, %d validate comments' % (len(train_df), len(validate_df)))


1443899 train comments, 360975 validate comments


## Create a text tokenizer

In [12]:
MAX_NUM_WORDS = 10000
TOXICITY_COLUMN = 'target'
TEXT_COLUMN = 'comment_text'

# Create a text tokenizer.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df[TEXT_COLUMN])

# All comments must be truncated or padded to be the same length.
MAX_SEQUENCE_LENGTH = 250
def pad_text(texts, tokenizer):
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)

## Define and train a Convolutional Neural Net for classifying toxic comments

In [13]:
import pickle

# 这里使用保存好的
with open('my_model.pkl', 'rb') as f:
    model = pickle.load(f)


## Generate model predictions on the validation set

In [14]:
MODEL_NAME = 'my_model'
validate_df[MODEL_NAME] = model.predict(pad_text(validate_df[TEXT_COLUMN], tokenizer))[:, 1]

[1m11281/11281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m295s[0m 26ms/step


In [15]:
validate_df.head(20)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,my_model
168790,448414,False,They're atmospheric scientists. There's nothin...,0.0,0.0,0.0,0.166667,0.0,,,...,approved,0,0,0,0,0,0.0,0,6,0.535743
1053007,5404258,False,"Did you miss the over one year of discussions,...",0.0,0.0,0.0,0.0,0.0,,,...,approved,1,0,0,3,0,0.0,0,4,0.002325
1746044,6262511,False,"First, a sitting president can't be indicted\n...",0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,0,1,0.0,0,4,0.006512
150097,425826,False,I'm far from the only one. There are millions ...,0.0,0.0,0.3,0.4,0.0,0.0,0.0,...,approved,0,0,0,0,0,0.0,10,10,0.007217
1793908,6319838,False,"LOL! No, I'm proud of my Ducks. Their goal l...",0.0,0.0,0.0,0.0,0.0,,,...,approved,3,0,0,0,0,0.0,0,4,0.005643
1780006,6303364,False,"Yes, we do.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,approved,0,0,0,0,0,0.0,4,5,0.002806
987136,5325275,False,glad you also noticed that....\nalso there is ...,0.0,0.0,0.166667,0.166667,0.166667,,,...,approved,0,0,0,2,0,0.0,0,6,0.006407
1748910,6265972,False,"True, but Dallas also tried to trade up to the...",0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,0,0,0.0,0,4,0.032981
1290610,5691964,False,Please stop misrepresenting DeTocqueville. To ...,0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,0,0,0.0,0,4,0.0254
513367,872141,False,You are conveniently ignoring the fact that th...,0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,2,0,0.0,0,4,0.336717


## Define bias metrics, then evaluate our new model for bias using the validation set predictions

In [16]:
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, model_name):
    subgroup_examples = df[df[subgroup]]
    return compute_auc(subgroup_examples[label], subgroup_examples[model_name])

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label], examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label], examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)

bias_metrics_df = compute_bias_metrics_for_model(validate_df, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
bias_metrics_df


  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_s

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
6,black,3031,0.532548,0.452617,0.658422
8,psychiatric_or_mental_illness,972,0.549919,0.269056,0.815847
3,christian,8211,0.557027,0.542359,0.59977
5,muslim,4157,0.561796,0.571698,0.575926
4,jewish,1533,0.571313,0.597413,0.555505
7,white,5200,0.57428,0.3329,0.792648
2,homosexual_gay_or_lesbian,2184,0.577169,0.530998,0.63161
1,female,10598,0.589578,0.501455,0.669298
0,male,9011,0.609013,0.47429,0.70978


## Calculate the final score

In [9]:
def calculate_overall_auc(df, model_name):
    true_labels = df[TOXICITY_COLUMN]
    predicted_labels = df[model_name]
    return metrics.roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)
    
get_final_metric(bias_metrics_df, calculate_overall_auc(validate_df, MODEL_NAME))

0.88432813949746

## Prediction on Test data

In [10]:
test = pd.read_csv('./Data/test.csv')
submission = pd.read_csv('./Data/sample_submission.csv', index_col='id')

In [11]:
submission['prediction'] = model.predict(pad_text(test[TEXT_COLUMN], tokenizer))[:, 1]
submission.to_csv('submission.csv')

[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 21ms/step


## Value Alignment

In [29]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [30]:
import pandas as pd
import numpy as np
from sklearn import metrics

# 假设我们已经有一个训练好的模型和验证数据集 validate_df
# 并且模型已经预测了验证集的结果
MODEL_NAME = 'my_model'
TEXT_COLUMN = 'comment_text'
TOXICITY_COLUMN = 'toxic'

# 随机选择10条评论
num_samples = 10
sampled_comments = validate_df.sample(num_samples, random_state=5)


# 获取模型对这些评论的预测结果
predictions = model.predict(pad_text(sampled_comments[TEXT_COLUMN], tokenizer))[:, 1]
predicted_labels = (predictions > 0.5).astype(int)

# 创建一个数据框来比较结果
comparison_df = pd.DataFrame({
    'comment': sampled_comments[TEXT_COLUMN].values,
    'predicted_label': predicted_labels,
    'prediction_score': predictions
})

# 打印比较结果
print(comparison_df)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     comment  \
0  Dave, I agree the was one of the most entertaining baseball games ever.  I often find baseball boring, especially if I don't have a friend or relative playing.  Game 2 was a great battle going back and forth.  How about an umpire taking a ball to his crotch when a steal at second base was in play!                                                                                   

## Demographic Parity

In [1]:
import pandas as pd

# 定义敏感属性
sensitive_attributes = ['male', 'female', 'christian', 'muslim', 'jewish', 'white', 'black']

# 初始化一个字典来存储各子群体的比例
subgroup_positive_rate = {}

# 计算每个子群体的正类预测比例
for attribute in sensitive_attributes:
    subgroup_data = validate_df[validate_df[attribute] == 1]
    positive_rate = subgroup_data[MODEL_NAME].mean()
    subgroup_positive_rate[attribute] = positive_rate

# 计算总体的正类预测比例
overall_positive_rate = validate_df[MODEL_NAME].mean()

# 打印结果
print("Overall Positive Rate: {:.4f}".format(overall_positive_rate))
for attribute, rate in subgroup_positive_rate.items():
    print("Subgroup '{}' Positive Rate: {:.4f}".format(attribute, rate))

# 比较各子群体的预测比例与总体预测比例
for attribute, rate in subgroup_positive_rate.items():
    disparity = abs(rate - overall_positive_rate)
    print("Disparity difference for '{}': {:.4f}".format(attribute, disparity))
    
for attribute, rate in subgroup_positive_rate.items():
    disparity = abs(overall_positive_rate / rate)
    print("Disparity ratio for '{}': {:.4f}".format(attribute, disparity))


NameError: name 'validate_df' is not defined

In [35]:
# 保存模型
import pickle

# save the model
with open('my_model.pkl', 'wb') as f:
    pickle.dump(model, f)

