In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import os
import pandas as pd
import numpy as np
import pkg_resources
import seaborn as sns
import time
import scipy.stats as stats

from sklearn import metrics
from sklearn import model_selection


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras.models import Model
from keras.models import load_model

## Load and pre-process the data set

In [2]:
train = pd.read_csv('./Data/train.csv')
print('loaded %d records' % len(train))

# Make sure all comment_text values are strings
train['comment_text'] = train['comment_text'].astype(str) 

# List all identities
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Convert taget and identity columns to booleans
def convert_to_bool(df, col_name):
    df[col_name] = np.where(df[col_name] >= 0.5, True, False)
    
def convert_dataframe_to_bool(df):
    bool_df = df.copy()
    for col in ['target'] + identity_columns:
        convert_to_bool(bool_df, col)
    return bool_df

train = convert_dataframe_to_bool(train)

loaded 1804874 records


## Split the data into 80% train and 20% validate sets

In [3]:
train_df, validate_df = model_selection.train_test_split(train, test_size=0.2)
print('%d train comments, %d validate comments' % (len(train_df), len(validate_df)))


1443899 train comments, 360975 validate comments


## Create a text tokenizer

In [4]:
MAX_NUM_WORDS = 10000
TOXICITY_COLUMN = 'target'
TEXT_COLUMN = 'comment_text'

# Create a text tokenizer.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df[TEXT_COLUMN])

# All comments must be truncated or padded to be the same length.
MAX_SEQUENCE_LENGTH = 250
def pad_text(texts, tokenizer):
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)

## Define and train a Convolutional Neural Net for classifying toxic comments

In [5]:
EMBEDDINGS_PATH = './Embedding_file/glove.6B.100d.txt'
EMBEDDINGS_DIMENSION = 100
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

def train_model(train_df, validate_df, tokenizer):
    # Prepare data
    train_text = pad_text(train_df[TEXT_COLUMN], tokenizer)
    train_labels = to_categorical(train_df[TOXICITY_COLUMN])
    validate_text = pad_text(validate_df[TEXT_COLUMN], tokenizer)
    validate_labels = to_categorical(validate_df[TOXICITY_COLUMN])

    # Load embeddings
    print('loading embeddings')
    embeddings_index = {}
    with open(EMBEDDINGS_PATH, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1,
                                 EMBEDDINGS_DIMENSION))
    num_words_in_embedding = 0
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            num_words_in_embedding += 1
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # Create model layers.
    def get_convolutional_neural_net_layers():
        """Returns (input_layer, output_layer)"""
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                    EMBEDDINGS_DIMENSION,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)
        x = embedding_layer(sequence_input)
        x = Conv1D(128, 2, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 3, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 4, activation='relu', padding='same')(x)
        x = MaxPooling1D(40, padding='same')(x)
        x = Flatten()(x)
        x = Dropout(DROPOUT_RATE)(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(2, activation='softmax')(x)
        return sequence_input, preds

    # Compile model.
    print('compiling model')
    input_layer, output_layer = get_convolutional_neural_net_layers()
    model = Model(input_layer, output_layer)
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(learning_rate=LEARNING_RATE),
                  metrics=['acc'])

    # Train model.
    print('training model')
    model.fit(train_text,
              train_labels,
              batch_size=BATCH_SIZE,
              epochs=NUM_EPOCHS,
              validation_data=(validate_text, validate_labels),
              verbose=2)

    return model

model = train_model(train_df, validate_df, tokenizer)

loading embeddings
compiling model




training model
Epoch 1/10
11281/11281 - 1059s - 94ms/step - acc: 0.9326 - loss: 0.1966 - val_acc: 0.9376 - val_loss: 0.1724
Epoch 2/10
11281/11281 - 969s - 86ms/step - acc: 0.9422 - loss: 0.1626 - val_acc: 0.9437 - val_loss: 0.1576
Epoch 3/10
11281/11281 - 921s - 82ms/step - acc: 0.9444 - loss: 0.1548 - val_acc: 0.9450 - val_loss: 0.1536
Epoch 4/10
11281/11281 - 908s - 80ms/step - acc: 0.9456 - loss: 0.1504 - val_acc: 0.9455 - val_loss: 0.1511
Epoch 5/10
11281/11281 - 850s - 75ms/step - acc: 0.9463 - loss: 0.1476 - val_acc: 0.9456 - val_loss: 0.1517
Epoch 6/10
11281/11281 - 1042s - 92ms/step - acc: 0.9470 - loss: 0.1455 - val_acc: 0.9451 - val_loss: 0.1512
Epoch 7/10
11281/11281 - 1028s - 91ms/step - acc: 0.9474 - loss: 0.1439 - val_acc: 0.9460 - val_loss: 0.1497
Epoch 8/10
11281/11281 - 969s - 86ms/step - acc: 0.9477 - loss: 0.1427 - val_acc: 0.9450 - val_loss: 0.1505
Epoch 9/10
11281/11281 - 795s - 71ms/step - acc: 0.9481 - loss: 0.1414 - val_acc: 0.9455 - val_loss: 0.1494
Epoch 10/1

## Generate model predictions on the validation set

In [6]:
MODEL_NAME = 'my_model'
validate_df[MODEL_NAME] = model.predict(pad_text(validate_df[TEXT_COLUMN], tokenizer))[:, 1]

[1m11281/11281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 22ms/step


In [21]:
validate_df.head(20)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,my_model
1601749,6081961,False,"I haven't read the article yet, and am incline...",0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,2,0,0.0,0,4,0.001639
1252708,5646267,True,A man dies and all you want to do is blame it ...,0.0,0.166667,0.0,0.833333,0.0,0.0,0.0,...,approved,1,0,1,9,0,0.0,4,6,0.938627
516364,875620,False,"Koncerned,\n\n We should be looking at sal...",0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,2,2,0.0,0,4,0.002605
35165,284866,False,For every nominee that the Republican majority...,0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,10,0,0.0,0,4,0.003578
1172552,5548794,False,The only reason why he's a Canadian is that hi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,rejected,0,0,0,1,0,0.0,4,4,0.00469
492860,848141,False,(Cont'd)\nThat leadership must a) demonstrate ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,approved,0,0,0,5,0,0.0,10,4,0.019165
291512,598895,False,Both Duffy and Wright should be prohibited fro...,0.0,0.0,0.0,0.166667,0.0,,,...,approved,0,0,0,1,0,0.0,0,6,0.003448
422427,760662,False,no it's not Y+A= No it's not zero populatio...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,approved,0,1,0,0,0,0.0,4,4,0.228106
1609610,6092261,False,"Instead of copying the company news release, i...",0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,0,0,0.0,0,4,0.000977
30107,278708,False,LM - You have no state taxes withheld from you...,0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,3,0,0.0,0,4,0.00529


## Define bias metrics, then evaluate our new model for bias using the validation set predictions

In [8]:
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, model_name):
    subgroup_examples = df[df[subgroup]]
    return compute_auc(subgroup_examples[label], subgroup_examples[model_name])

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label], examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label], examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)

bias_metrics_df = compute_bias_metrics_for_model(validate_df, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
bias_metrics_df


  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_s

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,2171,0.796211,0.773837,0.953297
6,black,2950,0.799785,0.767659,0.956518
5,muslim,4246,0.819911,0.816852,0.945829
7,white,4937,0.821602,0.768996,0.961558
4,jewish,1469,0.841897,0.863368,0.92795
8,psychiatric_or_mental_illness,995,0.872422,0.858607,0.945082
0,male,8875,0.875165,0.862522,0.943594
1,female,10670,0.880328,0.878394,0.936239
3,christian,8092,0.893468,0.913464,0.916502


## Calculate the final score

In [9]:
def calculate_overall_auc(df, model_name):
    true_labels = df[TOXICITY_COLUMN]
    predicted_labels = df[model_name]
    return metrics.roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)
    
get_final_metric(bias_metrics_df, calculate_overall_auc(validate_df, MODEL_NAME))

0.88432813949746

## Prediction on Test data

In [10]:
test = pd.read_csv('./Data/test.csv')
submission = pd.read_csv('./Data/sample_submission.csv', index_col='id')

In [11]:
submission['prediction'] = model.predict(pad_text(test[TEXT_COLUMN], tokenizer))[:, 1]
submission.to_csv('submission.csv')

[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 21ms/step


## Value Alignment

In [29]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [30]:
import pandas as pd
import numpy as np
from sklearn import metrics

# 假设我们已经有一个训练好的模型和验证数据集 validate_df
# 并且模型已经预测了验证集的结果
MODEL_NAME = 'my_model'
TEXT_COLUMN = 'comment_text'
TOXICITY_COLUMN = 'toxic'

# 随机选择10条评论
num_samples = 10
sampled_comments = validate_df.sample(num_samples, random_state=5)


# 获取模型对这些评论的预测结果
predictions = model.predict(pad_text(sampled_comments[TEXT_COLUMN], tokenizer))[:, 1]
predicted_labels = (predictions > 0.5).astype(int)

# 创建一个数据框来比较结果
comparison_df = pd.DataFrame({
    'comment': sampled_comments[TEXT_COLUMN].values,
    'predicted_label': predicted_labels,
    'prediction_score': predictions
})

# 打印比较结果
print(comparison_df)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     comment  \
0  Dave, I agree the was one of the most entertaining baseball games ever.  I often find baseball boring, especially if I don't have a friend or relative playing.  Game 2 was a great battle going back and forth.  How about an umpire taking a ball to his crotch when a steal at second base was in play!                                                                                   

## Demographic Parity

In [36]:
import pandas as pd

# 定义敏感属性
sensitive_attributes = ['male', 'female', 'christian', 'muslim', 'jewish', 'white', 'black']

# 初始化一个字典来存储各子群体的比例
subgroup_positive_rate = {}

# 计算每个子群体的正类预测比例
for attribute in sensitive_attributes:
    subgroup_data = validate_df[validate_df[attribute] == 1]
    positive_rate = subgroup_data[MODEL_NAME].mean()
    subgroup_positive_rate[attribute] = positive_rate

# 计算总体的正类预测比例
overall_positive_rate = validate_df[MODEL_NAME].mean()

# 打印结果
print("Overall Positive Rate: {:.4f}".format(overall_positive_rate))
for attribute, rate in subgroup_positive_rate.items():
    print("Subgroup '{}' Positive Rate: {:.4f}".format(attribute, rate))

# 比较各子群体的预测比例与总体预测比例
for attribute, rate in subgroup_positive_rate.items():
    disparity = abs(rate - overall_positive_rate)
    print("Disparity difference for '{}': {:.4f}".format(attribute, disparity))
    
for attribute, rate in subgroup_positive_rate.items():
    disparity = abs(overall_positive_rate / rate)
    print("Disparity ratio for '{}': {:.4f}".format(attribute, disparity))


Overall Positive Rate: 0.0690
Subgroup 'male' Positive Rate: 0.1263
Subgroup 'female' Positive Rate: 0.1110
Subgroup 'christian' Positive Rate: 0.0704
Subgroup 'muslim' Positive Rate: 0.1662
Subgroup 'jewish' Positive Rate: 0.1175
Subgroup 'white' Positive Rate: 0.2253
Subgroup 'black' Positive Rate: 0.2327
Disparity difference for 'male': 0.0573
Disparity difference for 'female': 0.0420
Disparity difference for 'christian': 0.0014
Disparity difference for 'muslim': 0.0972
Disparity difference for 'jewish': 0.0484
Disparity difference for 'white': 0.1563
Disparity difference for 'black': 0.1637
Disparity ratio for 'male': 0.5466
Disparity ratio for 'female': 0.6217
Disparity ratio for 'christian': 0.9803
Disparity ratio for 'muslim': 0.4154
Disparity ratio for 'jewish': 0.5877
Disparity ratio for 'white': 0.3064
Disparity ratio for 'black': 0.2967
