In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
#For DistilBert
from tokenizers import BertWordPieceTokenizer 
#For XLMR2
from sklearn.metrics import roc_auc_score 
import logging
# No extensive logging: only let ERROR and above through the root logger.
# (logging.NOTSET on the root logger would do the opposite and let every
# record through, which contradicts the intent of this cell.)
logging.getLogger().setLevel(logging.ERROR)

#For eda
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import accuracy_score
!pip install -q googletrans
from googletrans import Translator
from colorama import Fore, Back, Style, init
import plotly.graph_objects as go
# googletrans client; NOTE(review): not referenced again in the visible cells.
translator = Translator()
from tensorflow.keras import layers
from tensorflow.keras.layers import *
from tensorflow.keras.layers import (Dense, Input, LSTM, Bidirectional, Activation, Conv1D, 
                                     GRU,Embedding, Flatten, Dropout, Add, concatenate, MaxPooling1D,
                                     GlobalAveragePooling1D,  GlobalMaxPooling1D, 
                                     GlobalMaxPool1D,SpatialDropout1D)

from tensorflow.keras import (initializers, regularizers, constraints, 
                              optimizers, layers, callbacks)

# Use seaborn's "darkgrid" style (relies on the `import seaborn as sns` earlier in this cell).
sns.set(style="darkgrid")
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
#Same for tpu roberta and bert
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode `texts` to token ids, feeding the tokenizer `chunk_size` texts at a time.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        texts: sliceable sequence whose slices expose `.tolist()`
            (e.g. a numpy array of strings).
        tokenizer: object providing `enable_truncation`, `enable_padding`
            and `encode_batch`; truncation and padding are both set to `maxlen`.
        chunk_size: number of texts passed to each `encode_batch` call.
        maxlen: `max_length` handed to the truncation and padding settings.

    Returns:
        np.ndarray built from the `.ids` of every encoding, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    encoded_ids = []
    chunk_starts = range(0, len(texts), chunk_size)
    for start in tqdm(chunk_starts):
        batch = texts[start:start + chunk_size].tolist()
        encoded_ids += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

    return np.array(encoded_ids)
def regular_encode(texts, tokenizer, maxlen=512):
    """
    Tokenize `texts` via `tokenizer.batch_encode_plus` and return the input ids.

    Padding to `maxlen` is requested from the tokenizer, while attention masks
    and token type ids are switched off; only the ``'input_ids'`` entry of the
    result is used.

    Args:
        texts: batch of texts, passed positionally to `batch_encode_plus`.
        tokenizer: object exposing `batch_encode_plus`.
        maxlen: value forwarded as `max_length`.

    Returns:
        np.ndarray built from the encoded ``'input_ids'``.
    """
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)
def build_model(transformer, max_len=512):
    """
    Build and compile a binary classifier on top of a pretrained transformer.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        transformer: callable transformer layer/model; the first element of
            its output is sliced as ``[:, 0, :]`` to take the first position.
        max_len: length of the int32 token-id input.

    Returns:
        A compiled Keras `Model` mapping token ids to one sigmoid output,
        using Adam (learning rate 1e-5), binary cross-entropy loss and an
        accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use the representation at the first sequence position as the summary vector.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword (the other optimizers in this
    # notebook already use it); `lr` is only a deprecated alias.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Probe for a TPU. The resolver needs no arguments when the TPU_NAME
# environment variable is set, which is always the case on Kaggle.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default TensorFlow distribution strategy; works on CPU and a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
#For tpu roberta
# AUTO is passed to `.prefetch(...)` when the tf.data pipelines are built below.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# NOTE(review): GCS_DS_PATH is not referenced again in the visible cells — confirm it is still needed.
GCS_DS_PATH = KaggleDatasets().get_gcs_path('jigsaw-multilingual-toxic-comment-classification')

# Configuration
# NOTE(review): the XLM-R `model.fit` calls below hard-code epochs=1 instead of using EPOCHS.
EPOCHS = 2
# Global batch size: 16 examples per replica of the distribution strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Used as `maxlen` when encoding and as `max_len` when building the model.
MAX_LEN = 192
MODEL = 'jplu/tf-xlm-roberta-large'

# Tokenizer matching the pretrained checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Competition files: two training sets, plus validation, test and the sample submission.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

# Round the second set's `toxic` column to integer 0/1 labels.
train2['toxic'] = train2['toxic'].round().astype(int)

# Training frame = all of train1 + every toxic row of train2
# + a fixed-seed sample of 100000 non-toxic rows of train2.
text_and_label = ['comment_text', 'toxic']
train2_text_label = train2[text_and_label]
train2_toxic = train2_text_label.query('toxic==1')
train2_non_toxic_sample = train2_text_label.query('toxic==0').sample(n=100000, random_state=0)
train = pd.concat([train1[text_and_label], train2_toxic, train2_non_toxic_sample])

In [None]:
%%time 

# Tokenize every split to MAX_LEN token ids with the tokenizer loaded above.
# Note the test file keeps its text in `content`, not `comment_text`.
x_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(test.content.values, tokenizer, maxlen=MAX_LEN)

# Labels for the two labelled splits (the test split has none here).
y_train = train.toxic.values
y_valid = valid.toxic.values

In [None]:
# Training pipeline: repeat indefinitely, shuffle with a 2048-element buffer,
# batch, then prefetch.
train_slices = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_slices.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: batch, cache the batches, then prefetch.
valid_slices = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_slices.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: features only, batched in their original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

# Load the pretrained transformer and build the classifier inside the
# distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)

In [None]:
# Stage 1: one epoch on the training set, evaluated on the validation set.
# train_dataset is built with .repeat(), so steps_per_epoch bounds the epoch.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=1
)

# Stage 2: continue training for one epoch on the labelled validation set itself.
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=1
)

In [None]:
# Score the test set and write the first model's predictions to the working directory.
sub['toxic'] = model.predict(test_dataset, verbose=1)
sub.to_csv('submission1.csv', index=False)

In [None]:
#For Bert as we start distilbert here
# These names are re-assigned here; the earlier XLM-R cells used the same ones.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# NOTE(review): GCS_DS_PATH is not referenced again in the visible cells — confirm it is still needed.
GCS_DS_PATH = KaggleDatasets().get_gcs_path('jigsaw-multilingual-toxic-comment-classification')

# Configuration
EPOCHS = 2
# Global batch size: 16 examples per replica of the distribution strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192
MODEL = 'distilbert-base-multilingual-cased'

# Replaces the XLM-R tokenizer with the DistilBERT one for the cells below.
tokenizer = transformers.DistilBertTokenizer.from_pretrained(MODEL)

In [None]:
%%time 

# Re-encode every split with the DistilBERT tokenizer (overwrites the XLM-R arrays).
# Note the test file keeps its text in `content`, not `comment_text`.
x_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(test.content.values, tokenizer, maxlen=MAX_LEN)

# Labels for the two labelled splits.
y_train = train.toxic.values
y_valid = valid.toxic.values

In [None]:
# Rebuild the three tf.data pipelines from the DistilBERT-encoded arrays.

# Training: repeat indefinitely, shuffle (buffer of 2048), batch, prefetch.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation: batch, cache the batches, prefetch.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test: features only, batched in their original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test)
test_dataset = test_dataset.batch(BATCH_SIZE)

# Load DistilBERT and build the classifier inside the distribution strategy's scope.
# NOTE(review): the checkpoint name is hard-coded here and duplicates MODEL.
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)

In [None]:
# Stage 1: EPOCHS epochs on the training set, evaluated on the validation set.
# train_dataset is built with .repeat(), so steps_per_epoch bounds each epoch.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

# Stage 2: continue training for EPOCHS epochs on the labelled validation set itself.
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS
)


In [None]:
# Score the test set and write the second model's predictions to the working directory.
sub['toxic'] = model.predict(test_dataset, verbose=1)
sub.to_csv('submission2.csv', index=False)

In [None]:
#For XLMR2
# Settings for the custom training loop defined in the following cells.
MAX_LEN = 192
LR = 1e-5
# Unlike the earlier cells, BATCH_SIZE is now per replica; connect_to_TPU()
# multiplies it by the replica count to get the global batch size.
BATCH_SIZE = 8 # per TPU core, reduced to fit on a TPUv2
# Stage 1 trains on the training set, stage 2 on the validation set.
TOTAL_STEPS_STAGE1 = 2000  # increased the number of steps for smaller batches
VALIDATE_EVERY_STAGE1 = 500
TOTAL_STEPS_STAGE2 = 1000
VALIDATE_EVERY_STAGE2 = 500

PRETRAINED_MODEL = 'jplu/tf-xlm-roberta-large'

# Directory prefix prepended to the competition CSV file names below
D = '../input/jigsaw-multilingual-toxic-comment-classification/'

print(tf.__version__)
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
def connect_to_TPU():
    """Detect the hardware and build the matching distribution strategy.

    Returns:
        tuple: ``(tpu, strategy, global_batch_size)``. ``tpu`` is the cluster
        resolver, or None when resolving raised ValueError;
        ``global_batch_size`` is the module-level ``BATCH_SIZE`` multiplied by
        ``strategy.num_replicas_in_sync``.
    """
    # The resolver needs no arguments when the TPU_NAME environment variable
    # is set, which is always the case on Kaggle.
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        tpu = None

    if not tpu:
        # Default TensorFlow strategy; works on CPU and a single GPU.
        strategy = tf.distribute.get_strategy()
    else:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)

    return tpu, strategy, BATCH_SIZE * strategy.num_replicas_in_sync


def regular_encode1(texts, tokenizer, maxlen=512):
    """Return the padded ``input_ids`` for `texts` as a numpy array.

    Calls `tokenizer.batch_encode_plus` with padding to `maxlen` requested and
    attention masks / token type ids switched off.

    Args:
        texts: batch of texts, passed positionally to `batch_encode_plus`.
        tokenizer: object exposing `batch_encode_plus`.
        maxlen: value forwarded as `max_length`.
    """
    return np.array(
        tokenizer.batch_encode_plus(
            texts,
            max_length=maxlen,
            pad_to_max_length=True,
            return_token_type_ids=False,
            return_attention_masks=False,
        )['input_ids']
    )


def create_dist_dataset(X, y=None, training=False):
    """Build a batched, prefetched and distributed dataset from arrays.

    Reads the module-level ``global_batch_size``, ``AUTO`` and ``strategy``.

    Args:
        X: features; also supplies ``len(X)`` as the shuffle buffer size.
        y: optional labels; when given, elements become ``(feature, label)`` pairs.
        training: when True the data is shuffled and repeated indefinitely.

    Returns:
        The result of ``strategy.experimental_distribute_dataset``.
    """
    features = tf.data.Dataset.from_tensor_slices(X)

    # Pair features with labels only when labels were supplied.
    if y is None:
        dataset = features
    else:
        labels = tf.data.Dataset.from_tensor_slices(y)
        dataset = tf.data.Dataset.zip((features, labels))

    if training:
        dataset = dataset.shuffle(len(X)).repeat()

    batched = dataset.batch(global_batch_size).prefetch(AUTO)
    return strategy.experimental_distribute_dataset(batched)

def create_model_and_optimizer():
    """Create the classifier and its Adam optimizer inside ``strategy.scope()``.

    Loads ``PRETRAINED_MODEL`` with `TFAutoModel`, wraps it with `build_model1`
    and pairs it with Adam (learning rate ``LR``, epsilon 1e-08).

    Returns:
        tuple: ``(model, optimizer)``.
    """
    with strategy.scope():
        backbone = TFAutoModel.from_pretrained(PRETRAINED_MODEL)
        classifier = build_model1(backbone)
        adam = tf.keras.optimizers.Adam(learning_rate=LR, epsilon=1e-08)
    return classifier, adam


def build_model1(transformer):
    """Wrap `transformer` in an (uncompiled) Keras binary classifier.

    Args:
        transformer: callable transformer; the first element of its output is
            used and sliced at the first sequence position.

    Returns:
        Keras `Model` from ``MAX_LEN`` int32 token ids to one sigmoid output.
    """
    token_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")
    # Huggingface transformers return several outputs; the embeddings come first.
    hidden_states = transformer(token_ids)[0]
    # Take the first position only; the paper says it is not worse than pooling.
    first_position = hidden_states[:, 0, :]
    probability = Dense(1, activation='sigmoid')(first_position)
    return Model(inputs=[token_ids], outputs=[probability])


def define_losses_and_metrics():
    """Create the loss function and training metric inside ``strategy.scope()``.

    Returns:
        tuple: ``(compute_loss, train_accuracy_metric)`` where ``compute_loss``
        averages the unreduced binary cross-entropy over the module-level
        ``global_batch_size`` and the metric is a Keras AUC named
        ``'training_AUC'``.
    """
    with strategy.scope():
        # Reduction is disabled so the averaging below controls the scaling.
        bce = tf.keras.losses.BinaryCrossentropy(
            from_logits=False, reduction=tf.keras.losses.Reduction.NONE)

        def compute_loss(labels, predictions):
            return tf.nn.compute_average_loss(
                bce(labels, predictions), global_batch_size=global_batch_size)

        auc_metric = tf.keras.metrics.AUC(name='training_AUC')

    return compute_loss, auc_metric



def train1(train_dist_dataset, val_dist_dataset=None, y_val=None,
          total_steps=5000, validate_every=500):
    """Run the custom training loop for up to `total_steps` steps.

    Note: defining this function rebinds the module-level name ``train1``,
    which an earlier cell uses for a DataFrame.

    Args:
        train_dist_dataset: iterable of batches, each passed to
            `distributed_train_step`.
        val_dist_dataset: optional dataset scored with `predict1` at every
            reporting step.
        y_val: labels compared against those predictions by `roc_auc_score`.
        total_steps: the loop stops once this many steps have run.
        validate_every: reporting interval in steps; the training metric is
            printed and reset at each interval.
    """
    for step, batch in enumerate(train_dist_dataset, start=1):
        distributed_train_step(batch)

        if step % validate_every == 0:
            # Report the training metric accumulated since the last reset.
            print("Step %d, train AUC: %.5f" % (step, train_accuracy_metric.result().numpy()))

            # Exact AUC over the validation predictions, when a set was given.
            if val_dist_dataset:
                val_auc = roc_auc_score(y_val, predict1(val_dist_dataset))
                print("     validation AUC: %.5f" %  val_auc)

            train_accuracy_metric.reset_states()

        if step == total_steps:
            break
@tf.function
def distributed_train_step(data):
    """Run `train_step` on every replica for one distributed batch.

    Args:
        data: one element of the distributed training dataset, forwarded
            unchanged to `train_step`.
    """
    # `Strategy.experimental_run_v2` was renamed to `Strategy.run` and later
    # removed, so prefer `run` and fall back on releases that lack it.
    run_on_replicas = getattr(strategy, 'run', None) or strategy.experimental_run_v2
    run_on_replicas(train_step, args=(data,))

def train_step(inputs):
    """Perform one optimisation step on a ``(features, labels)`` batch.

    Uses the module-level ``model``, ``compute_loss``, ``optimizer`` and
    ``train_accuracy_metric``; the metric is updated with this batch.
    """
    features, labels = inputs

    with tf.GradientTape() as tape:
        predictions = model(features, training=True)
        loss = compute_loss(labels, predictions)

    weights = model.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    train_accuracy_metric.update_state(labels, predictions)

def predict1(dataset):
    """Predict over every batch of `dataset` and stack the results row-wise.

    Each batch goes through `distributed_prediction_step`; its per-replica
    results are stacked first, then all batches are stacked together.

    Returns:
        np.ndarray of the stacked predictions.
    """
    stacked_batches = [
        np.vstack(distributed_prediction_step(batch)) for batch in dataset
    ]
    return np.vstack(stacked_batches)

@tf.function
def distributed_prediction_step(data):
    """Run `prediction_step` on every replica and return the local results.

    Args:
        data: one element of a distributed, label-free dataset.

    Returns:
        The value of ``strategy.experimental_local_results`` for the
        per-replica predictions.
    """
    # `Strategy.experimental_run_v2` was renamed to `Strategy.run` and later
    # removed, so prefer `run` and fall back on releases that lack it.
    run_on_replicas = getattr(strategy, 'run', None) or strategy.experimental_run_v2
    predictions = run_on_replicas(prediction_step, args=(data,))
    return strategy.experimental_local_results(predictions)

def prediction_step(inputs):
    """Return the module-level ``model``'s inference-mode output for `inputs`.

    Datasets used for prediction carry no labels, so `inputs` is the feature
    batch itself.
    """
    return model(inputs, training=False)

In [None]:
# Re-detect the hardware so global_batch_size is derived from the per-replica
# BATCH_SIZE set in the XLMR2 configuration cell.
tpu, strategy, global_batch_size = connect_to_TPU()
print("REPLICAS: ", strategy.num_replicas_in_sync)

# Loss function and training AUC metric used by the custom training loop.
compute_loss, train_accuracy_metric = define_losses_and_metrics()

In [None]:
%%time 
### Load ###
train_df = pd.read_csv(D+'jigsaw-toxic-comment-train.csv')
val_df = pd.read_csv(D+'validation.csv')
test_df = pd.read_csv(D+'test.csv')
sub_df = pd.read_csv(D+'sample_submission.csv')

### Subsample the train dataframe to 50%-50%: keep every toxic row and an ###
### equally sized, fixed-seed sample of the non-toxic rows                ###
toxic_rows = train_df.query('toxic==1')
non_toxic_rows = train_df.query('toxic==0').sample(sum(train_df.toxic),random_state=42)
train_df = pd.concat([toxic_rows, non_toxic_rows])
### Shuffle it just to make sure ###
train_df = train_df.sample(frac=1, random_state = 42)

### Tokenize (the test file keeps its text in `content`) ###
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
X_train = regular_encode1(train_df.comment_text.values, tokenizer, maxlen=MAX_LEN)
X_val = regular_encode1(val_df.comment_text.values, tokenizer, maxlen=MAX_LEN)
X_test = regular_encode1(test_df.content.values, tokenizer, maxlen=MAX_LEN)

### Make the targets column vectors ###
y_train = train_df.toxic.values.reshape(-1,1)
y_val = val_df.toxic.values.reshape(-1,1)

### Create datasets: only the training one carries labels and is shuffled/repeated ###
train_dist_dataset = create_dist_dataset(X_train, y_train, True)
val_dist_dataset   = create_dist_dataset(X_val)
test_dist_dataset  = create_dist_dataset(X_test)

In [None]:
# Module-level model/optimizer read by train_step and prediction_step.
model, optimizer = create_model_and_optimizer()

In [None]:
# Stage 1: train on the balanced training set, reporting validation AUC
# every VALIDATE_EVERY_STAGE1 steps.
train1(train_dist_dataset, val_dist_dataset, y_val,
      TOTAL_STEPS_STAGE1, VALIDATE_EVERY_STAGE1)

In [None]:
%%time
# Stage 2: build a new training dataset from the validation data,
# with targets, shuffling and repeating.
val_dist_dataset_4_training = create_dist_dataset(X_val, y_val, training=True)

# Train again; no validation set is passed, so only the training AUC is printed.
train1(val_dist_dataset_4_training,
      total_steps = TOTAL_STEPS_STAGE2, 
      validate_every = VALIDATE_EVERY_STAGE2)  # not validating but printing now

In [None]:
%%time
# Predict on the test set (first column of the stacked predictions) and write
# the third model's predictions to the working directory.
sub_df['toxic'] = predict1(test_dist_dataset)[:,0]
sub_df.to_csv('submission3.csv', index=False)

In [None]:
# Settings for a second, shorter XLM-R run using the custom training loop.
# These re-assign the constants set for the previous run.
MAX_LEN = 192  #Reduced for quicker execution
LR = 1e-5
# Per replica; connect_to_TPU() multiplies it by the replica count.
BATCH_SIZE = 16 # per TPU core
# Stage 1 trains on the training set, stage 2 on the validation set.
TOTAL_STEPS_STAGE1 = 300
VALIDATE_EVERY_STAGE1 = 100
TOTAL_STEPS_STAGE2 = 200
VALIDATE_EVERY_STAGE2 = 100

PRETRAINED_MODEL = 'jplu/tf-xlm-roberta-large'
# Directory prefix prepended to the competition CSV file names below.
D = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/'

import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
import transformers
from transformers import TFAutoModel, AutoTokenizer
import logging
# No extensive logging: only let ERROR and above through the root logger.
# (logging.NOTSET on the root logger would do the opposite and let every
# record through, which contradicts the intent of this cell.)
logging.getLogger().setLevel(logging.ERROR)

# Passed to `.prefetch(...)` by create_dist_dataset1 below.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# Re-detect the hardware so global_batch_size is recomputed from the new
# per-replica BATCH_SIZE set in the configuration cell above.
tpu, strategy, global_batch_size = connect_to_TPU()
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Load the competition files.
train_df = pd.read_csv(D+'jigsaw-toxic-comment-train.csv')
val_df = pd.read_csv(D+'validation.csv')
test_df = pd.read_csv(D+'test.csv')
sub_df = pd.read_csv(D+'sample_submission.csv')

# Subsample the train dataframe to 50%-50%: every toxic row plus an equally
# sized, fixed-seed sample of the non-toxic rows.
toxic_part = train_df.query('toxic==1')
non_toxic_part = train_df.query('toxic==0').sample(sum(train_df.toxic),random_state=42)
train_df = pd.concat([toxic_part, non_toxic_part])
# Shuffle it just to make sure.
train_df = train_df.sample(frac=1, random_state = 42)

In [None]:
%%time

def regular_encode3(texts, tokenizer, maxlen=512):
    """Encode `texts` with `tokenizer.batch_encode_plus` and return the ids.

    Padding to `maxlen` is requested; attention masks and token type ids are
    switched off, and only ``'input_ids'`` is read from the result.

    Args:
        texts: batch of texts, passed positionally to `batch_encode_plus`.
        tokenizer: object exposing `batch_encode_plus`.
        maxlen: value forwarded as `max_length`.

    Returns:
        np.ndarray built from the encoded ``'input_ids'``.
    """
    encoding = tokenizer.batch_encode_plus(
        texts,
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    ids = encoding['input_ids']
    return np.array(ids)
    

# Tokenize every split to MAX_LEN token ids; the test file keeps its text in `content`.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
X_train = regular_encode3(train_df.comment_text.values, tokenizer, maxlen=MAX_LEN)
X_val = regular_encode3(val_df.comment_text.values, tokenizer, maxlen=MAX_LEN)
X_test = regular_encode3(test_df.content.values, tokenizer, maxlen=MAX_LEN)

# Targets reshaped into column vectors.
y_train = train_df.toxic.values.reshape(-1,1)
y_val = val_df.toxic.values.reshape(-1,1)

In [None]:
def create_dist_dataset1(X, y=None, training=False):
    """Turn arrays into a batched, prefetched, distributed dataset.

    Reads the module-level ``global_batch_size``, ``AUTO`` and ``strategy``.

    Args:
        X: features; ``len(X)`` is also used as the shuffle buffer size.
        y: optional labels, zipped with the features when provided.
        training: when True the data is shuffled and repeated indefinitely.

    Returns:
        The result of ``strategy.experimental_distribute_dataset``.
    """
    dataset = tf.data.Dataset.from_tensor_slices(X)

    has_labels = y is not None
    if has_labels:
        label_dataset = tf.data.Dataset.from_tensor_slices(y)
        dataset = tf.data.Dataset.zip((dataset, label_dataset))

    if training:
        shuffled = dataset.shuffle(len(X))
        dataset = shuffled.repeat()

    dataset = dataset.batch(global_batch_size)
    dataset = dataset.prefetch(AUTO)

    return strategy.experimental_distribute_dataset(dataset)
    
    
# Only the training dataset carries labels and is shuffled/repeated;
# validation and test hold features only.
train_dist_dataset = create_dist_dataset1(X_train, y_train, True)
val_dist_dataset   = create_dist_dataset1(X_val)
test_dist_dataset  = create_dist_dataset1(X_test)

In [None]:
%%time

def create_model_and_optimizer3():
    """Build the classifier and its Adam optimizer inside ``strategy.scope()``.

    Loads ``PRETRAINED_MODEL`` with `TFAutoModel`, wraps it with `build_model3`
    and creates Adam with learning rate ``LR`` and epsilon 1e-08.

    Returns:
        tuple: ``(model, optimizer)``.
    """
    with strategy.scope():
        classifier = build_model3(TFAutoModel.from_pretrained(PRETRAINED_MODEL))
        adam = tf.keras.optimizers.Adam(learning_rate=LR, epsilon=1e-08)
    return classifier, adam


def build_model3(transformer):
    """Wrap `transformer` in an (uncompiled) Keras binary classifier.

    Args:
        transformer: callable transformer; the first element of its output is
            used and sliced at the first sequence position.

    Returns:
        Keras `Model` from ``MAX_LEN`` int32 token ids to one sigmoid output.
    """
    word_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")
    # Huggingface transformers return several outputs; the embeddings come first.
    embeddings = transformer(word_ids)[0]
    # Take the first position only; the paper says it is not worse than pooling.
    summary = embeddings[:, 0, :]
    score = Dense(1, activation='sigmoid')(summary)
    classifier = Model(inputs=[word_ids], outputs=[score])
    return classifier


# Rebinds the module-level model/optimizer read by train_step and prediction_step.
model, optimizer = create_model_and_optimizer3()


In [None]:
def define_losses_and_metrics():
    """Create the loss function and training metric inside ``strategy.scope()``.

    Note: this re-defines a function of the same name from an earlier cell.

    Returns:
        tuple: ``(compute_loss, train_accuracy_metric)``. ``compute_loss``
        averages the unreduced binary cross-entropy over the module-level
        ``global_batch_size``; the metric is a Keras AUC named
        ``'training_AUC'``.
    """
    with strategy.scope():
        # Reduction is disabled so the averaging below controls the scaling.
        unreduced_bce = tf.keras.losses.BinaryCrossentropy(
            from_logits=False,
            reduction=tf.keras.losses.Reduction.NONE,
        )

        def compute_loss(labels, predictions):
            example_losses = unreduced_bce(labels, predictions)
            return tf.nn.compute_average_loss(
                example_losses, global_batch_size=global_batch_size)

        training_auc = tf.keras.metrics.AUC(name='training_AUC')

    return compute_loss, training_auc



def train(train_dist_dataset, val_dist_dataset=None, y_val=None,
          total_steps=5000, validate_every=500):
    """Run the custom training loop for up to `total_steps` steps.

    Note: defining this function rebinds the module-level name ``train``,
    which earlier cells use for the training DataFrame.

    Args:
        train_dist_dataset: iterable of batches, each passed to
            `distributed_train_step`.
        val_dist_dataset: optional dataset scored with `predict` at every
            reporting step.
        y_val: labels compared against those predictions by `roc_auc_score`.
        total_steps: the loop stops once this many steps have run.
        validate_every: reporting interval in steps; the training metric is
            printed and reset at each interval.
    """
    for step, batch in enumerate(train_dist_dataset, start=1):
        distributed_train_step(batch)

        is_report_step = step % validate_every == 0
        if is_report_step:
            # Training metric accumulated since the previous reset.
            train_auc = train_accuracy_metric.result().numpy()
            print("Step %d, train AUC: %.5f" % (step, train_auc))

            # Exact AUC over the validation predictions, when a set was given.
            if val_dist_dataset:
                val_auc = roc_auc_score(y_val, predict(val_dist_dataset))
                print("     validation AUC: %.5f" %  val_auc)

            train_accuracy_metric.reset_states()

        if step == total_steps:
            break



@tf.function
def distributed_train_step(data):
    """Run `train_step` on every replica for one distributed batch.

    Args:
        data: one element of the distributed training dataset, forwarded
            unchanged to `train_step`.
    """
    # `Strategy.experimental_run_v2` was renamed to `Strategy.run` and later
    # removed, so prefer `run` and fall back on releases that lack it.
    run_on_replicas = getattr(strategy, 'run', None) or strategy.experimental_run_v2
    run_on_replicas(train_step, args=(data,))

def train_step(inputs):
    """Perform one optimisation step on a ``(features, labels)`` batch.

    Uses the module-level ``model``, ``compute_loss``, ``optimizer`` and
    ``train_accuracy_metric``; the metric is updated with this batch.
    """
    features, labels = inputs

    with tf.GradientTape() as tape:
        predictions = model(features, training=True)
        loss = compute_loss(labels, predictions)

    trainable = model.trainable_variables
    grads_and_vars = zip(tape.gradient(loss, trainable), trainable)
    optimizer.apply_gradients(grads_and_vars)

    train_accuracy_metric.update_state(labels, predictions)




def predict(dataset):
    """Predict over every batch of `dataset` and stack the results row-wise.

    Each batch goes through `distributed_prediction_step`; its per-replica
    results are stacked first, then all batches are stacked together.

    Returns:
        np.ndarray of the stacked predictions.
    """
    per_batch = []
    for batch in dataset:
        replica_outputs = distributed_prediction_step(batch)
        per_batch.append(np.vstack(replica_outputs))
    return np.vstack(per_batch)

@tf.function
def distributed_prediction_step(data):
    """Run `prediction_step` on every replica and return the local results.

    Args:
        data: one element of a distributed, label-free dataset.

    Returns:
        The value of ``strategy.experimental_local_results`` for the
        per-replica predictions.
    """
    # `Strategy.experimental_run_v2` was renamed to `Strategy.run` and later
    # removed, so prefer `run` and fall back on releases that lack it.
    run_on_replicas = getattr(strategy, 'run', None) or strategy.experimental_run_v2
    predictions = run_on_replicas(prediction_step, args=(data,))
    return strategy.experimental_local_results(predictions)

def prediction_step(inputs):
    """Return the module-level ``model``'s inference-mode output for `inputs`.

    Datasets used for prediction carry no labels, so `inputs` is the feature
    batch itself.
    """
    outputs = model(inputs, training=False)
    return outputs


# Rebinds the module-level loss function and AUC metric read by train_step and train.
compute_loss, train_accuracy_metric = define_losses_and_metrics()

In [None]:
%%time
# Stage 1: train on the balanced training set, reporting validation AUC
# every VALIDATE_EVERY_STAGE1 steps.
train(train_dist_dataset, val_dist_dataset, y_val,
      TOTAL_STEPS_STAGE1, VALIDATE_EVERY_STAGE1)

In [None]:
%%time
# Stage 2: build a new training dataset from the validation data,
# with targets, shuffling and repeating.
val_dist_dataset_4_training = create_dist_dataset1(X_val, y_val, training=True)

# Train again; no validation set is passed, so only the training AUC is printed.
train(val_dist_dataset_4_training,
      total_steps = TOTAL_STEPS_STAGE2, 
      validate_every = VALIDATE_EVERY_STAGE2)  # not validating but printing now

In [None]:
%%time
# Predict on the test set (first column of the stacked predictions) and write
# this run's predictions to the working directory as 'submission5.csv'.
sub_df['toxic'] = predict(test_dist_dataset)[:,0]
sub_df.to_csv('submission5.csv', index=False)

In [None]:
# Read back the per-model predictions. Every model above saved its file with a
# bare relative name (e.g. `to_csv('submission1.csv')`), i.e. into the current
# working directory, so the same relative paths are used here; nothing in this
# notebook writes to '/kaggle/output/'.
submission1 = pd.read_csv('submission1.csv')
submission2 = pd.read_csv('submission2.csv')
submission3 = pd.read_csv('submission3.csv')
# The fourth model's predictions were saved under the name 'submission5.csv'.
submission4 = pd.read_csv('submission5.csv')

In [None]:
# Weighted blend of the four models' `toxic` scores, accumulated in the fixed
# order submission1..submission4; the result overwrites submission1['toxic'].
blend_members = (
    (submission1, .312),
    (submission2, .198),
    (submission3, .130),
    (submission4, .360),
)
blended_toxic = None
for member, weight in blend_members:
    weighted = (member['toxic'])*weight
    blended_toxic = weighted if blended_toxic is None else blended_toxic + weighted
submission1['toxic'] = blended_toxic
# Histogram of the blended scores.
submission1['toxic'].hist(bins=100)

In [None]:
# Write the blended predictions as the final submission file and preview the first rows.
submission1.to_csv('submission.csv', index=False)
submission1.head()