# Initialisation

In [None]:
#!pip install torch transformers sentencepiece

## Imports

In [None]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoModelForCausalLM , AutoTokenizer, pipeline, RobertaTokenizer, RobertaForMaskedLM, AlbertTokenizer, AlbertModel, AlbertForMaskedLM
import logging
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from afinn import Afinn
from tqdm import tqdm
import unidecode
from time import sleep
logging.basicConfig(level=logging.INFO)# OPTIONAL

## Mac Settings

In [None]:
print(f"PyTorch version: {torch.__version__}")

# Set the device: Apple MPS is preferred, then CUDA, and the CPU is the fallback.
# Note that the MPS branch stores the plain string "mps", whereas the other two
# branches store torch.device objects.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Definitions

In [None]:
IDENTITIES = 'identities'
ORIENTATION = 'orientation'
OTHER = 'other'
PRONOUNS = 'pronouns'
MASKBERT_ = '\[MASK\]'
MASKBERT= '[MASK]'
MASKROBERT = '<mask>'
TARGET = '<target>'
NOM = '<nom>'
POSDEP = '<pos_dep>'
ACC = '<acc>'
BE = '<be>'
QUEER = 'queer'
NONQUEER = 'non-queer'

#TEMPLATES
TEMPLATE_NOZZA = '../src/templates/template_nozza.csv'
TEMPLATE_NOZZA_COMPLETE = '../src/templates/template_nozza_complete.csv'
TEMPLATE_TOXIC2 = '../src/templates/template_toxic2.csv'
TEMPLATE_TOXIC2_COMPLETE = '../src/templates/template_toxic2_complete.csv'
TEMPLATE_TOXIC1 = '../src/templates/template_toxic1.csv'
TEMPLATE_TOXIC1_COMPLETE = '../src/templates/template_toxic1_complete.csv'
TEMPLATE_TOXIC1_CHUNK = '../src/templates/toxic1/template_toxic1'
TEMPLATE_TOXIC2_CHUNK = '../src/templates/toxic2/template_toxic2'

#IDENTITIES CSV
IDENTITIES_CSV = '../src/templates/queer_identities/identities.csv'
PRONOUNS_CSV = '../src/templates/queer_identities/pronouns.csv'

#MODELS
BERT_BASE = 'bert-base-uncased'
BERT_LARGE = 'bert-large-uncased'
ROBERTA_BASE = 'roberta-base'
ROBERTA_LARGE = 'roberta-large'
GPT2 = 'gpt2'

# Template Builder class

In [None]:
class TemplateBuilder():
    """Expand sentence templates with identity terms and pronouns.

    On construction the builder reads three ';'-separated CSV files (the
    template file given by ``template_path``, ``IDENTITIES_CSV`` and
    ``PRONOUNS_CSV``) and immediately runs the builder that matches
    ``template_path``. The expanded sentences are stored in ``self.data``
    and written to the corresponding ``*_COMPLETE`` CSV file (the toxic
    builders additionally write chunk files).

    Columns read by the code:
      * template file: ``template``
      * identities file: ``identity``, ``type``
      * pronouns file: ``nom``, ``be``, ``type`` (and ``pos_dep`` for toxic2)
    """

    # Maximum number of rows written to each chunk file by the toxic builders.
    CHUNK_SIZE = 500000

    def __init__(self, template_path):
        """Load the input CSVs and build the dataset for ``template_path``.

        Args:
            template_path: Path of the template CSV; it is compared against
                TEMPLATE_NOZZA / TEMPLATE_TOXIC1 to select the builder.
        """
        self.data = []
        self.template_path = template_path
        self.template_file = pd.read_csv(template_path, sep=";")
        self.template_identities = pd.read_csv(IDENTITIES_CSV, sep=';')
        self.template_pronouns = pd.read_csv(PRONOUNS_CSV, sep=';')
        self.template_builder()

    def read_csv(self):
        """Re-read the template file with categorical column dtypes.

        This helper is not called by ``__init__``.

        Returns:
            The template file as a DataFrame.

        Raises:
            ValueError: If ``template_path`` is not one of the known template
                files (previously this surfaced as an UnboundLocalError).
        """
        if self.template_path == TEMPLATE_NOZZA:
            dtypes = {'template': 'category', 'type': 'category'}
        elif self.template_path in (TEMPLATE_TOXIC1, TEMPLATE_TOXIC2):
            dtypes = {'template': 'category'}
        else:
            raise ValueError(f"Unknown template path: {self.template_path!r}")
        return pd.read_csv(self.template_path, sep=";", dtype=dtypes)

    def template_builder(self):
        """Run the builder that matches ``template_path``.

        Any path other than TEMPLATE_NOZZA or TEMPLATE_TOXIC1 is handled by
        the toxic2 builder.
        """
        if self.template_path == TEMPLATE_NOZZA:
            self.template_nozza()
        elif self.template_path == TEMPLATE_TOXIC1:
            self.template_toxic1()
        else:
            self.template_toxic2()

    @staticmethod
    def _fill(sentence, replacements):
        """Return ``sentence`` with each placeholder replaced literally.

        Args:
            sentence: Template sentence containing placeholders.
            replacements: Ordered iterable of ``(placeholder, value)`` pairs,
                applied one after the other.

        Plain ``str.replace`` is used instead of ``re.sub`` so that a
        backslash in a CSV-provided value is kept as-is rather than being
        interpreted as a regex replacement escape.
        """
        for placeholder, value in replacements:
            sentence = sentence.replace(placeholder, value)
        return sentence

    def template_nozza(self):
        """Build the nozza dataset.

        Every template sentence is combined with every identity (subject
        "The <identity> person", verb "is") and with every pronoun (subject
        and verb taken from the ``nom`` and ``be`` columns). The result is
        stored in ``self.data``, displayed, and written to
        TEMPLATE_NOZZA_COMPLETE.
        """
        # Materialise the rows once instead of calling iterrows() per template.
        identities = self.template_identities.to_dict(orient='records')
        pronouns = self.template_pronouns.to_dict(orient='records')
        rows = []
        for sentence in tqdm(self.template_file['template'], total=self.template_file.shape[0], desc='Creating template', unit='sentences'):
            for identity in identities:
                subject = f"The {identity['identity']} person"
                rows.append([
                    self._fill(sentence, [(TARGET, subject), (BE, 'is')]),  # new template
                    identity['identity'],  # identity
                    identity['type'],  # type identity
                ])
            for pronoun in pronouns:
                rows.append([
                    self._fill(sentence, [(TARGET, pronoun['nom']), (BE, pronoun['be'])]),  # new template
                    pronoun['nom'],  # identity
                    pronoun['type'],  # type identity
                ])
        data_df = pd.DataFrame(rows, columns=["new_template", "identity", "type"])
        self.data = data_df
        # display() is not imported in the visible cells; presumably the
        # notebook (IPython) built-in.
        display(data_df)
        data_df.to_csv(TEMPLATE_NOZZA_COMPLETE, sep=';')

    def template_toxic1(self):
        """Build the toxic1 dataset (pronoun placeholder ``<nom>``)."""
        self._build_toxic(NOM, 'nom', TEMPLATE_TOXIC1_COMPLETE, TEMPLATE_TOXIC1_CHUNK)

    def template_toxic2(self):
        """Build the toxic2 dataset (pronoun placeholder ``<pos_dep>``)."""
        self._build_toxic(POSDEP, 'pos_dep', TEMPLATE_TOXIC2_COMPLETE, TEMPLATE_TOXIC2_CHUNK)

    def _build_toxic(self, placeholder, pronoun_column, complete_path, chunk_prefix):
        """Shared implementation of the toxic1 / toxic2 builders.

        For every template sentence two kinds of rows are produced:
          * identity x pronoun: the target becomes "The <identity> person"
            and ``placeholder`` / ``<be>`` are filled from the pronoun row;
          * pronoun only: the target, ``<be>`` and ``placeholder`` are all
            filled from the same pronoun row.

        Args:
            placeholder: Pronoun placeholder to substitute (NOM or POSDEP).
            pronoun_column: Column of the pronouns CSV that provides the
                value for ``placeholder``.
            complete_path: Path of the complete ';'-separated CSV to write.
            chunk_prefix: Filename prefix of the chunk files
                (``<prefix>_chunk<i>.csv``).
        """
        # Materialise the rows once instead of calling iterrows() per template.
        identities = self.template_identities.to_dict(orient='records')
        pronouns = self.template_pronouns.to_dict(orient='records')
        rows = []
        for sentence in tqdm(self.template_file['template'], total=self.template_file.shape[0], desc='Creating template', unit='sentences'):
            for identity in identities:
                with_identity = self._fill(sentence, [(TARGET, f"The {identity['identity']} person")])
                for pronoun in pronouns:
                    rows.append([
                        self._fill(with_identity, [(placeholder, pronoun[pronoun_column]), (BE, pronoun['be'])]),  # new template
                        identity['identity'],  # identity
                        identity['type'],  # type identity
                        pronoun['nom'],  # pronouns nom
                        pronoun['type'],  # type pronouns
                    ])
            for pronoun in pronouns:
                rows.append([
                    self._fill(sentence, [(TARGET, pronoun['nom']), (BE, pronoun['be']), (placeholder, pronoun[pronoun_column])]),  # new template
                    pronoun['nom'],  # identity
                    pronoun['type'],  # type identity
                    pronoun['nom'],  # pronouns nom
                    pronoun['type'],  # type pronouns
                ])
        columns = ["new_template", "identity", "type_identity", "pronoun", "pronouns_type"]
        data_df = pd.DataFrame(rows, columns=columns)
        for column in columns:
            data_df[column] = data_df[column].astype('category')
        self.data = data_df
        # display() is not imported in the visible cells; presumably the
        # notebook (IPython) built-in.
        display(data_df)
        # The complete file must be written with the same separator it is
        # read back with below (toxic2 used to write it comma-separated).
        data_df.to_csv(complete_path, sep=';')
        # Ceiling division: number of chunk files the reader will yield.
        n_chunks = -(-len(data_df) // self.CHUNK_SIZE)
        reader = pd.read_csv(complete_path, chunksize=self.CHUNK_SIZE, sep=';', dtype=dict.fromkeys(columns, 'category'))
        for i, chunk in tqdm(enumerate(reader), total=n_chunks, desc='Creating chunks', unit='chunks'):
            chunk.to_csv(chunk_prefix+'_chunk{}.csv'.format(i), index=False, sep=';')

    

In [None]:
# Choose the template to expand: constructing the builder reads the CSV inputs
# and immediately runs the matching builder (here the toxic1 template).
builder1 = TemplateBuilder(TEMPLATE_TOXIC1)