# Import

In [1]:
#standard libraries
import ast
import math
import os
import random
import re
import shutil
import string
import wget
import mv
#third-party libraries
import datasets
from datasets import Dataset, load_dataset, load_from_disk
import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from transformers import AutoTokenizer, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments,TrainerCallback,AutoModel,BertConfig as BertConfig

#tokenizers
# Module-level tokenizer instances shared by the whole notebook; DatasetHandler
# picks one of them according to the model name it is constructed with.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")




In [None]:
#download datasets
wget.download("https://huggingface.co/datasets/clairebarale/AsyLex/resolve/main/main_and_case_cover_all_entities_inferred.csv?download=true",out='raw_datasets')
wget.download("https://huggingface.co/datasets/clairebarale/AsyLex/resolve/main/outcome_train_test/test_dataset_gold.csv?download=true",out='raw_datasets')
wget.download("https://huggingface.co/datasets/clairebarale/AsyLex/resolve/main/outcome_train_test/train_dataset_silver.csv?download=true",out='raw_datasets')
wget.download("https://huggingface.co/datasets/clairebarale/AsyLex/resolve/main/determination_label_extracted_sentences.csv?download=true",out='raw_datasets')
wget.download("https://huggingface.co/datasets/clairebarale/AsyLex/resolve/main/cases_anonymized_txt_raw.tar.gz?download=true",out='raw_datasets')
!tar -xzf raw_datasets/cases_anonymized_txt_raw.tar.gz
shutil.move('cases_anonymised_raw_text','raw_datasets')

sent2_dataset_path = kagglehub.dataset_download("thedevastator/unlocking-the-human-perspective-on-movie-reviews")
shutil.move(sent2_dataset_path,os.getcwd()+'\\raw_datasets')
os.rename(os.getcwd()+'\\raw_datasets\\2',os.getcwd()+'\\raw_datasets\\sent2')

sent1_dataset_path = kagglehub.dataset_download("madhavkumarchoudhary/sentiment-prediction-on-movie-reviews")
shutil.move(sent1_dataset_path,os.getcwd()+'\\raw_datasets')
os.rename(os.getcwd()+'\\raw_datasets\\1',os.getcwd()+'\\raw_datasets\\sent1')

In [2]:
#CONST
# Each raw input (under raw_datasets/) is paired with the pickle that holds its
# cleaned version (under intermediate_datasets/).

# Folder with the raw case texts / pickle with the preprocessed texts.
RAW_TEXTS_FILE = 'raw_datasets/cases_anonymised_raw_text'
FULL_TEXTS_REFINED_FILE = 'intermediate_datasets/full_texts_refined.pkl'

# Determination sentences extracted from the cases.
DETERMINATIONS_FILE = 'raw_datasets/determination_label_extracted_sentences.csv'
DETERMINATIONS_REFINED_FILE= 'intermediate_datasets/determinaton_sentences_refined.pkl'

# Gold decision outcomes.
GOLD_OUTCOMES_FILE = 'raw_datasets/test_dataset_gold.csv'
GOLD_OUTCOMES_REFINED = 'intermediate_datasets/gold_decision_outcomes_refined.pkl'

# Silver decision outcomes.
SILVER_OUTCOMES_FILE = 'raw_datasets/train_dataset_silver.csv'
SILVER_OUTCOMES_REFINED_FILE = 'intermediate_datasets/silver_decision_outcomes_refined.pkl'

# Entity table from which the NORP column is taken.
NORP_DATASET_FILE = 'raw_datasets/main_and_case_cover_all_entities_inferred.csv'
NORP_REFINED = 'intermediate_datasets/norp.pkl'

# Folder expected to contain train.csv / test.csv / validation.csv.
SENT2_RAW_DATASET = 'raw_datasets/sent2'

SENT1_RAW_DATASET = 'raw_datasets/sent1/train.csv'

# NORP values kept as classes; the position in this list is the class id.
labels_list = ['chinese', 'haitian', 'roma', 'muslim', 'nigerian', 'indian', 'christian']

class ListProcessingError(Exception):
    """Exception that carries a message together with an error code.

    Attributes are documented inline: ``message`` is the text passed to the
    base ``Exception``; ``error_code`` is stored untouched and only used when
    the exception is rendered as a string.
    """

    def __init__(self, message, error_code):
        self.message = message
        self.error_code = error_code
        super().__init__(message)

    def __str__(self):
        msg, code = self.message, self.error_code
        return f"{msg} (Error Code: {code})"

class TextProcessor:
    """Text normalisation helpers shared by the dataset builders."""

    def __init__(self,info):
        # Stored as-is; no method of this class reads it.
        self.info = info

    def check_determination_in_text(self,det,txt):
        """Return True when every string of ``det`` occurs in ``txt``.

        The comparison ignores case and space characters on both sides.
        An empty ``det`` yields True.
        """
        haystack = txt.replace(" ","").lower()
        for sentence in det:
            if sentence.replace(" ","").lower() not in haystack:
                return False
        return True

    def preprocess_text(self,text):
        """Lower-case ``text``, collapse whitespace and strip ASCII punctuation.

        Newlines are deleted (not replaced by a space) before the remaining
        whitespace runs are collapsed to single spaces.
        """
        lowered = text.lower()
        without_newlines = re.sub(r'\n','',lowered)
        collapsed = re.sub(r'\s+', ' ', without_newlines).strip()
        # Raw string: this targets the literal four characters backslash-x-a-0,
        # not the non-breaking space character itself.
        cleaned = collapsed.replace(r'\xa0', ' ')
        punctuation_remover = str.maketrans('','',string.punctuation)
        return cleaned.translate(punctuation_remover)

# DatasetHandler


In [3]:
class DatasetHandler:
    def __init__(self,info,text_processor,model):
        self.info = info
        self.text_processor = text_processor
        if model=='bert':
            self.tokenizer_name = 'BERT'
            self.start_token_id = 101
            self.end_token_id = 102
            self.tokenizer = bert_tokenizer
        elif model=='roberta':
            self.tokenizer_name = 'RoBERTa'
            self.start_token_id = 0
            self.end_token_id = 2
            self.tokenizer = roberta_tokenizer
        else:
            print('not valid model')

    def subset(self,dataset,subset_dimension):
        if not isinstance(dataset, pd.DataFrame):
            print(type(dataset))
            raise ValueError('it s not a pandas dataframe')
        n_rows = len(dataset)
        if subset_dimension <= 0:
            raise ValueError('subset_dimension must be greater than 0.')
        if subset_dimension < 1:
            subset_size = round(subset_dimension * n_rows)
        else:
            subset_size = int(subset_dimension)
        subset_size = min(subset_size, n_rows)  # Avoid going out of bounds
        return dataset.iloc[:subset_size, :]

    def split(self,legal_dataset):
        shuffled_dataset = legal_dataset.sample(frac=1,random_state=1).reset_index(drop=True)
        split_cutpoints = [0.8,0.9]
        n_tuples = shuffled_dataset.shape[0]
        split_values = [math.floor(x*n_tuples) for x in split_cutpoints]

        train_set = shuffled_dataset.iloc[0:split_values[0]].reset_index(inplace=False,drop=True)
        validation_set = shuffled_dataset.iloc[split_values[0]:split_values[1]].reset_index(inplace=False,drop=True)
        test_set = shuffled_dataset.iloc[split_values[1]:].reset_index(inplace=False,drop=True)
        return train_set,validation_set,test_set

    def create_dataset(self,dataset_name,paragraph_selection_strategy,subset_dimension):
        match paragraph_selection_strategy:
            case 'first':
                paragraph_selection_strategy = self._extract_first
            case 'last':
                paragraph_selection_strategy = self._extract_last
            case 'rand':
                paragraph_selection_strategy = self._extract_randomized_interval
            case 'cas':
                paragraph_selection_strategy = self._extract_random
            case '':
                pass
            case _:
                raise ValueError('unknown parag selection strategy')
        match dataset_name:
            case 'asylex-outcome':
                return self.create_outcome(paragraph_selection_strategy,subset_dimension)
            case 'asylex-norp':
                return self.create_norp(paragraph_selection_strategy,subset_dimension)
            case 'sentiment1':
                return self.create_sent1(subset_dimension)
            case 'sentiment2':
                return self.create_sent2(subset_dimension)
            case _:
                raise ValueError('dataset name not found')

    def create_outcome(self, paragraph_selection_strategy,subset_dimension):
        """Build the train/test/validation splits for the outcome task.

        Reads the refined determinations, silver outcomes and full texts from
        their pickles, keeps the texts that contain their determination
        sentences, splits them, and extracts one 512-token segment per
        (text, first_sentence) pair with ``paragraph_selection_strategy``.

        Args:
            paragraph_selection_strategy: Callable invoked as
                ``strategy(text_ids, sentence_ids, 510)`` and returning
                ``(segment, info)``. A ``ValueError`` raised by it drops the row.
                NOTE(review): create_dataset forwards the empty string ''
                unchanged, which is not callable here - confirm it is never
                used with this dataset.
            subset_dimension: If not None, each input table is truncated with
                ``self.subset`` before any merge.

        Returns:
            Dict with keys ``'train_set'``, ``'test_set'``, ``'validation_set'``
            mapping to ``datasets.Dataset`` objects. Only the test split keeps
            the source columns (full ids are renamed ``complete_input_ids``).
        """
        determinations = pd.read_pickle(DETERMINATIONS_REFINED_FILE)
        silver_outcomes = pd.read_pickle(SILVER_OUTCOMES_REFINED_FILE)
        full_texts = pd.read_pickle(FULL_TEXTS_REFINED_FILE)
        if subset_dimension != None:
            determinations = self.subset(determinations,subset_dimension)
            silver_outcomes = self.subset(silver_outcomes,subset_dimension)
            full_texts = self.subset(full_texts,subset_dimension)

        # Keep (text, determinations) pairs where every determination sentence occurs in the text.
        texts_sentences = pd.merge(full_texts,determinations,on='id',how='inner')
        texts_sentences =  texts_sentences[[self.text_processor.check_determination_in_text(x,y) for x, y in zip(texts_sentences['all_sentences'], texts_sentences['text'])]]
        # Keep a single text per id: the (id, text) group with the highest row count.
        texts_sentences = texts_sentences.groupby(['id', 'text']).agg(n_sentences=('all_sentences', 'count')).reset_index()
        max_indices = texts_sentences.groupby('id')['n_sentences'].idxmax()
        true_texts = texts_sentences.loc[max_indices].reset_index(drop=True)
        true_texts = true_texts.drop(['n_sentences'],axis=1)
        # Re-attach the sentence columns dropped by the aggregation and re-check the match.
        true_texts = pd.merge(true_texts,determinations,on='id',how='inner')
        true_texts = true_texts[[self.text_processor.check_determination_in_text(x,y) for x, y in zip(true_texts['all_sentences'], true_texts['text'])]] #use not in front of the check_det function to get the non matching ones
        true_texts = pd.merge(true_texts,silver_outcomes,on='id',how='inner')
        # Rows whose decision_outcome equals 2 are excluded.
        true_texts = true_texts[true_texts['decision_outcome']!=2].reset_index(drop=True)

        train_set,validation_set,test_set = self.split(true_texts)

        # One row per element of the first_sentence list.
        train_set = train_set.explode('first_sentence').reset_index(inplace=False,drop=True)
        test_set = test_set.explode('first_sentence').reset_index(inplace=False,drop=True)
        validation_set = validation_set.explode('first_sentence').reset_index(inplace=False,drop=True)

        def preprocess_row(row):
            # Tokenize the whole text; the mask/type columns are rebuilt later for the segment.
            tokenized = self.tokenizer(row["text"])
            if self.tokenizer_name != 'RoBERTa':
                tokenized.pop('token_type_ids')
            tokenized.pop('attention_mask')
            # Drop the first and last id (presumably the special tokens added by the tokenizer).
            tokenized['input_ids'] = tokenized['input_ids'][1:-1]
            if self.tokenizer_name =='RoBERTa':
                # The sentence is tokenized with a leading space for RoBERTa.
                tokenized["tokenized_determinations"]  = self.tokenizer(' '+row["first_sentence"])['input_ids'][1:-1]
            else:
                tokenized["tokenized_determinations"]  = self.tokenizer(row["first_sentence"])['input_ids'][1:-1]
            try:
                # 510 ids + start id + end id = 512, the length of the masks added below.
                tokenized["segment"], tokenized["info"] = paragraph_selection_strategy(tokenized['input_ids'],tokenized['tokenized_determinations'],510)
            except ValueError:
                # Placeholder row, marked with labels == -1 so the filter below removes it.
                tokenized['segment'] = [0]
                tokenized['info'] = [0,0]
                tokenized['labels'] = -1
                tokenized['input_ids'] = [0]
                return tokenized

            tokenized["segment"] = [self.start_token_id] + tokenized["segment"] + [self.end_token_id]
            tokenized["labels"] = row["decision_outcome"]
            tokenized["all_sentences"] = row["all_sentences"]
            return tokenized
        # Seed the random-based strategies before any row is mapped.
        random.seed(1)
        splits = {'train_set':train_set,'test_set':test_set,'validation_set':validation_set}
        for name in splits.keys():
            dataset = splits[name]
            dataset = (
                Dataset.from_pandas(dataset)
                .map(preprocess_row)
                .filter(lambda example: example["labels"] != -1)
            )
            if name != 'test_set':
                dataset = dataset.remove_columns(['id','text','first_sentence','decision_outcome','input_ids','tokenized_determinations','info','all_sentences'])
            if name == 'test_set':
                # The test split keeps the full token ids under a separate name.
                dataset = dataset.rename_column('input_ids', 'complete_input_ids')
            # The extracted segment becomes the model input.
            dataset = dataset.rename_column('segment', 'input_ids')

            dataset_length = dataset.shape[0]
            dataset = dataset.add_column('attention_mask', [[1] * 512] * dataset_length)
            if self.tokenizer_name != 'RoBERTa':
                dataset = dataset.add_column('token_type_ids', [[0] * 512] * dataset_length)

            splits[name] = dataset
        return splits

    def create_norp(self,paragraph_selection_method,subset_dimension):
        """Build the train/test/validation splits for the NORP classification task.

        Reads the refined NORP table, silver outcomes and full texts from their
        pickles, keeps the texts that contain their NORP sentences, splits them
        and extracts one 512-token segment per (text, sentence) pair with
        ``paragraph_selection_method``.

        Args:
            paragraph_selection_method: Callable invoked as
                ``method(text_ids, sentence_ids, 510)`` and returning
                ``(segment, info)``. A ``ValueError`` raised by it drops the row.
            subset_dimension: If not None, each input table is truncated with
                ``self.subset`` first.

        Returns:
            Dict with keys ``'train_set'``, ``'test_set'``, ``'validation_set'``
            mapping to ``datasets.Dataset`` objects whose ``labels`` column is
            the index of the row's NORP value in ``labels_list``.

        Raises:
            ValueError: If the three splits do not each contain exactly the
                NORP values of ``labels_list``.
        """
        norp_dataset = pd.read_pickle(NORP_REFINED)
        # NOTE(review): silver_outcomes is loaded (and subset) but not used below.
        silver_outcomes = pd.read_pickle(SILVER_OUTCOMES_REFINED_FILE)
        full_texts = pd.read_pickle(FULL_TEXTS_REFINED_FILE)
        if subset_dimension != None:
            norp_dataset = self.subset(norp_dataset,subset_dimension)
            silver_outcomes = self.subset(silver_outcomes,subset_dimension)
            full_texts = self.subset(full_texts,subset_dimension)

        norp_dataset['labels'] = norp_dataset['NORP'].apply(lambda x: labels_list.index(x))
        # Concatenate the sentence lists of rows sharing id/NORP/outcome/label.
        norp_dataset = norp_dataset.groupby(['id','NORP','decision_outcome','labels'])['sentence'].apply(lambda x: sum(x, [])).reset_index()
        # Keep one row per id (the first one after grouping).
        norp_dataset = norp_dataset.drop_duplicates(subset='id',keep='first')
        true_texts = pd.merge(full_texts,norp_dataset,on='id',how='inner')
        # Keep pairs where every sentence occurs in the text.
        true_texts =  true_texts[[self.text_processor.check_determination_in_text(x,y) for x, y in zip(true_texts['sentence'], true_texts['text'])]]
        # Keep a single text per id: the (id, text) group with the highest row count.
        true_texts = true_texts.groupby(['id', 'text']).agg(n_sentences=('sentence', 'count')).reset_index()
        max_indices = true_texts.groupby('id')['n_sentences'].idxmax()
        true_texts = true_texts.loc[max_indices].reset_index(drop=True)
        true_texts = true_texts.drop(['n_sentences'],axis=1)
        # Re-attach the columns dropped by the aggregation and re-check the match.
        true_texts = pd.merge(true_texts,norp_dataset,on='id',how='inner') #change decision_outcome name
        true_texts = true_texts[[self.text_processor.check_determination_in_text(x,y) for x, y in zip(true_texts['sentence'], true_texts['text'])]]

        train_set,validation_set,test_set = self.split(true_texts)

        def preprocess_row(row):
            # Tokenize the whole text; the mask/type columns are rebuilt later for the segment.
            tokenized = self.tokenizer(row["text"])
            if self.tokenizer_name != 'RoBERTa':
                tokenized.pop('token_type_ids')
            tokenized.pop('attention_mask')
            # Drop the first and last id (presumably the special tokens added by the tokenizer).
            tokenized['input_ids'] = tokenized['input_ids'][1:-1]
            if self.tokenizer_name != 'RoBERTa':
                tokenized["tokenized_sentence"]  = self.tokenizer(row["sentence"])['input_ids'][1:-1]
            else:
                # The sentence is tokenized with a leading space for RoBERTa.
                tokenized["tokenized_sentence"]  = self.tokenizer(' '+row["sentence"])['input_ids'][1:-1]
            try:
                # 510 ids + start id + end id = 512, the length of the masks added below.
                tokenized["segment"], tokenized["info"] = paragraph_selection_method(tokenized['input_ids'],tokenized['tokenized_sentence'],510)
            except ValueError:
                # Placeholder row, marked with labels == -1 so the filter below removes it.
                tokenized['segment'] = []
                tokenized['info'] = []
                tokenized['labels'] = -1
                return tokenized

            tokenized["segment"] = [self.start_token_id] + tokenized["segment"] + [self.end_token_id]
            tokenized["decision_outcome"] = row["decision_outcome"]
            return tokenized

        # Seed the random-based strategies before any row is mapped.
        random.seed(1)
        splits = {'train_set':train_set,'test_set':test_set,'validation_set':validation_set}
        # Every split must contain all (and only) the classes of labels_list.
        if set(test_set['NORP']) == set(train_set['NORP']) == set(validation_set['NORP']) == set(labels_list):
            print(labels_list)
        else:
            raise ValueError('labels are different from one set to another')
        for name in splits.keys():
            dataset = splits[name]
            # Keep the full sentence list before exploding to one sentence per row.
            dataset['all_sentences'] = dataset['sentence']
            dataset = dataset.explode('sentence').reset_index(inplace=False,drop=True)

            dataset = (
                Dataset.from_pandas(dataset)
                .map(preprocess_row)
                .filter(lambda example: example["labels"] != -1)
            )
            if name != 'test_set':
                dataset = dataset.remove_columns(['id','text','sentence','decision_outcome','input_ids','tokenized_sentence','info','all_sentences'])
            if name == 'test_set':
                # The test split keeps the full token ids under a separate name.
                dataset = dataset.rename_column('input_ids', 'complete_input_ids')

            dataset_length = dataset.shape[0]
            dataset = dataset.add_column('attention_mask', [[1] * 512] * dataset_length)
            if self.tokenizer_name != 'RoBERTa':
                dataset = dataset.add_column('token_type_ids', [[0] * 512] * dataset_length)

            # The extracted segment becomes the model input.
            dataset = dataset.rename_column('segment', 'input_ids')
            # Rebuild the labels column from the NORP value of the surviving rows.
            dataset = dataset.remove_columns('labels')
            dataset = dataset.add_column('labels',[labels_list.index(e) for e in dataset['NORP']])
            splits[name] = dataset
        return splits

    def create_sent1(self,subset_dimension):
        """Build the train/test/validation splits of the first sentiment dataset.

        Reads ``SENT1_RAW_DATASET``, drops rows without review text, maps the
        ``NEGATIVE``/``POSITIVE`` sentiment to 0/1, tokenizes every review
        (truncated and padded to the tokenizer's maximum length) and splits
        80/10/10 with fixed seeds.

        Args:
            subset_dimension: If not None, the raw table is truncated with
                ``self.subset`` before any processing.

        Returns:
            Dict with keys ``'train_set'``, ``'test_set'``, ``'validation_set'``.
            The splits are capped at 8000/1000/1000 rows; smaller splits are
            returned whole. The ``text`` column is removed from the train split
            only.
        """
        dataset = pd.read_csv(SENT1_RAW_DATASET)
        if subset_dimension is not None:
            dataset = self.subset(dataset,subset_dimension)

        sentiment_map = {"NEGATIVE":0,"POSITIVE":1}

        def preprocess_data(examples):
            # The row is updated in place so the cleaned text and numeric label
            # are what ends up in the mapped dataset.
            examples["labels"] = sentiment_map[examples["labels"]]
            examples["text"] = str(examples["text"])
            examples["text"] = self.text_processor.preprocess_text(examples["text"])
            tokenized_row = self.tokenizer(examples['text'],truncation=True,padding='max_length')
            tokenized_row['labels'] = examples['labels']
            return tokenized_row

        dataset = dataset[dataset['reviewText'].notnull()].reset_index(drop=True)
        dataset = dataset.drop(['isFrequentReviewer', 'reviewerName', 'movieid'],axis = 1)
        dataset = dataset.rename(columns = {'reviewText':'text','sentiment':'labels'})
        tokenized_datasets = Dataset.from_pandas(dataset).map(preprocess_data)

        # 80% train, then the remaining 20% halved into validation and test.
        all_sets = tokenized_datasets.train_test_split(test_size = 0.2,seed = 42)
        train_set = all_sets["train"]
        val_test_set = all_sets["test"].train_test_split(test_size =0.5,seed = 43)
        validation_set = val_test_set['train']
        test_set = val_test_set['test']

        # Cap the split sizes; min() keeps select() inside the available rows
        # when a small subset_dimension (or a small file) leaves fewer rows.
        train_set = train_set.select(range(min(8000, len(train_set))))
        test_set = test_set.select(range(min(1000, len(test_set))))
        validation_set = validation_set.select(range(min(1000, len(validation_set))))

        return {
            'train_set': train_set.remove_columns("text"),
            'test_set': test_set,
            'validation_set': validation_set,
        }

    def create_sent2(self,subset_dimension):
        """Build the train/test/validation splits of the second sentiment dataset.

        Reads ``train.csv``, ``test.csv`` and ``validation.csv`` from
        ``SENT2_RAW_DATASET``, normalises the review text and its evidence
        sentences, tokenizes the review (truncated and padded) and records
        which evidence sentences are still present in the truncated token ids.

        Args:
            subset_dimension: If not None, each split is truncated with
                ``self.subset`` first.

        Returns:
            Dict with keys ``'train_set'``, ``'test_set'``, ``'validation_set'``
            mapping to ``datasets.Dataset`` objects with the extra columns
            ``all_sentences`` and ``present_sentence``.
        """
        train_set = pd.read_csv(SENT2_RAW_DATASET+'/train.csv')
        test_set = pd.read_csv(SENT2_RAW_DATASET+'/test.csv')
        validation_set = pd.read_csv(SENT2_RAW_DATASET+'/validation.csv')
        if subset_dimension is not None:
            train_set = self.subset(train_set,subset_dimension)
            test_set = self.subset(test_set,subset_dimension)
            validation_set = self.subset(validation_set,subset_dimension)

        data_sets = {'train_set':train_set,'test_set':test_set,'validation_set':validation_set}

        def preprocess_list_of_text(list_of_text):
            return [self.text_processor.preprocess_text(text) for text in list_of_text]

        def string_to_list(input_string):
            # Parse a bracketed sequence of quoted items. The first non-space
            # character after an item is taken as the opening quote and the next
            # occurrence of that same character closes the item.
            input_string = re.sub(r'\n','',input_string)
            cleaned_string = input_string.strip("[]")
            return_list = []
            elem = ''
            starter = 'NOT PRESENT'
            for char in cleaned_string:
                if starter == 'NOT PRESENT':
                    if char != ' ':
                        starter = char
                    continue
                if char == starter:
                    starter = 'NOT PRESENT'
                    return_list.append(elem)
                    elem = ''
                    continue
                elem += char
            return return_list

        def is_sublist_in_list(sublist,full_list):
            # True when `sublist` occurs as a contiguous run inside `full_list`.
            width = len(sublist)
            for i in range(0,len(full_list)-width+1):
                if full_list[i:i+width] == sublist:
                    return True
            return False

        def tok_sent_in_tok_text(sentences,text_input_ids):
            # The sentence is tokenized with a leading space for RoBERTa.
            prefix = ' ' if self.tokenizer_name == 'RoBERTa' else ''
            # First and last ids are dropped on both sides before comparing.
            text_body = text_input_ids[1:-1]
            return [s for s in sentences if is_sublist_in_list(self.tokenizer(prefix+s)['input_ids'][1:-1],text_body)]

        def preprocess_row(row):
            tokenized_row = self.tokenizer(row['text'],truncation=True,padding='max_length')
            tokenized_row['labels'] = row['labels']
            tokenized_row['all_sentences'] = row['all_sentences']
            tokenized_row['present_sentence'] = tok_sent_in_tok_text(row['all_sentences'] ,tokenized_row['input_ids'])
            return tokenized_row

        processed_datasets = {}
        for name,dataset in data_sets.items():
            dataset = dataset.rename(columns = {'review':'text','evidences':'all_sentences','label':'labels'})
            dataset['text'] = dataset['text'].apply(self.text_processor.preprocess_text)
            dataset['all_sentences'] = dataset['all_sentences'].apply(string_to_list).apply(preprocess_list_of_text)
            processed_datasets[name] = Dataset.from_pandas(dataset).map(preprocess_row)
        return processed_datasets

    def import_paths_and_nlabels(self,dataset_name,model_name,par_sel_strat):
        if dataset_name not in ['asylex-outcome','asylex-norp','sentiment1','sentiment2']:
            raise ValueError('dataset name not found')
        if model_name not in ['bert','roberta']:
            raise ValueError('model name not found')
        if par_sel_strat not in ['first','last','rand','cas','']:
            raise ValueError('long_text technique not found')

        base_dataset_path = 'datasets/' + model_name + '/'
        base_model_path = 'Models/'  + model_name + '/'

        dataset_filename = {'asylex-norp':'norp_','asylex-outcome':'outcome_','sentiment1':'sentiment1','sentiment2':'sentiment2'}
        model_filename = {'bert' : 'BERT512-', 'roberta' : 'RoBERTa512-'}
        model_filename2 = {'asylex-norp':'norp_','asylex-outcome':'out_','sentiment1':'sentiment1','sentiment2':'sentiment2'}

        if dataset_name == 'asylex-norp':
            num_labels = 7
        else:
            num_labels = 2

        dataset_path = base_dataset_path+ dataset_name+ '/' + dataset_filename[dataset_name] + par_sel_strat + '_'
        model_path = base_model_path + dataset_name + '/' + model_filename[model_name] + model_filename2[dataset_name] +par_sel_strat
        return dataset_path, model_path, num_labels

    def read_dataset(self,path):
        """Load the three splits saved under the prefix ``path``.

        Returns:
            Dict mapping ``'train_set'``, ``'test_set'`` and
            ``'validation_set'`` to the dataset loaded from ``path + name``.
        """
        split_names = ('train_set', 'test_set', 'validation_set')
        return {split_name: datasets.load_from_disk(path+split_name) for split_name in split_names}

    def write_dataset(self,path,exp_datasets):
        for name,dataset in exp_datasets.items():
            if os.path.exists(path):
                shutil.rmtree(path)
            dataset.save_to_disk(path+name)

    def compare_datasets(self,exp_datasets,exp_datasets2):
        for name in ['train_set','validation_set','test_set']:
            print(name)
            if exp_datasets[name].column_names != exp_datasets2[name].column_names:
                print('different columns')#raise ValueError()
            for col in exp_datasets2[name].column_names:
                print(exp_datasets[name][col] == exp_datasets2[name][col])
            print()

    def _extract_randomized_interval(self,my_list, my_sublist, l):
        # Converti la my_list e la my_sublist in stringhe per usare str.find
        my_list_str = ','.join(map(str, my_list))
        my_sublist_str = ','.join(map(str, my_sublist))

        # Trova l'indice di inizio della my_sublist
        sublist_index = my_list_str.find(my_sublist_str)

        if sublist_index == -1:
            raise ValueError("my_sublist non trovata nella my_list principale.")
        if len(my_list) < l:
            raise ValueError("my_list più breve di interval length")

        # Calcola l'indice di inizio e fine nella my_list originale
        my_sublist_start_index = my_list_str[:sublist_index].count(',')

        my_sublist_end_index = my_sublist_start_index + len(my_sublist)

        # Calcola gli indici per l'intervallo di k elementi
        start_interval = max(0, my_sublist_end_index - l)
        end_interval = min(len(my_list), my_sublist_start_index + l)

        # print(start_interval)
        # print(end_interval-l+1)
        actual_cutpoint = random.sample(range(start_interval,end_interval-l+1),1)[0]
        # for a in range(start_interval,end_interval-l+1):
        #     intervallo = my_list[a:a+l]
        #     print(intervallo)
        # Estrai l'intervallo
        subsection = my_list[actual_cutpoint:actual_cutpoint+l]

        return subsection, [actual_cutpoint,my_sublist_start_index - actual_cutpoint] #the second value of the list is the position in the sublist relative to the subsection
    def _extract_first(self,my_list, my_sublist, l):
        if len(my_list) < l:
            raise ValueError("my_list più breve di interval length")
        my_list = my_list[0:l]
        return my_list, [0,l]
    def _extract_last(self,my_list,my_sublist, l):
        if len(my_list) < l:
            raise ValueError("my_list più breve di interval length")
        length_list = len(my_list)
        my_list = my_list[-l:]
        return my_list, [length_list-l,length_list]
    def _extract_random(self,my_list,my_sublist,l):
        if len(my_list) < l:
            raise ValueError("my_list più breve di interval length")
        actual_cutpoint = random.sample(range(0,len(my_list)-l+1),1)[0]
        my_list = my_list[actual_cutpoint:actual_cutpoint+l]
        return my_list, [actual_cutpoint,actual_cutpoint+l]

# Asylex cleaner


In [None]:
class AsyLexCleaner:
    def __init__(self,text_processor):
        """Store the text processor whose ``preprocess_text`` the creators apply."""
        self.text_processor = text_processor

    def _legal_dataframe_creator(self,folder_path):
        """Read every regular file of ``folder_path`` into a DataFrame.

        Files are visited in sorted name order. Each row holds ``doc_id`` (the
        part of the file name after the last ``'canlii'``, with ``'.txt'``
        removed) and ``text`` (the file content, decoded as UTF-8 with
        undecodable bytes ignored).
        """
        records = []
        for entry_name in sorted(os.listdir(folder_path)):
            entry_path = os.path.join(folder_path, entry_name)
            if not os.path.isfile(entry_path):
                continue
            with open(entry_path, 'r', encoding='utf-8', errors='ignore') as handle:
                body = handle.read()
            doc_id = entry_name.split('canlii')[-1].replace('.txt', '')
            records.append({'doc_id': doc_id, 'text': body})
        return pd.DataFrame(records)

    def full_texts_creator(self, path = RAW_TEXTS_FILE):
        """Load the raw case files of ``path`` as a table of ``id`` and cleaned ``text``.

        The document id taken from the file name is converted to ``int`` and
        the text is normalised with the text processor.
        """
        texts = self._legal_dataframe_creator(path).rename(columns={'doc_id':'id'})
        texts['id'] = texts['id'].apply(lambda raw_id: int(raw_id))
        clean = self.text_processor.preprocess_text
        texts['text'] = texts['text'].apply(clean)
        return texts

    def determinations_creator(self, path = DETERMINATIONS_FILE):
        """Load the determination sentences and group them per decision.

        Args:
            path: ``;``-separated CSV with a ``decisionID`` column and an
                ``extracted_sentences_determination`` column holding the text
                of a Python list literal.

        Returns:
            DataFrame with one row per ``id`` and two list columns:
            ``first_sentence`` (one-element list with the first cleaned
            sentence) and ``all_sentences`` (every cleaned sentence).
        """
        determinations = pd.read_csv(path, sep=';')
        determinations = determinations.rename(columns={'decisionID':'id'})
        # Rows whose id is not numeric become NaN and are dropped.
        determinations['id'] = pd.to_numeric(determinations['id'], errors='coerce')
        # .copy() detaches the filtered rows so the assignments below act on an
        # independent frame (no SettingWithCopyWarning).
        determinations = determinations[determinations['id'].notna()].copy()
        determinations['id'] = determinations['id'].astype(int)
        determinations = determinations.rename(columns={'extracted_sentences_determination':'sentence'})
        determinations['sentence'] = determinations['sentence'].apply(ast.literal_eval)
        # Flatten the sentence lists of each id in a single pass
        # (sum(lists, []) re-copies the accumulator at every step).
        determinations = (
            determinations.groupby('id')['sentence']
            .apply(lambda lists: [sentence for chunk in lists for sentence in chunk])
            .reset_index()
        )
        clean = self.text_processor.preprocess_text
        # Only the first sentence, so later explodes do not duplicate rows.
        determinations['first_sentence'] = determinations['sentence'].apply(lambda x: [clean(x[0])])
        determinations['all_sentences'] = determinations['sentence'].apply(lambda x: [clean(s) for s in x])
        return determinations.drop(columns='sentence')

    def gold_outcomes_creator(self,path = GOLD_OUTCOMES_FILE):
        """Load the gold outcomes, keeping only rows that have a decision id.

        Args:
            path: ``;``-separated CSV with a ``decisionID`` column.

        Returns:
            DataFrame with ``decisionID`` converted to ``int`` and renamed
            ``id``.

        Side effects:
            The result is also pickled to ``GOLD_OUTCOMES_REFINED``.
        """
        gold_outcomes = pd.read_csv(path, sep=';')
        # .copy() detaches the filtered rows so the assignment below acts on an
        # independent frame (no SettingWithCopyWarning).
        gold_outcomes = gold_outcomes[gold_outcomes['decisionID'].notna()].copy()
        gold_outcomes['decisionID'] = gold_outcomes['decisionID'].astype(int)
        gold_outcomes = gold_outcomes.rename(columns={'decisionID':'id'})
        # Make sure the output folder exists before pickling.
        os.makedirs(os.path.dirname(GOLD_OUTCOMES_REFINED) or '.', exist_ok=True)
        gold_outcomes.to_pickle(GOLD_OUTCOMES_REFINED)
        return gold_outcomes

    def silver_outcomes_creator(self,path = SILVER_OUTCOMES_FILE):
        """Load the silver outcomes with integer ``id`` and ``decision_outcome``.

        ``path`` is read as a ``;``-separated CSV; ``decisionID`` is renamed
        ``id`` after both columns are cast to ``int``.
        """
        raw_outcomes = pd.read_csv(path, sep=';')
        typed_outcomes = raw_outcomes.astype({'decisionID': int, 'decision_outcome': int})
        return typed_outcomes.rename(columns={'decisionID':'id'})

    def norp_creator(self, path = NORP_DATASET_FILE):
        """Load the rows whose NORP value is one of the classes in ``labels_list``.

        Args:
            path: ``;``-separated CSV with the columns ``decisionID``,
                ``NORP``, ``decision_outcome`` and ``Text``.

        Returns:
            DataFrame with the columns ``id``, ``NORP``, ``decision_outcome``,
            ``sentence`` (one-element list with the cleaned text) and
            ``labels`` (index of the NORP value in ``labels_list``).
        """
        main_and_case = pd.read_csv(path, sep = ';')
        norp_dataset = main_and_case[['decisionID','NORP','decision_outcome','Text']]
        norp_dataset = norp_dataset.rename(columns={'decisionID':'id','Text':'sentence'})

        # isin() is already False for missing values, so one filter is enough.
        # reset_index returns a new frame, so the column assignments below do
        # not write into a filtered slice (no SettingWithCopyWarning).
        norp_dataset = norp_dataset[norp_dataset['NORP'].isin(labels_list)].reset_index(drop=True)
        label_ids = {label: position for position, label in enumerate(labels_list)}
        norp_dataset['labels'] = norp_dataset['NORP'].apply(lambda x: label_ids[x])
        clean = self.text_processor.preprocess_text
        # Each sentence is wrapped in a list so rows can later be concatenated.
        norp_dataset['sentence'] = norp_dataset['sentence'].apply(lambda x: [clean(x)])
        return norp_dataset

    def read_dataset(self,path) -> pd.DataFrame:
        """Load ``path`` as a ``;``-separated CSV when it ends with ``.csv``, else as a pickle."""
        if not path.endswith(".csv"):
            return pd.read_pickle(path)
        return pd.read_csv(path, sep=';')

    def write_dataset(self,path,dataset):
        """Pickle ``dataset`` to ``path``, creating the parent folder if needed.

        Args:
            path: Destination file path.
            dataset: Object exposing ``to_pickle(path)``.
        """
        parent_dir = os.path.dirname(path)
        if parent_dir:
            # to_pickle does not create missing folders.
            os.makedirs(parent_dir, exist_ok=True)
        dataset.to_pickle(path)
    
    def create_all_intermediate(self):
        """Run every creator with its default input and pickle the result.

        The tables are produced and written one after the other, in this
        order: full texts, determinations, gold outcomes, silver outcomes,
        NORP.
        """
        pipeline = (
            (FULL_TEXTS_REFINED_FILE, self.full_texts_creator),
            (DETERMINATIONS_REFINED_FILE, self.determinations_creator),
            (GOLD_OUTCOMES_REFINED, self.gold_outcomes_creator),
            (SILVER_OUTCOMES_REFINED_FILE, self.silver_outcomes_creator),
            (NORP_REFINED, self.norp_creator),
        )
        for target_path, creator in pipeline:
            self.write_dataset(target_path, creator())
        