In [1]:
"""import csv
import json
import pandas as pd
from collections import defaultdict"""

'import csv\nimport json\nimport pandas as pd\nfrom collections import defaultdict'

In [2]:
import csv
import json
import pandas as pd
from collections import defaultdict
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
class BuildDataset:
    
    # Build the training set for training the model
    def __init__(self, raw_trainingset_path):
        self.full_dataset = raw_trainingset_path
        self.foundation_model_name = "distilbert/distilbert-base-uncased"
        # vectorize text using embeddings
        self.tokenizer = AutoTokenizer.from_pretrained(self.foundation_model_name)
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        #self.topic = topic
        
    
    def raw_data_to_dict(self, csv_delimiter=None, csvFilePath=None):
            # create a dataset dictionary
            data = {}            
            # Open a csv reader called DictReader
            if csvFilePath is None: csvFilePath=self.full_dataset
            with open(csvFilePath, encoding='utf-8-sig') as csvf:
                if csv_delimiter == None:
                    csvReader = csv.DictReader(csvf)#, delimiter=delimiter)#='\t')
                else:
                    csvReader = csv.DictReader(csvf, delimiter=csv_delimiter)#='\t')
                for key, rows in enumerate(csvReader):                    
                    data[key] = {
                        "Topic":rows["Topic"].strip().lower(),
                        "One-line summary":rows["One-line summary"],
                        "Full summary":rows["Full summary of comment"]
                    }
            return data
    def topic_sample_size(self, data):
        # Count the number of occurences of all the topics
        data_check = defaultdict(int)
        for i, rows in data.items():
            data_check[rows['Topic'].lower().strip()] +=1
        return data_check


    def create_dataset(self, topic, data,number_of_pos_label, percentage_pos_label=100, percentage_neg_label=100):
        # create trainset, testset and evaluationset
        sampleset = []
        evalset = []
        data_check = defaultdict(int)
        counter_for_neg_label = 0
        counter_for_pos_label = 0
        pos_size = int((percentage_pos_label/100)*number_of_pos_label)
        neg_size = int((percentage_neg_label/100)*number_of_pos_label)
        for i, rows in data.items():
            data_check[rows['Topic'].strip().lower()] +=1

            if rows["Topic"].strip().lower().__eq__(topic.strip().lower()) and counter_for_pos_label<=pos_size:
                sampleset.append(
                    {
                    "text": rows["Full summary"],
                    "label": 1, #governance
                    "summary": rows["One-line summary"]

                    }
                )
                counter_for_pos_label +=1
                
            elif counter_for_neg_label <= neg_size and not (rows["Topic"].strip().lower().__eq__(topic.strip().lower())):
                    sampleset.append(
                        {
                            "text": rows["Full summary"],
                            "label": 0, #others8
                            "summary": rows["One-line summary"]

                        }
                        )
                    counter_for_neg_label +=1
            else:
                    tag=0
                    if rows["Topic"].strip().lower().__eq__(topic.strip().lower()): tag=1 
                    
                    evalset.append(
                        {
                            "text": rows["Full summary"],
                            "label": tag, #others
                            "summary": rows["One-line summary"]
                        }
                    )
            #counter_for_testset +=1
        return sampleset, evalset, data_check
    
    def preprocess_function(self,examples):
        return self.tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
        
    def text_preprocessing(self,trainingset):
        # vectorize the dataset
        
        
        df = pd.DataFrame(trainingset)    
        df = df.drop(columns =['summary'])        
        # Convert the DataFrame into a Dataset
        from datasets import Dataset
        hugginface_data = Dataset.from_pandas(df)
        tokenized_hugginface_data = hugginface_data.map(self.preprocess_function, batched=True)
        split_tokenized_hugginface_data = tokenized_hugginface_data.train_test_split(test_size=0.1)
        
        return split_tokenized_hugginface_data
        
    


    
    

  from .autonotebook import tqdm as notebook_tqdm
2024-05-08 14:37:48.311006: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
class TrainModel:
    # Train the chosen foundation model
   
    def __init__(self, foundation_model_name = "distilbert/distilbert-base-uncased"):
        self.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
        self.label2id = {"NEGATIVE": 0, "POSITIVE": 1}
        self.num_labels = 2
        self.foundation_model_name = foundation_model_name
        self.accuracy = evaluate.combine(["accuracy", "f1", "precision", "recall"])
        
    

    def train_model(self, folder, dataset, original_tokenizer, data_collator):
        model = AutoModelForSequenceClassification.from_pretrained(
                                                    self.foundation_model_name,
                                                    num_labels=self.num_labels,
                                                    id2label=self.id2label,
                                                    label2id=self.label2id
                                                    )
        training_args = TrainingArguments(
            output_dir=folder,
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=20,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            push_to_hub=False,
        )
        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return self.accuracy.compute(predictions=predictions, references=labels)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset= dataset['train'],
            eval_dataset=dataset['test'],
            tokenizer=original_tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        trainer.train()
            

In [4]:
from transformers import pipeline
class Inference: 
    
    # inference the trained model   
    def __init__(self, model_path):
        self.model_path = model_path
    def inference_pipeline(self, excerpt, max_length=512):
        pipe = pipeline("text-classification", self.model_path, max_length=max_length, truncation=True)
        return pipe(excerpt)

    

In [5]:
# Evaluate the result of the training
import pandas as pd 
from datasets import Dataset 
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
#import Inference 
from transformers.pipelines.pt_utils import KeyDataset

class EvaluateResult:
  
    def __init__(self, val_df, validation_data_size, model_path):
        self.df = val_df[:validation_data_size]
        self.model_path = model_path
        
    def get_pred_n_ref(self):
        infer = Inference(self.model_path)
        self.df = self.df.drop(columns =['summary'])
        hugginface_data = Dataset.from_pandas(self.df)
        prediction=[]
        for out in infer.inference_pipeline(excerpt = KeyDataset(hugginface_data, "text")):
            if out['label'].__eq__('NEGATIVE'):
                prediction.append(0)
            else:
                prediction.append(1)
        ref = self.df['label'].tolist()
        return prediction, ref
    
    def eval_calculation(self):
        prediction, ref = self.get_pred_n_ref()
        y_true = np.array(ref)
        y_pred = np.array(prediction)
        return precision_recall_fscore_support(y_true, y_pred, average='micro')

        

In [6]:
#ETL pipeline

In [7]:
# create training set, evaluation set and validation set
topic = "Governance and accountability"
dataset_builder = BuildDataset("/home/azureuser/cloudfiles/code/Users/Omololu.Makinde/Llama_tutorial/data/consultation2.csv")
full_data_dict = dataset_builder.raw_data_to_dict(csv_delimiter = '\t')
all_topic_count = dataset_builder.topic_sample_size(full_data_dict)
number_of_pos_label = all_topic_count[topic.strip().lower()]
topic_sampleset, topic_evalset, _ = dataset_builder.create_dataset(topic=topic, data=full_data_dict,number_of_pos_label=int(number_of_pos_label))



In [8]:
# Vectorize
split_tokenized_hugginface_data = dataset_builder.text_preprocessing(topic_sampleset)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 457/457 [00:00<00:00, 3652.41 examples/s]


In [9]:
#train model
folder ='/home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/models/bert_model_governance'
dataset=split_tokenized_hugginface_data
original_tokenizer= dataset_builder.tokenizer
data_collator= dataset_builder.data_collator
foundation_model_name = "distilbert/distilbert-base-uncased"
bert_trainer = TrainModel(foundation_model_name)
bert_trainer.train_model(folder, dataset, original_tokenizer, data_collator)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.621262,0.717391,0.763636,0.724138,0.807692
2,No log,0.592432,0.76087,0.807018,0.741935,0.884615
3,No log,0.653311,0.76087,0.8,0.758621,0.846154
4,No log,0.668924,0.782609,0.814815,0.785714,0.846154
5,No log,0.697973,0.782609,0.807692,0.807692,0.807692
6,No log,0.826183,0.782609,0.814815,0.785714,0.846154
7,No log,0.999364,0.717391,0.77193,0.709677,0.846154
8,No log,0.82858,0.782609,0.807692,0.807692,0.807692
9,No log,1.091977,0.73913,0.785714,0.733333,0.846154
10,No log,0.918313,0.804348,0.823529,0.84,0.807692


Checkpoint destination directory /home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/models/bert_model_governance/checkpoint-26 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/models/bert_model_governance/checkpoint-52 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/models/bert_model_governance/checkpoint-78 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/models/bert_model_governance/checkpoint-104 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /home/azureuser/cloudfiles/code/Users/Omololu.

In [10]:
len(full_data_dict)

1895

In [11]:
len(topic_sampleset)

457

In [12]:
len(topic_evalset)

1438

In [13]:
df_te = pd.DataFrame.from_dict(topic_evalset) #topic_evalset

In [14]:
# evaluate model
val_df = df_te
validation_data_size = 200
model_path = "/home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/models/bert_model_governance/checkpoint-208"
evaluator = EvaluateResult(val_df, validation_data_size, model_path)
evaluator.eval_calculation()


(0.8, 0.8, 0.8, None)

In [16]:
# Inference PDF
import csv
import json
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
from tika import parser
from transformers import pipeline
class PdfELT:
    def __init__(self, pdf_path, model_path):
        self.pdf_path = pdf_path
        self.embeddings = HuggingFaceEmbeddings(model_name="avsolatorio/GIST-small-Embedding-v0")
        self.model_path = model_path

    def get_pdf(self):
        #get pdf from stored folder and parses it into pages

        filepath = self.pdf_path  # Replace with the path of your file
        parsed_document = parser.from_file(filepath)
        broken_pdf = parsed_document['content']
        #print(data)
        return broken_pdf


    def split_pdf(self,raw_txt,embedding=None):
        # Uses semantic chunking to split the paragraphs in the document
        if embedding is None: embedding= self.embeddings
        text_splitter = SemanticChunker(embedding)
        text_chunks = text_splitter.create_documents([raw_txt])
        #print(docs[0].page_content)
        len(text_chunks)
        
        return text_splitter, text_chunks
    
    def tag_pdf(self,text_chunks_dict, topic="governance and accountability"):
        dict_result = defaultdict(list)
        infer = Inference(self.model_path)
        class_name = topic
        for content, tag_list in text_chunks_dict.items(): 
            paragraph = content.replace("\n\n", "")
            class_result = infer.inference_pipeline(paragraph)[0]   #pipe(paragraph.replace("\n", ""))[0]
            #print(class_result)
            class_result.update({'class_name':class_name})   
            tag_list.append(class_result)
            dict_result[paragraph] = tag_list
        
        return dict_result
    


In [18]:
# Full topic classification ETL
from collections import defaultdict 
model_path = "/home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/models/bert_model_governance/checkpoint-208"
pdf_path = "/home/azureuser/cloudfiles/code/Users/Omololu.Makinde/Llama_tutorial/data/consultation2.csv"
empty_par_folder = "/home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/results/empty_paragraph"
tagged_par_folder = "/home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/results/tagged_paragraph"
etl = PdfELT(pdf_path, model_path)


In [19]:
# 1. pull pdf
pdf_pages = etl.get_pdf()

2024-05-08 14:49:36,759 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
2024-05-08 14:49:37,445 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
2024-05-08 14:49:37,799 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [20]:
# 2. split PDF into chunks
splitter, chunks = etl.split_pdf(pdf_pages)

In [21]:
chunks[0]

Document(page_content='\n\n\n\n\n\n\n\n\n\nTopic\tTopic (custom)\tOne-line summary\tFull summary of comment\nUser reporting and complaints (U2U and search) \t\tEvery service should have a dedicated reporting channel for fraud, because there is lots of fraud on a range of platforms. All services should have a dedicated channel for reporting fraud. We see and hear about a lot of fraud on small platforms, especially dating sights and job boards. Governance and accountability \tSegmentation \tThis mitigation should apply to all services, large and small. This is b/c some small services have been taken over by scammers / human traffickers \tThis mitigation should apply to all services, large and small. A community message board or website may be low risk. But some small dating sites, job boards, investment and eCommerce sites have been taken over by scammers and even human traffickers, in our experience. At least specific risk services like dating and job apps should have governance bodies)

In [22]:
# 3. Put chunks into json file and store in empty_paragraph folder
dict_chunks = defaultdict(list)
for par in chunks:
    dict_chunks[par.page_content]=[]
outfile = empty_par_folder + "/overview_gov_class.json"
with open(outfile, "w") as outfile: 
    json.dump(dict_chunks, outfile, indent = 4)

In [23]:
outfile

<_io.TextIOWrapper name='/home/azureuser/cloudfiles/code/Users/Omololu.Makinde/work/outputs/results/empty_paragraph/overview_gov_class.json' mode='w' encoding='UTF-8'>

In [24]:
# 4. Get chunks from json file
outfile = empty_par_folder + "/overview_gov_class.json"
with open(str(outfile), 'r') as empt_par:
    strored_chunk = json.load(empt_par)

In [25]:
# 5. Tag chunk using classifier
res = etl.tag_pdf(strored_chunk, topic="governance and accountability")

In [26]:
res

defaultdict(list,
            {'Topic\tTopic (custom)\tOne-line summary\tFull summary of comment\nUser reporting and complaints (U2U and search) \t\tEvery service should have a dedicated reporting channel for fraud, because there is lots of fraud on a range of platforms. All services should have a dedicated channel for reporting fraud. We see and hear about a lot of fraud on small platforms, especially dating sights and job boards. Governance and accountability \tSegmentation \tThis mitigation should apply to all services, large and small. This is b/c some small services have been taken over by scammers / human traffickers \tThis mitigation should apply to all services, large and small. A community message board or website may be low risk. But some small dating sites, job boards, investment and eCommerce sites have been taken over by scammers and even human traffickers, in our experience. At least specific risk services like dating and job apps should have governance bodies). Governanc

In [27]:
# 6. Put tagged chunks into jason file and store in tagged_paragraph folder
tagged_output = tagged_par_folder + "/overview_gov_class.json"
with open(tagged_output, "w") as tagged_par: 
    json.dump(res, tagged_par, indent = 4)


# 1. pull pdf
pdf_pages = etl.get_pdf()
# 2. split PDF into chunks
splitter, chunks = etl.split_pdf(pdf_pages)
# 3. Put chunks into json file and store in empty_paragraph folder
dict_chunks = defaultdict(list)
for par in chunks:
    dict_chunks[par].append('')
outfile = empty_par_folder + "/overview_gov_class.json"
with open(outfile, "w") as outfile: 
    json.dump(dict_chunks, outfile, indent = 4)


# 4. Get chunks from json file
with open(outfile) as json_file:
    strored_chunk = json.load(json_file)
# 5. Tag chunk using classifier
res = etl.tag_pdf(strored_chunk, topic="governance and accountability")

# 6. Put tagged chunks into jason file and store in tagged_paragraph folder
