# Get the original text data, import libs, and show some samples

In [1]:
!wget https://huggingface.co/datasets/bezirganyan/LUMA/resolve/main/text_data.tsv

--2024-11-26 13:04:15--  https://huggingface.co/datasets/bezirganyan/LUMA/resolve/main/text_data.tsv
Resolving huggingface.co (huggingface.co)... 3.167.112.45, 3.167.112.25, 3.167.112.96, ...
Connecting to huggingface.co (huggingface.co)|3.167.112.45|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/91/6d/916d91190e01032d29384546038721047bf54420deab1052a661200d4b695d3d/34fdb88e6c762deda6bc625f7eb51394a584ff9306a9d7753ab223859754e42a?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27text_data.tsv%3B+filename%3D%22text_data.tsv%22%3B&response-content-type=text%2Ftab-separated-values&Expires=1732885455&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMjg4NTQ1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzkxLzZkLzkxNmQ5MTE5MGUwMTAzMmQyOTM4NDU0NjAzODcyMTA0N2JmNTQ0MjBkZWFiMTA1MmE2NjEyMDBkNGI2OTVkM2QvMzRmZGI4OGU2Yzc2MmRlZGE2YmM2MjVmN2ViNTEzOTRhNTg0ZmY5MzA2YTlk

In [6]:
import pandas as pd
import random
import os
from tqdm.notebook import tqdm

In [3]:
data = pd.read_csv('text_data.tsv', sep='\t')

In [4]:
print(f'lenght of data: {len(data)}')

lenght of data: 62875


In [5]:
idx = random.randint(0, len(data))
print(f'sample {idx} \nText: {data.loc[idx].text} \nLabel: {data.loc[idx].label}')

sample 36609 
Text: A rabbit is a fluffy and cuddly animal that lives in a burrow. It is known for its quick hopping, soft fur, and sweet teeth. Rabbits are often kept as pets. They eat carrots, lettuce, and other vegetables. 
Label: rabbit


# Testing some models

We first filter the biases from non biased

## Binary classifier

### Inference

In [7]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("d4data/bias-detection-model")
model = TFAutoModelForSequenceClassification.from_pretrained("d4data/bias-detection-model")

classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, device=0) # cuda = 0,1 based on gpu availability

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/268M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at d4data/bias-detection-model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [8]:
# test on one sample

output = classifier(data.loc[idx].text)
output[0]

In [9]:
data['text_length'] = data['text'].str.len()
data_sorted = data.sort_values(by='text_length')

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['text']
        return {
            'text': text,
        }

# Example usage
BATCH_SIZE = 32 # Adjust as needed

dataset = TextDataset(data_sorted)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

In [18]:
# Batching the input
i = 0
results = []
for batch in tqdm(dataloader):
    i += 1
    try:
        batch_results = classifier(batch['text'])
        results.extend(batch_results)
    except Exception as e:
        print(f"Error processing batch {i}: {e}")

  0%|          | 0/1965 [00:00<?, ?it/s]

In [19]:
data_sorted['results'] = results
data_sorted.to_csv('binary_classification_results.csv', index=False)

In [20]:
data_sorted

Unnamed: 0,text,label,text_length,results
17416,A bottle of water vapor suspended in the air,cloud,44,"{'label': 'Non-biased', 'score': 0.67796951532..."
18020,"A fox is quick and sly, it's a cunning animal.",fox,46,"{'label': 'Non-biased', 'score': 0.59829854965..."
46835,Plain clothes are comfortable to wear all day.,plain,46,"{'label': 'Biased', 'score': 0.6522274017333984}"
48252,We often gather around the table to eat and chat.,table,49,"{'label': 'Biased', 'score': 0.5837098956108093}"
4710,I'm planning on moving into a new house next m...,house,51,"{'label': 'Non-biased', 'score': 0.67790681123..."
...,...,...,...,...
17460,Clouds have traversed a storied path throughou...,cloud,1483,"{'label': 'Biased', 'score': 0.9771029949188232}"
19562,"Opuut: Castles, majestic structures steeped in...",castle,1486,"{'label': 'Biased', 'score': 0.9805306196212769}"
31717,"The frog, a beloved amphibian creature, has a ...",frog,1490,"{'label': 'Biased', 'score': 0.9920835494995117}"
17456,"Cloud, a wispy, ethereal aggregation, made of ...",cloud,1492,"{'label': 'Biased', 'score': 0.7810251712799072}"


### Or load the data

In [7]:
if os.path.exists('binary_classification_results.csv'):
    data_sorted = pd.read_csv('binary_classification_results.csv')
else:
  print('file not found, run the inference first')

Unnamed: 0,text,label,text_length,results
0,A bottle of water vapor suspended in the air,cloud,44,"{'label': 'Non-biased', 'score': 0.67796951532..."
1,"A fox is quick and sly, it's a cunning animal.",fox,46,"{'label': 'Non-biased', 'score': 0.59829854965..."
2,Plain clothes are comfortable to wear all day.,plain,46,"{'label': 'Biased', 'score': 0.6522274017333984}"
3,We often gather around the table to eat and chat.,table,49,"{'label': 'Biased', 'score': 0.5837098956108093}"
4,I'm planning on moving into a new house next m...,house,51,"{'label': 'Non-biased', 'score': 0.67790681123..."
...,...,...,...,...
62870,Clouds have traversed a storied path throughou...,cloud,1483,"{'label': 'Biased', 'score': 0.9771029949188232}"
62871,"Opuut: Castles, majestic structures steeped in...",castle,1486,"{'label': 'Biased', 'score': 0.9805306196212769}"
62872,"The frog, a beloved amphibian creature, has a ...",frog,1490,"{'label': 'Biased', 'score': 0.9920835494995117}"
62873,"Cloud, a wispy, ethereal aggregation, made of ...",cloud,1492,"{'label': 'Biased', 'score': 0.7810251712799072}"


## Multiclass classification

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="PriyaPatel/bias_identificaiton45")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
classifier(data.loc[idx].text)

[{'label': 'Biased', 'score': 0.9555160403251648}]

In [None]:
data.loc[idx].text

