#### Hardware checks

##### checking torch

In [None]:
import torch
print(torch.__version__)  # Outputs the installed PyTorch version
print(torch.cuda.is_available())  # Checks GPU availability


2.5.1+cu121
True


##### checking tensorflow

In [None]:
import tensorflow as tf
print(tf.__version__)  # Outputs the installed TensorFlow version
print(tf.config.list_physical_devices('GPU'))  # Checks GPU availability


2.17.1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


If your GPU supports both, please use pytorch, because its beginners friendly and most of hugging face tutorials written based on pytorch. I am going to use pytorch tensors in this notebook










## Imports

In [2]:
from IPython.display import clear_output

In [3]:
!pip install pyspellchecker
clear_output()

In [4]:
!pip install spacy
!python -m spacy download en_core_web_sm
clear_output()

In [5]:
!pip install transformers datasets
clear_output()

In [6]:
# Importing libraries

import pandas as pd
import numpy as np

# Plotting libraries

import plotly.express as px

# Text processing libraries

import re
import string
from spellchecker import SpellChecker
import spacy

# Huggingface libraries

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction

# Machine learning libraries

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report, precision_score, recall_score

import torch

In [7]:
# Mounting google drive

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [8]:
# Importing dataset

df = pd.read_csv("/content/drive/MyDrive/colab_shared/Datasets/jarvis-hiring-dataset/Resume.csv")

## EDA

In [None]:
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [None]:
df.shape

(2484, 4)

In [None]:
df.columns

Index(['ID', 'Resume_str', 'Resume_html', 'Category'], dtype='object')

In [None]:
# Category distribution

class_counts = df["Category"].value_counts().reset_index()
class_counts.columns = ["Category", "Count"]

fig = px.bar(class_counts, x="Category", y="Count", title="Category Distribution", color = "Category")
fig.show()

## Handling missing values

In [None]:
# There is missing value at id=12632728

# [it is not showing with df.isnull() method. because it empty string rather than missing value. I can only detect during the training process at the end]

df[df["ID"]==12632728]

Unnamed: 0,ID,Resume_str,Resume_html,Category
656,12632728,,"<div class=""fontsize fontface vmargins hmargin...",BUSINESS-DEVELOPMENT


In [None]:
# Fill the missing value by extracting resume text from the feature "Resume_html"

from bs4 import BeautifulSoup

html_content = df[df["ID"]==12632728]["Resume_html"].values[0]

# Create a BeautifulSoup object

soup = BeautifulSoup(html_content, 'html.parser')

# Extract all text from the HTML, including nested elements

all_text = soup.get_text(separator='\n')  # Use a separator for better readability

all_text


' \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n '

In [None]:
# Removing row with id=12632728

df = df[df['ID'] != 12632728].copy()

In [None]:
# Dropping "Resume_html" and "ID" which is not needed for fine tuning

df.drop(labels=["Resume_html", "ID"], axis = 1, inplace = True)

## Text preprocessing

In [None]:

preprocessing_steps = []

# Remove HTML tags

def remove_html_tag(text):
    cleaned_text = re.sub(r"<.*?>","", text)
    return cleaned_text

preprocessing_steps.append(remove_html_tag)


# Remove URL

def remove_url(text):
    url_pattern = r"https?://\S+|www\.\S+|\S+\.\S{2,}"
    cleaned_text = re.sub(url_pattern,"", text)
    return cleaned_text

preprocessing_steps.append(remove_url)


# Remove Punctuation

def remove_punctuation(text):
    translator = str.maketrans("","", string.punctuation)
    return text.translate(translator)

preprocessing_steps.append(remove_punctuation)


# Lower case - comment lower_case function in case of fine-tuning the cased models. ex: bert-base-cased

def lower_case(text):
    return text.lower().strip()


preprocessing_steps.append(lower_case)


# Remove Unicodes - only applicable for english language. Because other language letters represented as unicodes.

def remove_unicode(text):
    unicode_pattern = str.maketrans("","","\xa0")
    return text.translate(unicode_pattern)

preprocessing_steps.append(remove_unicode)



# Remove Escape sequences (\n, \t, \r)

def remove_esc_seq(text):
    cleaned_text = re.sub(r"\\[nt\r]"," ",text)
    return cleaned_text

preprocessing_steps.append(remove_esc_seq)


# Remove Stop words using spacy

spacy.prefer_gpu() # using GPU if available. may reduce the run time.

nlp = spacy.load("en_core_web_sm")
def remove_stop_words(text):
    doc = nlp(text)
    return " ".join([token.text for token in doc if not token.is_stop])

preprocessing_steps.append(remove_stop_words)


# Remove irrelevant white spaces

def remove_irrelevant_white_spaces(text):
    text = str(text)
    cleaned_text = re.sub(r"\s+"," ",text)
    return cleaned_text

preprocessing_steps.append(remove_irrelevant_white_spaces)


In [None]:
for step in preprocessing_steps:
  df["Resume_str"] = df["Resume_str"].apply(step)

In [None]:
# Renaming columns from ["Resume_str", "Category"] to ["text","label"]

df.columns = ["text","label"]

"""
Removing stop words can take some time(8 minutes with colab t4 runtime). So I am saving the preprocessed data to csv file. we can import the csv as dataframe for further process.
"""

#df.to_csv("/content/drive/MyDrive/colab_shared/Datasets/jarvis-hiring-dataset/resume_preprocessed.csv", index = False)

'\nRemoving stop words can take some time(8 minutes with colab t4 runtime). So I am saving the preprocessed data to csv file. we can import the csv as dataframe for further process.\n'

## Fine tuning the bert-base-uncased model

### data preparation for fine tuning

In [None]:
# Importing preprocessed data

df = pd.read_csv("/content/drive/MyDrive/colab_shared/Datasets/jarvis-hiring-dataset/resume_preprocessed.csv")

In [None]:
df.head()

Unnamed: 0,text,label
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,HR DIRECTOR Summary Over 2...,HR
3,HR SPECIALIST Summary Dedica...,HR
4,HR MANAGER Skill Highlights ...,HR


In [None]:
# label count

n_classes = len(df["label"].unique())

#### label preparation

In [None]:
# One-Hot Encoding

encoder = OneHotEncoder(sparse_output=False)

# Saving the labels for decoding the classification

encoding_labels = list(encoder.fit(df[["label"]]).categories_[0])

# Encoding the labels

encoded_labels = encoder.fit_transform(df[["label"]])
encoded_labels = encoded_labels.tolist()
df["labels"] = encoded_labels



In [None]:
# target class

print(f"Total number of classes: {n_classes}")
print(f"Target classes: {encoding_labels}")

Total number of classes: 24
Target classes: ['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS', 'AUTOMOBILE', 'AVIATION', 'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT', 'CHEF', 'CONSTRUCTION', 'CONSULTANT', 'DESIGNER', 'DIGITAL-MEDIA', 'ENGINEERING', 'FINANCE', 'FITNESS', 'HEALTHCARE', 'HR', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS', 'SALES', 'TEACHER']


In [None]:
# Calculate the average number of tokens. bert-base-uncased supports 512 tokens in single input

df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
average_word_count = df['word_count'].mean()
df.drop(labels=["word_count"], axis = 1, inplace = True)

print(f"Average number of text': {average_word_count}")

Average number of text': 811.652436568667


In [None]:
# Split the dataset into train (70%) and temp (30%)

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])

# Split the temp dataset into test (20%) and validation (10%)

val_df, test_df = train_test_split(temp_df, test_size=1/3, random_state=42, stratify=temp_df['label'])

#  Convert DataFrames to Hugging Face Datasets

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

# Removing unwanted columns

train_dataset = train_dataset.remove_columns(["__index_level_0__"])
test_dataset = test_dataset.remove_columns(["__index_level_0__"])
val_dataset = val_dataset.remove_columns(["__index_level_0__"])

# Combine into a DatasetDict

dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})



In [None]:
# Removing "label"

dataset_dict = dataset_dict.remove_columns(["label"])

In [None]:
# Check the split sizes of fine tuning data

print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1738
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 496
    })
})


In [None]:
# Check the split sizes of testing data

print(test_dataset)


# Saving the test data for inference model testing

test_dataframe = test_dataset.to_pandas()
test_dataframe.to_csv("/content/drive/MyDrive/colab_shared/Datasets/jarvis-hiring-dataset/test_data.csv", index = False)

Dataset({
    features: ['text', 'label', 'labels'],
    num_rows: 249
})


## Model fine-tuning

In [None]:
# Importing Tokenizer and model from hugging face

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

#
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                      problem_type="multi_label_classification",
                                                      num_labels=n_classes)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**You might see this message.**

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

**I am using HF_TOKEN thats why you dont see in above cell output. Its your preference to be honest**

In [None]:
# Tokenizer

def tokenizer_helper(data_point):

  return tokenizer(data_point["text"], truncation=True, padding="max_length", return_tensors="pt")


In [None]:
# Tokenization

dataset_tokenized = dataset_dict.map(tokenizer_helper, batched = True)

Map:   0%|          | 0/1738 [00:00<?, ? examples/s]

Map:   0%|          | 0/496 [00:00<?, ? examples/s]

In [None]:
dataset_tokenized = dataset_tokenized.remove_columns("text")

In [None]:
# Preparing training arguments

training_args = TrainingArguments(
    run_name="bert-finetuned-resume-classification",
    output_dir='./results',           # Directory for saving model checkpoints and logs
    eval_strategy="epoch",     # Evaluate the model every N steps
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    num_train_epochs=5,              # Number of training epochs
    weight_decay=0.01,               # Weight decay for regularization
    load_best_model_at_end=True,     # Load the best model at the end of training
    metric_for_best_model="f1")


In [None]:
# Metrics

def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [None]:
# Trainer

trainer = Trainer(
    model,
    args = training_args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.174483,0.0,0.5,0.0
2,No log,0.161599,0.0,0.5,0.0
3,0.206400,0.138643,0.0,0.5,0.0
4,0.206400,0.122349,0.0,0.5,0.0
5,0.134100,0.117229,0.004024,0.501008,0.002016


TrainOutput(global_step=1090, training_loss=0.16592022432099787, metrics={'train_runtime': 224.5244, 'train_samples_per_second': 38.704, 'train_steps_per_second': 4.855, 'total_flos': 2286886708592640.0, 'train_loss': 0.16592022432099787, 'epoch': 5.0})

In [None]:
# Saving the model

model.save_pretrained("bert-resume-classification")

In [None]:
# Saving the tokenizer

tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

you can use this fine-tuned model at :https://huggingface.co/Naandhu/bert-resume-classifier

## Evaluation with testing data

In [None]:
# Importing test data

test = pd.read_csv("/content/drive/MyDrive/colab_shared/Datasets/jarvis-hiring-dataset/test_data.csv")

In [None]:
# Importing Tokenizer and model from hugging face

model_name = "Naandhu/bert-resume-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
def classify(text):
  data = tokenizer(text, truncation=True, padding="max_length", return_tensors="pt")
  result = model(**data)
  logits = result.logits
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(logits.squeeze().cpu())
  temp = probs.sort()
  return temp[-1][-1].item()

In [None]:
# class2id and id2class - helps to map between encoded classes and string classes

id2class = {i:c for i,c in enumerate(encoding_labels)}
class2id = {c:i for i,c in enumerate(encoding_labels)}

In [None]:
test["y_test"] = test["label"].map(class2id)
test["y_pred"] = test["text"].apply(classify)


In [None]:
test["y_pred_class"] = test["y_pred"].map(id2class)


In [None]:
test.head(5)

Unnamed: 0,text,label,labels,y_test,y_pred,y_pred_class
0,client advocate escalation specialist summary ...,ADVOCATE,[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,1,1,ADVOCATE
1,park operations hostess summary highly experie...,BANKING,[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. ...,7,7,BANKING
2,cs general construction worker summary hardwor...,CONSTRUCTION,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. ...,11,11,CONSTRUCTION
3,video director east coast video enterprise bra...,ARTS,[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,4,17,FITNESS
4,tutoring consultant executive profile motivate...,ARTS,[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,4,12,CONSULTANT


### Basic Evaluation

In [None]:
# Basic Metrics

classes = test["label"].unique()
y_test = test["y_test"]
y_pred = test["y_pred"]
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
f1_macro = f1_score(y_test, y_pred, average='macro')
precision_micro = precision_score(y_test, y_pred, average='micro')
recall_micro = recall_score(y_test, y_pred, average='micro')
f1_micro = f1_score(y_test, y_pred, average='micro')

print("\nBasic Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro Precision: {precision_macro:.4f}")
print(f"Macro Recall: {recall_macro:.4f}")
print(f"Macro F1 Score: {f1_macro:.4f}")
print(f"Micro Precision: {precision_micro:.4f}")
print(f"Micro Recall: {recall_micro:.4f}")
print(f"Micro F1 Score: {f1_micro:.4f}")

# 3. Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names= encoding_labels, zero_division=0))




Basic Metrics:
Accuracy: 0.8032
Macro Precision: 0.7279
Macro Recall: 0.7257
Macro F1 Score: 0.6946
Micro Precision: 0.8032
Micro Recall: 0.8032
Micro F1 Score: 0.8032

Classification Report:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.86      1.00      0.92        12
              ADVOCATE       0.90      0.75      0.82        12
           AGRICULTURE       1.00      0.17      0.29         6
               APPAREL       1.00      0.30      0.46        10
                  ARTS       0.00      0.00      0.00        10
            AUTOMOBILE       0.00      0.00      0.00         3
              AVIATION       0.75      1.00      0.86        12
               BANKING       0.71      0.83      0.77        12
                   BPO       0.00      0.00      0.00         2
  BUSINESS-DEVELOPMENT       1.00      1.00      1.00        12
                  CHEF       0.86      1.00      0.92        12
          CONSTRUCTION       1.00     


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### Class Distribution imbalance vs classifier's performance

In [None]:
# class
classes = encoding_labels

# Total number of classes
class_counts.set_index("Category", inplace = True)

# Calculate true predictions

true_predictions = test[test['y_test'] == test['y_pred']]['label'].value_counts().reindex(classes, fill_value =0)

# Calculate mispredictions
mispredictions = test['label'].value_counts() - true_predictions



In [None]:
plot_data = pd.concat([class_counts, true_predictions, mispredictions], axis=1)
plot_data.columns = ["Total_samples", "True_predictions", "Mispredictions"]

In [None]:
# Reset the index for Plotly compatibility
plot_data = plot_data.reset_index()
plot_data.rename(columns={'index': 'Class'}, inplace=True)

# Reshape for Plotly (melt the DataFrame)
plot_data_melted = plot_data.melt(id_vars='Class', var_name='Metric', value_name='Count')

# Plot using Plotly
fig = px.bar(plot_data_melted, x='Class', y='Count', color='Metric', barmode='group',
             title='Class Imbalance Effect on Predictions',
             labels={'Count': 'Number of Samples', 'Class': 'Classes'},
             height=600)

fig.update_layout(xaxis=dict(title="Classes"),
                  yaxis=dict(title="Number of Samples"),
                  legend=dict(title="Metric"))

fig.show()

Due to the limited number of samples in certain classes, coupled with a higher rate of mispredictions compared to true predictions, the model’s performance for these classes is suboptimal. To improve the model's overall accuracy and generalization, it is crucial to acquire more data from these underrepresented classes, ensuring a more balanced distribution and better predictive performance.

## References:

1. HuggingFace : https://huggingface.co/google-bert/bert-base-uncased
2. Bert base uncased fine tuning guide :https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb
3. Computing metrics: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

## Revision branch  (setup) - updated preprocess and post process configuration



In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

model_name = "Naandhu/bert-resume-classifier"
branch_name = "setup"


model = AutoModelForSequenceClassification.from_pretrained(model_name, revision = branch_name, num_labels = 24)
tokenizer = AutoTokenizer.from_pretrained(model_name, revison= branch_name)
config = AutoConfig.from_pretrained(model_name, revision = branch_name)

# creating preprocess file and post_process funcitons

exec(config.preprocess_function)
exec(config.postprocess_function)

In [13]:
test_data = pd.read_csv("/content/drive/MyDrive/colab_shared/Datasets/jarvis-hiring-dataset/test_data.csv")

In [21]:
preprocessed_text = test_data["text"][5]
print(preprocessed_text[:200])
print(f"Number of tokens: {len(preprocessed_text.split(' '))}")

         CONSTRUCTION HELPER       Summary     Hardworking and   Experienced Construction Worker who is dependable,  reliable and  knowledgeable about the tools, materials and methods used in construc
Number of tokens: 992


In [22]:
preprocessed_text = preprocess(preprocessed_text)
print(preprocessed_text[:150])
print(f"Number of tokens: {len(preprocessed_text.split(' '))}")

 CONSTRUCTION HELPER Summary Hardworking Experienced Construction Worker dependable , reliable knowledgeable tools , materials methods construction . 
Number of tokens: 675


In [23]:
tokenized_text = tokenizer(preprocessed_text, truncation=True, padding="max_length", return_tensors="pt")

In [24]:
output = model(**tokenized_text)

In [25]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-3.3240, -3.1855, -3.3238, -3.0864, -2.7547, -3.4237, -2.4859, -3.0381,
         -3.6489, -2.6859, -3.1311, -0.8631, -3.1753, -2.6113, -3.0261, -2.3336,
         -3.1368, -2.9739, -2.8530, -2.8185, -3.1003, -2.8905, -3.0700, -3.1385]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [26]:
classes = post_process(output)

In [27]:
print(classes)

CONSTRUCTION


In [28]:
print(test_data["label"][5])

CONSTRUCTION
