In [None]:
import requests
import time

def get_dataset_rows(dataset, config, split, offset=0, limit=100,
                     max_retries=5, backoff=2.0, timeout=30):
    """Fetch one page of rows from the Hugging Face datasets-server ``/rows`` endpoint.

    Args:
        dataset: Dataset repository id, e.g. ``"user/name"``.
        config: Dataset configuration name.
        split: Split name, e.g. ``"train"``.
        offset: Index of the first row to return.
        limit: Number of rows to request. Sent as the endpoint's ``length``
            query parameter (the documented name for the page size).
        max_retries: How many times an HTTP 429 (rate limited) answer is retried
            before giving up.
        backoff: Base delay in seconds between retries; doubled after every
            attempt unless the server supplies a numeric ``Retry-After`` header.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        The list stored under ``"rows"`` in the JSON answer, or ``[]`` when the
        request ultimately fails (the HTTP status is printed in that case).
    """
    url = "https://datasets-server.huggingface.co/rows"
    # Let requests build and percent-encode the query string (dataset ids contain "/").
    params = {
        "dataset": dataset,
        "config": config,
        "split": split,
        "offset": offset,
        "length": limit,
    }

    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code == 200:
            return response.json().get("rows", [])

        if response.status_code == 429 and attempt < attempts - 1:
            # Rate limited: wait and retry rather than returning [] straight away,
            # which callers interpret as "no more data".
            try:
                delay = float(response.headers.get("Retry-After"))
            except (TypeError, ValueError):
                delay = backoff * (2 ** attempt)
            time.sleep(max(0.0, delay))
            continue

        # Any other status (or a 429 after the last retry) is a hard failure.
        break

    print(f"Erreur HTTP {response.status_code}")
    return []

# Coordinates of the split to download and the paging state.
dataset, config, split = "cnamuangtoun/resume-job-description-fit", "default", "train"
limit, offset = 100, 0

all_rows = []

# Request page after page; an empty page ends the download.
while (rows := get_dataset_rows(dataset, config, split, offset, limit)):
    all_rows += rows
    print(f"Fetched {len(rows)} rows (total so far: {len(all_rows)})")
    offset += limit
    time.sleep(0.5)  # short pause between requests to stay under the rate limit

print(f"Total rows collected: {len(all_rows)}")


Fetched 100 rows (total so far: 100)
Fetched 100 rows (total so far: 200)
Fetched 100 rows (total so far: 300)
Fetched 100 rows (total so far: 400)
Fetched 100 rows (total so far: 500)
Fetched 100 rows (total so far: 600)
Fetched 100 rows (total so far: 700)
Fetched 100 rows (total so far: 800)
Fetched 100 rows (total so far: 900)
Fetched 100 rows (total so far: 1000)
Fetched 100 rows (total so far: 1100)
Fetched 100 rows (total so far: 1200)
Fetched 100 rows (total so far: 1300)
Fetched 100 rows (total so far: 1400)
Fetched 100 rows (total so far: 1500)
Fetched 100 rows (total so far: 1600)
Fetched 100 rows (total so far: 1700)
Fetched 100 rows (total so far: 1800)
Fetched 100 rows (total so far: 1900)
Fetched 100 rows (total so far: 2000)
Fetched 100 rows (total so far: 2100)
Fetched 100 rows (total so far: 2200)
Fetched 100 rows (total so far: 2300)
Fetched 100 rows (total so far: 2400)
Fetched 100 rows (total so far: 2500)
Fetched 100 rows (total so far: 2600)
Fetched 100 rows (tot

In [None]:
import requests
import time

# Resume the download where the previous attempt stopped. The start offset is
# derived from the rows already held in `all_rows`, so the cell can be re-run
# after any interruption without skipping or duplicating rows.
offset = len(all_rows)
while True:
    rows = get_dataset_rows(dataset, config, split, offset, limit)
    if not rows:
        break  # Stop if no more data
    all_rows.extend(rows)
    print(f"Fetched {len(rows)} rows (total so far: {len(all_rows)})")
    offset += limit
    time.sleep(2)  # Respect API rate limits

print(f"Total rows collected: {len(all_rows)}")


Erreur HTTP 429
Total rows collected: 3600


In [None]:
import pandas as pd
# The cells above collect the downloaded records in `all_rows`; bind `data`
# to them explicitly so this cell (and the ones that iterate over `data`)
# work on a fresh kernel instead of relying on a leftover variable.
data = all_rows
df = pd.DataFrame(data)


In [None]:
# prompt: transform this dataset to dataframe labeled with job_description_text,resume_text and fit

import pandas as pd

# Assuming `data` already holds the raw records collected by the previous code.

if data:
    # Each record wraps its fields under the 'row' key; unwrap once, then pull
    # out the three fields of interest (missing keys become '').
    payloads = [entry['row'] for entry in data]
    job_descriptions = [payload.get('job_description_text', '') for payload in payloads]
    resume_texts = [payload.get('resume_text', '') for payload in payloads]
    fits = [payload.get('label', '') for payload in payloads]

    df = pd.DataFrame(
        {
            'job_description_text': job_descriptions,
            'resume_text': resume_texts,
            'label': fits,
        }
    )

df.head()
len(df)

3600

In [None]:
# Binarise the fit label: 'No Fit' becomes 0, while both 'Potential Fit' and
# 'Good Fit' become 1.
binary_fit_codes = {'No Fit': 0, 'Potential Fit': 1, 'Good Fit': 1}
df['label'] = df['label'].replace(binary_fit_codes)
df.head()

  df['label'] = df['label'].replace({'No Fit': 0,'Potential Fit':1,'Good Fit':1})


Unnamed: 0,job_description_text,resume_text,label
0,Net2Source Inc. is an award-winning total work...,SummaryHighly motivated Sales Associate with e...,0
1,At Salas OBrien we tell our clients that were ...,Professional SummaryCurrently working with Cat...,0
2,Schweitzer Engineering Laboratories (SEL) Infr...,SummaryI started my construction career in Jun...,0
3,"Mizick Miller & Company, Inc. is looking for a...",SummaryCertified Electrical Foremanwith thirte...,0
4,Life at Capgemini\nCapgemini supports all aspe...,SummaryWith extensive experience in business/r...,0


In [None]:
import pandas as pd
# Stack three 100-row windows of df (positions 0-99, 3300-3399 and 6000-6099).
# NOTE(review): a window that lies beyond the end of df contributes no rows.
row_windows = [(0, 100), (3300, 3400), (6000, 6100)]
df_subset = pd.concat([df.iloc[start:stop] for start, stop in row_windows])


In [None]:
# Preview the first rows of the sampled subset.
df_subset.head()

Unnamed: 0,job_description_text,resume_text,label
0,Net2Source Inc. is an award-winning total work...,SummaryHighly motivated Sales Associate with e...,0
1,At Salas OBrien we tell our clients that were ...,Professional SummaryCurrently working with Cat...,0
2,Schweitzer Engineering Laboratories (SEL) Infr...,SummaryI started my construction career in Jun...,0
3,"Mizick Miller & Company, Inc. is looking for a...",SummaryCertified Electrical Foremanwith thirte...,0
4,Life at Capgemini\nCapgemini supports all aspe...,SummaryWith extensive experience in business/r...,0


In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import os

# 1. Convert df_subset to InputExamples with a similarity score (0 to 1)
examples = [
    InputExample(
        texts=[row["resume_text"], row["job_description_text"]],
        label=float(row["label"])  # This must be a float between 0 and 1
    ) for _, row in df_subset.iterrows()
]

# CosineSimilarityLoss regresses the cosine score onto the label, so a target
# outside [0, 1] (for example a class id of 2) is not a usable similarity target.
# Fail early instead of training on it silently.
invalid_labels = sorted({ex.label for ex in examples if not 0.0 <= ex.label <= 1.0})
if invalid_labels:
    raise ValueError(
        f"Labels must lie in [0, 1] for CosineSimilarityLoss, found: {invalid_labels}"
    )

# 2. Train/dev split
train_examples, dev_examples = train_test_split(examples, test_size=0.1, random_state=42)

# 3. DataLoaders
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
dev_dataloader = DataLoader(dev_examples, shuffle=False, batch_size=16)

# 4. Load pretrained SBERT model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# 5. CosineSimilarityLoss (for similarity regression)
train_loss = losses.CosineSimilarityLoss(model=model)

# 6. Evaluator based on cosine similarity
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(dev_examples, name='dev-similarity')

# 7. Output path
output_path = "./output/fine-tuned-sbert-similarity"
os.makedirs(output_path, exist_ok=True)

# 8. Training schedule. Warm up over 10% of the optimisation steps: a fixed
# warm-up of 100 steps can exceed the whole run on a subset of a few hundred
# examples, in which case the learning rate never reaches its target value.
num_epochs = 4
warmup_steps = int(0.1 * len(train_dataloader) * num_epochs)

# Without this the trainer tries to log to Weights & Biases and blocks on an
# interactive API-key prompt. setdefault keeps any explicit user setting.
os.environ.setdefault("WANDB_DISABLED", "true")

# 9. Train using model.fit (train_objectives = list of (DataLoader, Loss) tuples)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=500,
    warmup_steps=warmup_steps,
    output_path=output_path,
    save_best_model=True
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:


Abort: 

In [None]:
!pip install -U "sentence-transformers[train]" # Install the 'accelerate' package, required for SentenceTransformerTrainer
import requests
import time
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, SentenceTransformerTrainer, evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import os
import torch
# Import IterableDataset from torch.utils.data
from torch.utils.data import IterableDataset
import torch.utils.data
import builtins
# NOTE(review): this injects torch's IterableDataset into Python's builtins, so a
# bare `IterableDataset` lookup in ANY module resolves to it. Presumably a
# workaround for a NameError raised inside an installed library — TODO confirm
# and fix the root cause (e.g. restart the runtime after installing packages)
# instead of patching builtins.
builtins.IterableDataset = torch.utils.data.IterableDataset


from datasets import Dataset
from sentence_transformers import SentenceTransformerTrainingArguments

output_path = "./output/fine-tuned-sbert"
os.makedirs(output_path, exist_ok=True)

# SentenceTransformerTrainer works on a `datasets.Dataset`, not on a list of
# InputExample (a list fails with "'list' object has no attribute
# 'column_names'"). The two text columns feed the loss in order; the column
# named "score" is used as the target.
train_dataset = Dataset.from_dict({
    "sentence1": [example.texts[0] for example in train_examples],
    "sentence2": [example.texts[1] for example in train_examples],
    "score": [float(example.label) for example in train_examples],
})

# Training options live in the arguments object; the trainer has no `fit` method.
training_args = SentenceTransformerTrainingArguments(
    output_dir=output_path,
    num_train_epochs=4,
    warmup_ratio=0.1,       # warm up over the first 10% of the steps
    eval_strategy="epoch",  # run the evaluator at the end of every epoch
    report_to="none",       # do not block on a Weights & Biases login prompt
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)

# === 9. Train the Model ===
trainer.train()

# Persist the final weights so the directory can be loaded as a model below.
model.save(output_path)

# === 10. Load and test the fine-tuned model (optional) ===
fine_tuned_model = SentenceTransformer(output_path)
# Example: encode a resume and job description
resume_text = "Experienced machine learning engineer with Python experience"
job_description_text = "Seeking a data scientist with expertise in Python"

resume_embedding = fine_tuned_model.encode(resume_text)
job_description_embedding = fine_tuned_model.encode(job_description_text)
# Now you can use the embeddings for further analysis
# (e.g., calculate cosine similarity)



AttributeError: 'list' object has no attribute 'column_names'

In [None]:
from sentence_transformers import SentenceTransformer

# Candidate embedding models, loaded in the order in which later cells index them
# (model[0] ... model[3]).
model = [
    SentenceTransformer(checkpoint)
    for checkpoint in (
        'all-MiniLM-L12-v2',
        'thenlper/gte-large',
        'sentence-transformers/all-mpnet-base-v2',
        'sentence-transformers/all-MiniLM-L6-v2',
    )
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Work on an independent copy of the first 100 rows. Later cells add columns to
# df_subset, and assigning into a slice of df triggers pandas'
# SettingWithCopyWarning (and may or may not write through to df).
df_subset = df.head(100).copy()


In [None]:
from sentence_transformers import util
def calculate_cosine_similarity(model, sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded separately with ``model.encode(..., convert_to_tensor=True)``
    and the two embeddings are compared with ``util.cos_sim``; that call's result
    is returned unchanged.
    """
    first_vec, second_vec = (
        model.encode(text, convert_to_tensor=True)
        for text in (sentence1, sentence2)
    )
    return util.cos_sim(first_vec, second_vec)

In [None]:
# Sanity check: score the pair stored under index label 0 with the first loaded model.
first_pair = df_subset.loc[0]
calculate_cosine_similarity(model[0], first_pair['job_description_text'], first_pair['resume_text'])

tensor([[0.3938]])

In [None]:
# Three-class encoding: 'No Fit' -> 0, 'Good Fit' -> 1, 'Potential Fit' -> 2.
# The values are read from df (not df_subset); the assignment aligns them on the index.
fit_level_codes = {'No Fit': 0, 'Potential Fit': 2, 'Good Fit': 1}
df_subset['label'] = df['label'].replace(fit_level_codes)


  df_subset['label'] = df['label'].replace({'No Fit': 0,'Potential Fit':2,'Good Fit':1})


In [None]:
# Display the encoded labels as an array to check how the classes are distributed.
df_subset['label'].values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
# prompt: i want to add cosine result of cosine similarity as a third column to df
# Score every (job description, resume) pair with the first model, one pair at a
# time; .item() turns each single-value tensor into a plain Python float.
cosine_similarities = [
    calculate_cosine_similarity(model[0], job_text, resume).item()
    for job_text, resume in zip(df_subset['job_description_text'], df_subset['resume_text'])
]

df_subset['cosine_similarity'] = cosine_similarities


In [None]:
# Preview the subset, now including the similarity column(s) added above.
df_subset.head()

Unnamed: 0,job_description_text,resume_text,label,cosine_similarity,predict
0,Net2Source Inc. is an award-winning total work...,SummaryHighly motivated Sales Associate with e...,0,0.393769,0
1,At Salas OBrien we tell our clients that were ...,Professional SummaryCurrently working with Cat...,0,0.253783,0
2,Schweitzer Engineering Laboratories (SEL) Infr...,SummaryI started my construction career in Jun...,0,0.396062,0
3,"Mizick Miller & Company, Inc. is looking for a...",SummaryCertified Electrical Foremanwith thirte...,0,0.184229,0
4,Life at Capgemini\nCapgemini supports all aspe...,SummaryWith extensive experience in business/r...,0,0.191772,0


In [None]:
# prompt: i want to get all values of predict

# Print predictions first, then the reference labels, for a side-by-side look.
# NOTE(review): the 'predict' column is assigned in a later cell of this notebook,
# so this cell presumably raises a KeyError on a fresh top-to-bottom run —
# confirm the intended cell order.
for column_name in ('predict', 'label'):
    print(df_subset[column_name].values)

[0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 2 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0
 0 0 0 2 0 0 2 0 0 2 0 2 0 2 2 0 2 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0
 0 2 2 0 0 2 0 0 2 2 2 2 0 2 0 2 2 0 2 0 0 0 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0
 2 2 2 0 0 2 0 0 2 2 2 0 0 2 2 2 0 0 2 0 0 2 0 2 2 0 0 0 0 0 0 2 2 0 2 0 2
 0 2 0 0 0 0 2 0 2 0 0 0 2 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [None]:
# Bucket each similarity score into a class id:
# below 0.3 -> 0, above 0.7 -> 1, everything else -> 2.
predict = [
    0 if score < 0.3 else (1 if score > 0.7 else 2)
    for score in df_subset['cosine_similarity']
]
df_subset['predict'] = predict

# prompt: calculate accuracy between label and predict

from sklearn.metrics import accuracy_score

# Share of rows whose bucketed prediction equals the encoded label.
accuracy = accuracy_score(y_true=df_subset['label'], y_pred=df_subset['predict'])
print(f"Accuracy: {accuracy}")


Accuracy: 0.4041666666666667


In [None]:
# Score the subset with every loaded model and compare their accuracies.
# For model i the raw scores go to column 'cosine_similarity{i}' and the
# bucketed predictions to 'predict{i}'; the per-model lists are also kept in
# all_cosine_similarities / all_predict / all_accuracy.
all_cosine_similarities = []
all_predict = []
all_accuracy = []
for i, candidate in enumerate(model):
    cosine_similarities = [
        calculate_cosine_similarity(
            candidate, row['job_description_text'], row['resume_text']
        ).item()  # Extract the scalar value from the tensor
        for _, row in df_subset.iterrows()
    ]
    all_cosine_similarities.append(cosine_similarities)
    df_subset[f'cosine_similarity{i}'] = cosine_similarities

    # Threshold this model's own scores. Reading the shared 'cosine_similarity'
    # column here would give every model identical predictions and accuracy.
    predict = [
        0 if score < 0.3 else (1 if score > 0.8 else 2)
        for score in cosine_similarities
    ]
    df_subset[f'predict{i}'] = predict
    all_predict.append(predict)

    accuracy = accuracy_score(df_subset['label'], df_subset[f'predict{i}'])
    all_accuracy.append(accuracy)
    print(f"Accuracy of model {i}: {accuracy}")


Accuracy of model 0: 0.4041666666666667
Accuracy of model 1: 0.4041666666666667
Accuracy of model 2: 0.4041666666666667
Accuracy of model 3: 0.4041666666666667


In [None]:
import pandas as pd

# Rebuild the subset from three 100-row windows of df
# (positions 0-99, 3300-3399 and 6000-6099).
# NOTE(review): a window that lies beyond the end of df contributes no rows.
row_windows = [(0, 100), (3300, 3400), (6000, 6100)]
df_subset = pd.concat([df.iloc[start:stop] for start, stop in row_windows])


In [None]:
# Preview the rebuilt subset; labels are still the original text categories here.
df_subset.head()

Unnamed: 0,job_description_text,resume_text,label
0,Net2Source Inc. is an award-winning total work...,SummaryHighly motivated Sales Associate with e...,No Fit
1,At Salas OBrien we tell our clients that were ...,Professional SummaryCurrently working with Cat...,No Fit
2,Schweitzer Engineering Laboratories (SEL) Infr...,SummaryI started my construction career in Jun...,No Fit
3,"Mizick Miller & Company, Inc. is looking for a...",SummaryCertified Electrical Foremanwith thirte...,No Fit
4,Life at Capgemini\nCapgemini supports all aspe...,SummaryWith extensive experience in business/r...,No Fit


In [None]:
# Three-class encoding: 'No Fit' -> 0, 'Good Fit' -> 1, 'Potential Fit' -> 2.
# The values are read from df (not df_subset); the assignment aligns them on the index.
fit_level_codes = {'No Fit': 0, 'Potential Fit': 2, 'Good Fit': 1}
df_subset['label'] = df['label'].replace(fit_level_codes)

  df_subset['label'] = df['label'].replace({'No Fit': 0,'Potential Fit':2,'Good Fit':1})


In [None]:
# Number of rows in the subset.
# NOTE(review): the recorded output of this cell is a NameError, i.e. it was last
# run in a session where df_subset had not been created — run the sampling cell first.
len(df_subset)

NameError: name 'df_subset' is not defined

In [None]:
!pip install datasets
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import pandas as pd
from datasets import Dataset

import os

# NOTE(review): the recorded run of this cell ended in
# "TypeError: isinstance() arg 2 must be a type ...". Presumably `datasets` was
# installed after sentence_transformers had already been imported in the same
# session — restart the runtime after installing and re-run. TODO confirm.

# 1. Build the training pairs: one InputExample per row of df_subset, with the
#    integer class id as label.
num_labels = 3  # labels are 0, 1, 2
data = [
    InputExample(
        texts=[str(row['job_description_text']), str(row['resume_text'])],
        label=int(row['label'])
    )
    for _, row in df_subset.iterrows()
]

# SoftmaxLoss treats the label as a class index; anything outside
# 0..num_labels-1 cannot be a valid target, so fail early with a clear message.
unexpected_labels = sorted({ex.label for ex in data if ex.label not in range(num_labels)})
if unexpected_labels:
    raise ValueError(
        f"Labels must be class ids in 0..{num_labels - 1}, found: {unexpected_labels}"
    )

# 2. Load the base model
model = SentenceTransformer("all-MiniLM-L6-v2")

# 3. Create a DataLoader
train_dataloader = DataLoader(data, shuffle=True, batch_size=16)

# 4. Define SoftmaxLoss for 3-class classification
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=num_labels
)

# 5. Training schedule. Warm up over 10% of the optimisation steps: a fixed
#    warm-up of 100 steps can exceed the whole run on a few hundred examples.
num_epochs = 3
warmup_steps = int(0.1 * len(train_dataloader) * num_epochs)

# Avoid blocking on an interactive Weights & Biases login prompt during fit().
os.environ.setdefault("WANDB_DISABLED", "true")

# 6. Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
    output_path="output/fine-tuned-model"
)

# 7. Save the model
model.save("output/fine-tuned-model")



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union