In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch

In [None]:
import re

## Load and preprocess the dataset

In [None]:
# Read the resume CSV and keep only the raw resume text and its category label.
# NOTE: this is an absolute, machine-specific path; adjust it for your setup.
resume_csv_path = "C:/Clubs and Society/DevHack/DevHack/dataset/Resume/Resume.csv"
kept_columns = ['Resume_str', 'Category']
df = pd.read_csv(resume_csv_path)[kept_columns]
df.head(5)

In [None]:
# Compiled once so that applying the function row by row reuses the same
# pattern objects.
_TITLE_PATTERN = re.compile(r"(?i)(.*?)(?=\bSummary\b)", re.DOTALL)
_SKILLS_PATTERN = re.compile(r"(?i)\bSkills\b.*?\n(.*?)\n", re.DOTALL)


def extract_title_and_skills(resume_text):
    """Extract a resume's title and first skills line, joined by a newline.

    The title is everything that precedes the first whole-word occurrence of
    "Summary" (case-insensitive), with surrounding whitespace stripped.

    The skills are the contents of the line that follows the first line
    containing the whole word "Skills" (case-insensitive). That following line
    must itself be terminated by a newline to be picked up.

    Args:
        resume_text: Raw resume text. Non-string values (for example ``None``
            or the NaN pandas uses for missing cells) are treated as empty
            text.

    Returns:
        A string of the form ``title + newline + skills``. A part that cannot
        be found is an empty string, so the newline separator is always
        present.
    """
    if not isinstance(resume_text, str):
        resume_text = ""

    # Fall back to "" rather than None: concatenating None onto a string
    # raised TypeError for any resume lacking one of the two headings.
    title_match = _TITLE_PATTERN.search(resume_text)
    title = title_match.group(1).strip() if title_match else ""

    skills_match = _SKILLS_PATTERN.search(resume_text)
    skills = skills_match.group(1).strip() if skills_match else ""

    return f"{title}\n{skills}"

In [None]:
# Build the condensed title-plus-skills text for every resume, then narrow the
# frame to that text and the category label.
title_skills_text = df['Resume_str'].apply(extract_title_and_skills)
df = df.assign(title_skills=title_skills_text)[['title_skills', 'Category']]
df.head(5)

In [None]:
# Load the pretrained lowercase BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

# Re-wrap the data in a DataFrame.
df = pd.DataFrame(df)

# Tokenize and preprocess the data
def preprocess_text(text, max_len=512):
    """Encode one text into fixed-length BERT inputs.

    Args:
        text: The string to tokenize.
        max_len: Target sequence length, special tokens included. Shorter
            inputs are padded up to it and longer inputs are truncated to it.

    Returns:
        The tokenizer's encoding holding PyTorch tensors (``return_tensors="pt"``),
        including ``input_ids`` and ``attention_mask``.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        # Padding alone never shortens a sequence. Without explicit truncation,
        # texts longer than max_len keep their full length, which yields
        # ragged tensors that cannot be concatenated and exceeds the 512
        # positions bert-base-uncased supports.
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

# Tokenize the condensed 'title_skills' text. By this point the frame has been
# narrowed to ['title_skills', 'Category'], so 'Resume_str' is no longer a
# column and indexing it raised KeyError on a fresh top-to-bottom run.
df['tokens'] = df['title_skills'].apply(preprocess_text)

# Concatenate the per-row encodings along the first (batch) dimension.
input_ids = torch.cat([encoding['input_ids'] for encoding in df['tokens']], dim=0)
attention_masks = torch.cat([encoding['attention_mask'] for encoding in df['tokens']], dim=0)

# Integer-encode the category labels. `cat.codes` uses a compact signed integer
# dtype (int8 for a small number of categories), whereas PyTorch classification
# losses expect int64 class indices, so request torch.long explicitly.
labels = torch.tensor(
    df['Category'].astype('category').cat.codes.values,
    dtype=torch.long,
)

# Bundle the tensors and serve them in shuffled mini-batches of two.
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
