In [None]:
import threading
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [2]:
# Restore the saved tokenizer and sequence-classification model from disk
# (no optimizer state is loaded here).
model_path = 'final_model/'

# Use the GPU when torch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and model weights are both read from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move the weights to the chosen device; as the last expression of the cell,
# the returned module is echoed as the cell output
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
# Read the input spreadsheet into `df`.
# NOTE(review): "file_path" looks like a placeholder — confirm the real path.
df = pd.read_excel(io="file_path")

In [11]:
# Echo the current contents of `df` as the cell output.
# NOTE(review): this cell's execution count (11) is higher than that of the
# dropna cell below (10), so the saved output may have been produced after
# that step — confirm by re-running the notebook top to bottom.
df

Unnamed: 0,Domains,Web_Text
0,news.yahoo.com,yahoo news latest breaking news headlines live...
1,internationalinvestment.net,computing uk lead source analysis business tec...
2,cysec.gov.cy,επιτροπή κεφαλαιαγοράς κύπρου η αποστολή μας ε...
3,find-and-update.company-information.service.go...,find update company information company inform...
4,laverdadpa.com,la verdad panamá es una página de noticiassala...
...,...,...
13383,hipertextual.com,hipertextual tecnología ciencia y cultura digi...
13384,mdzol.com,mdz onlineel diario más importante de la argen...
13385,tycsports.com,tyc sports las noticia de deportes del canal e...
13386,a24films.com,a24the company talk uncut gems midsommar lady ...


In [10]:
# Keep only the rows that have no missing value in any column
df = df.dropna(axis=0, how="any")

In [None]:
# Normalise 'Web_Text': a missing value becomes an empty string; anything else
# is stringified, split on whitespace, and re-joined with single spaces from
# at most the first 1024 words.
def _clip_words(value, limit=1024):
    """Return `value` as space-joined text capped at `limit` words ('' if missing)."""
    if pd.isna(value):
        return ''
    words = str(value).split()
    return ' '.join(words[:limit])

df['Web_Text'] = df['Web_Text'].apply(_clip_words)

# Disabled step, kept for reference:
#Remove "http://" from each value in the URL column
#df['URL'] = df['URL'].str.replace('http://', '')


In [13]:
# Guards the shared tokenizer while process_row runs on several threads.
_TOKENIZER_LOCK = threading.Lock()


# Function to process each row in parallel
def process_row(website, web_text):
    """Classify one website and return its predicted label index.

    Args:
        website: First text segment passed to the tokenizer; echoed back
            under the 'URL' key of the result.
        web_text: Second text segment passed to the tokenizer.

    Returns:
        dict: {'URL': website, 'Category': index of the largest logit}.

    Relies on the notebook-level `tokenizer`, `model` and `device`.
    """
    # This function is dispatched from a thread pool and every call passes
    # truncation/padding options to the one shared tokenizer.
    # NOTE(review): Rust-backed ("fast") tokenizers have been reported to
    # raise "Already borrowed" under exactly this pattern — confirm against
    # the installed version. Only the cheap tokenisation step is serialised;
    # the model forward pass below still runs concurrently.
    with _TOKENIZER_LOCK:
        inputs = tokenizer(website, web_text, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    # One row is scored per call, so the argmax over dim=1 holds one element
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).cpu().item()

    return {'URL': website, 'Category': predicted_label}

In [14]:
# Gather the two inputs process_row needs, one row per website
new_data = pd.DataFrame({'website': df['Domains'], 'Web_Text': df['Web_Text']})

# Number of threads
num_threads = 8  # You can adjust this based on your system and resources

# Classify the rows concurrently; executor.map returns results in input order
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    results = list(executor.map(process_row, new_data['website'], new_data['Web_Text']))

# Build result_df (columns URL and Category) directly from the list of dicts.
# Naming the columns keeps the frame well-formed when `results` is empty, so
# the category lookup below does not raise KeyError on an empty input.
result_df = pd.DataFrame(results, columns=['URL', 'Category'])

# Replace encoded values with original category using category_mapping.csv
category_mapping = pd.read_csv('category_mapping.csv')
result_df['Category'] = result_df['Category'].map(category_mapping.set_index('Encoded_Value')['Original_Category'])