In [1]:
import os

import numpy
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import RobertaForSequenceClassification, RobertaTokenizer

## 1. Load Dataset

In [2]:
# torch.cuda.is_available is a function and has to be called: the bare function
# object is always truthy, so without the parentheses this check would report a
# GPU on every machine.
if torch.cuda.is_available():
  print('GPU available')
else:
  print('Please use CPU.')

GPU available


In [5]:
# Path of the CSV file that holds the website dataset.
web_csv_path = "/content/drive/MyDrive/Web_Final.csv"
df = pd.read_csv(filepath_or_buffer=web_csv_path)

In [6]:
# Display the loaded frame (the notebook renders a truncated preview).
df

Unnamed: 0,Website,Category,Web_Text
0,http://welt.sn2world.com,Lifestyle,welt deutsches und informationsportaltmwatch s...
1,http://promistarsnews.com,News,home christina luft luca hänni ein baby bald w...
2,http://promi-newsheute.com,Entertainment,home lillet winter thyme das ist der perfekte ...
3,http://promiheute.com,Entertainment,home bossa technology führt den neuen trend de...
4,http://promivipnews.com,News,home bossa technology führt den neuen trend de...
...,...,...,...
30068,http://snegohod-rybinsk.ru,Sports,
30069,http://ikpik.ru,Other categories,интересные факты нас вы найдёте множество инте...
30070,http://lg-optimus.net,Technology,
30071,http://celebcenter.ru,Entertainment,


In [7]:
# Clean 'Web_Text' in a single pass: missing values (NaN) become empty strings
# and every other entry is cut down to its first MAX_WORDS whitespace-separated
# words. Joining with single spaces also collapses runs of whitespace.
MAX_WORDS = 25


def _first_words(text, limit=MAX_WORDS):
    """Return the first `limit` whitespace-separated words of `text`.

    Missing values (NaN/None) yield an empty string; any other value is
    converted with `str()` before being split.
    """
    if pd.isna(text):
        return ''
    return ' '.join(str(text).split()[:limit])


df['Web_Text'] = df['Web_Text'].apply(_first_words)

In [8]:
# Replace the textual category names with the integer ids produced by LabelEncoder
label_encoder = LabelEncoder()
encoded_categories = label_encoder.fit_transform(df['Category'])
df['Category'] = encoded_categories

# Look-up table from each original category name to the id assigned to it
class_names = label_encoder.classes_
category_mapping = dict(zip(class_names, label_encoder.transform(class_names)))

In [9]:
import csv

# Rebuild the name -> id mapping from the fitted encoder, so this cell only
# needs `label_encoder` and not the dictionary created in an earlier cell.
category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Specify the filename for the CSV file
csv_filename = 'category_mapping.csv'

# newline='' leaves line-ending handling to the csv module; the explicit UTF-8
# encoding keeps non-ASCII category names readable regardless of the platform's
# default encoding.
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Original_Category', 'Encoded_Value']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Header row first, then one row per category
    writer.writeheader()
    writer.writerows(
        {'Original_Category': original_category, 'Encoded_Value': encoded_value}
        for original_category, encoded_value in category_mapping.items()
    )

print(f"Category mapping has been saved to {csv_filename}")

Category mapping has been saved to category_mapping.csv


In [10]:
# Preview the first rows; 'Category' now holds the integer ids written by the encoding cell.
df.head()

Unnamed: 0,Website,Category,Web_Text
0,http://welt.sn2world.com,95,welt deutsches und informationsportaltmwatch s...
1,http://promistarsnews.com,113,home christina luft luca hänni ein baby bald w...
2,http://promi-newsheute.com,54,home lillet winter thyme das ist der perfekte ...
3,http://promiheute.com,54,home bossa technology führt den neuen trend de...
4,http://promivipnews.com,113,home bossa technology führt den neuen trend de...


In [11]:
# Dataset wrapper that serves (website, web_text, label) triples from a DataFrame
class CustomDataset(Dataset):
    """Map-style dataset over a DataFrame with 'Website', 'Web_Text' and 'Category' columns."""

    def __init__(self, dataframe):
        """Keep a reference to `dataframe`; rows are read on demand in `__getitem__`."""
        self.data = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(website, web_text, label)` for the row at position `idx`.

        `website` and `web_text` are converted with `str()` and `label` with
        `int()`. The lookup is positional (`iloc`), so the DataFrame's index
        labels do not matter.
        """
        # Fetch the row once instead of rebuilding it for every column.
        row = self.data.iloc[idx]
        return str(row['Website']), str(row['Web_Text']), int(row['Category'])

## 2. Choosing a Model

In [None]:
# AdamW is deliberately not imported from transformers: nothing in this cell uses
# it, the optimizer cell relies on torch.optim.AdamW, and the transformers copy is
# deprecated (and missing from recent releases), which would break this import.
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load pre-trained model and tokenizer (DistilBERT).
# The classification head gets one output per category known to the label encoder.
distilbert_checkpoint = "distilbert-base-multilingual-cased"
model = DistilBertForSequenceClassification.from_pretrained(distilbert_checkpoint, num_labels=len(label_encoder.classes_))
tokenizer = DistilBertTokenizer.from_pretrained(distilbert_checkpoint)

In [25]:
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, GPT2Config
import torch
from tqdm import tqdm

# Load GPT-2 tokenizer and model for sequence classification.
# num_labels must match the number of encoded categories; without it the
# classification head is created with the library default number of outputs,
# which does not cover the label ids produced by the label encoder.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Set up training parameters: use the GPU when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Add a new pad token so that batches can be padded to a common length
num_added_tokens = tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# A newly added token needs a row in the model's embedding matrix, and the
# model has to be told which id is padding (sequence classification with
# batched, padded inputs relies on config.pad_token_id).
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Number of tokens that were added to the vocabulary (shown as the cell output)
num_added_tokens

1

## 3. Model Training

In [12]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split repeatable
split_options = {"test_size": 0.1, "random_state": 42}
train_df, test_df = train_test_split(df, **split_options)

In [13]:
# Wrap both splits in CustomDataset and build the batch iterators.
# Only the training batches are shuffled; the test order stays fixed.
train_dataset, test_dataset = CustomDataset(train_df), CustomDataset(test_df)

loader_batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)


In [19]:
# Training setup: pick the GPU when one is present, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer: AdamW over all model parameters.
# (Alternative kept for reference: SGD with lr=2e-3, momentum=0.9, weight_decay=1e-5.)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

In [None]:
# Training loop: fine-tune for `num_epochs` epochs. After each epoch the model is
# evaluated on the test split, and a checkpoint is written every
# `checkpoint_interval` epochs.
num_epochs = 10
checkpoint_interval = 1

for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
        websites, web_texts, labels = batch
        # Each example is encoded as a (website, page text) sequence pair.
        inputs = tokenizer(list(websites), list(web_texts), return_tensors="pt", truncation=True, max_length=512, padding=True)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Drop the references so the tensors can be reclaimed before the next step
        del outputs, loss

    # Release cached GPU memory once per phase. Doing this after every batch
    # only forces the allocator to free and re-request the same blocks.
    torch.cuda.empty_cache()

    # ---- Evaluation phase ----
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Evaluating'):
            websites, web_texts, labels = batch
            inputs = tokenizer(list(websites), list(web_texts), return_tensors="pt", truncation=True, max_length=512, padding=True)
            inputs = {key: val.to(device) for key, val in inputs.items()}

            # Labels are not passed to the model: only the logits are needed
            # here, so no loss is computed during evaluation.
            logits = model(**inputs).logits
            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(predicted_labels)
            true_labels.extend(labels.numpy())

            del logits

    torch.cuda.empty_cache()

    # Save the model checkpoint
    if (epoch + 1) % checkpoint_interval == 0:
        model.save_pretrained(f'fine_tuned_model_epoch{epoch + 1}')
        tokenizer.save_pretrained(f'fine_tuned_model_epoch{epoch + 1}')

    # Calculate accuracy (fraction of correctly classified test rows)
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy}")


## 4. Resuming Training from a Particular Epoch

In [None]:
# Specify the epoch you want to start from.
# The loop below runs the epochs start_epoch + 1 ... num_epochs.
start_epoch = 5
num_epochs = 7
checkpoint_interval = 1

# Directory of the checkpoint that belongs to `start_epoch`
model_path = f'fine_tuned_bert_model_epoch{start_epoch}'

# Restore the checkpoint weights when the directory exists. The weights are
# copied into the existing model object, so `optimizer` keeps pointing at the
# same parameters. save_pretrained() stores no optimizer state, so the optimizer
# continues with whatever state it currently holds.
if os.path.isdir(model_path):
    checkpoint_model = type(model).from_pretrained(model_path)
    model.load_state_dict(checkpoint_model.state_dict())
    del checkpoint_model
else:
    print(f"Checkpoint directory '{model_path}' not found; continuing with the model currently in memory.")

# Move model to the device
model.to(device)

# Training loop
for epoch in range(start_epoch, num_epochs):
    # ---- Training phase ----
    model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
        websites, web_texts, labels = batch
        # Each example is encoded as a (website, page text) sequence pair.
        inputs = tokenizer(list(websites), list(web_texts), return_tensors="pt", truncation=True, max_length=512, padding=True)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Drop the references so the tensors can be reclaimed before the next step
        del outputs, loss

    # Release cached GPU memory once per phase rather than after every batch
    torch.cuda.empty_cache()

    # ---- Evaluation phase ----
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Evaluating'):
            websites, web_texts, labels = batch
            # max_length=512 matches the tokenization used during training
            inputs = tokenizer(list(websites), list(web_texts), return_tensors="pt", truncation=True, max_length=512, padding=True)
            inputs = {key: val.to(device) for key, val in inputs.items()}

            # Only the logits are needed, so the labels are not passed to the model
            logits = model(**inputs).logits
            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(predicted_labels)
            true_labels.extend(labels.numpy())

            del logits

    torch.cuda.empty_cache()

    # Save the model checkpoint
    if (epoch + 1) % checkpoint_interval == 0:
        model.save_pretrained(f'fine_tuned_bert_model_epoch{epoch + 1}')
        tokenizer.save_pretrained(f'fine_tuned_bert_model_epoch{epoch + 1}')

    # Calculate accuracy and report it as a percentage with two decimals
    accuracy = accuracy_score(true_labels, predictions)
    accuracy = round(accuracy*100,2)
    print(f"Accuracy: {accuracy}")

## 5. Saving Model

In [None]:
# Store the weights and the pickled module under different names: with a shared
# path the second save overwrote the first, so the state dict was lost.
torch.save(model.state_dict(), "/content/bert-final-state-dict.pt")
torch.save(model, "/content/bert-final")

In [None]:
# Bundle an epoch marker with the current model and optimizer state in one file
checkpoint = {
    'epoch': 6,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
torch.save(checkpoint, "/content/bert-6")

In [None]:
# Build the directory name of the checkpoint that belongs to `start_epoch`.
# NOTE(review): nothing is loaded in this cell; only the path string is assigned.
model_path = f'fine_tuned_bert_model_epoch{start_epoch}'

# Move the in-memory model to the selected device
model.to(device)

In [None]:
# Write the final fine-tuned model and its tokenizer into the 'final_model' directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('final_model')

# Store the label encoder's class names next to the model.
# NOTE(review): ndarray.dump writes a pickle of the array, so the file is not in
# the .npy format despite its extension.
label_encoder.classes_.dump('label_encoder_classes.npy')

In [None]:
# Reload the DistilBERT tokenizer and classifier that were saved under 'final_model'
model_path = "final_model"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Place the reloaded model on the active device
model.to(device)

## 6. Testing the Model

In [None]:
# Sample page text for a single prediction; the inference cell pairs it with the site 'birzietis.lt'.
text = '''
biržiečių žodispagrindinė navigacija biržietis naujausios šiaulių krašta rekomenduojama aktualijos komentarai teisėtvarka kultūra sveikata gyvenimo spalvos sportas susitikimai nuorodos pereiti
'''

In [None]:
# Single-example frame with columns 'website' and 'Web_Text'. It gets its own
# name so that the training DataFrame `df` is not overwritten by this cell.
sample_df = pd.DataFrame({'website': ['birzietis.lt'], 'Web_Text': [text]})

# Tokenize the (website, page text) pair the same way as during training,
# including the 512-token limit.
inputs = tokenizer(sample_df['website'].tolist(), sample_df['Web_Text'].tolist(), return_tensors="pt", truncation=True, max_length=512, padding=True)
inputs = {key: val.to(device) for key, val in inputs.items()}

# Perform inference without tracking gradients
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# The predicted class is the index of the largest logit in each row
logits = outputs.logits
predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()

# Print predicted labels (encoded category ids)
print("Predicted Labels:", predicted_labels)

Predicted Labels: [41]
