# 1. Section Marker

In [None]:
import tempfile
import tarfile
import shutil
import os

In [None]:
# Directory (relative to the notebook) that holds the dataset archives and
# the folders they are extracted into.
BASE_DATA_DIR = "../data"

# Maps each archive file name inside BASE_DATA_DIR to the name of the folder
# its contents are extracted into.
datasets = {
    "20_news_dataset.tar.gz": "20_news",
    "multi_domain_sentiment_dataset.tar.gz": "multi_domain_sentiment"
}

In [None]:
# Create the data directory if it is missing; exist_ok avoids an error when it already exists.
os.makedirs(BASE_DATA_DIR, exist_ok = True)

In [None]:
# Extract every configured archive into its own folder under BASE_DATA_DIR.
# The archive is unpacked into a temporary directory first; top-level
# directories of the archive are then flattened by moving their children
# (and any top-level files) straight into the target folder.
for gz_file, folder_name in datasets.items():
    gz_path = os.path.join(BASE_DATA_DIR, gz_file)
    target_dir = os.path.join(BASE_DATA_DIR, folder_name)

    # A non-empty target folder is taken as "already extracted".
    if os.path.exists(target_dir) and os.listdir(target_dir):
        print(f"Folder {folder_name} already extracted")
        continue

    if not os.path.exists(gz_path):
        print(f"File not found: {gz_path}")
        continue

    print(f"Extracting {gz_file} into '{folder_name}'...")
    os.makedirs(target_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as tmp_dir:
        with tarfile.open(gz_path, "r:gz") as tar:
            tar.extractall(path=tmp_dir)

        for item in os.listdir(tmp_dir):
            src_path = os.path.join(tmp_dir, item)

            # Plain files at the archive root are moved as they are.
            if not os.path.isdir(src_path):
                shutil.move(src_path, target_dir)
                continue

            # Directories are unwrapped: only their children are moved.
            for sub_item in os.listdir(src_path):
                shutil.move(os.path.join(src_path, sub_item), target_dir)


# 2. Section Marker

In [None]:
import pandas as pd
import os

In [None]:
# Folders (under BASE_DATA_DIR) that the two loaders below read from.
NEWS_DIR = os.path.join(BASE_DATA_DIR, "20_news")
MULTIDOMAIN_DIR = os.path.join(BASE_DATA_DIR, "multi_domain_sentiment")

In [None]:
def load_20_news(base_dir):
    """Load a directory of per-category text files into a DataFrame.

    Every immediate subdirectory of ``base_dir`` is treated as one category
    and every regular file inside it as one document. Files are decoded as
    UTF-8, falling back to latin-1 when the bytes are not valid UTF-8, and
    surrounding whitespace is stripped.

    Args:
        base_dir: Path of the directory that contains the category folders.

    Returns:
        A ``pandas.DataFrame`` with the columns ``label`` (category folder
        name) and ``document`` (file text). The columns are present even when
        no document was found, and rows are ordered by category name and then
        by file name.
    """
    records = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded split made later) reproducible.
    for category in sorted(os.listdir(base_dir)):
        category_path = os.path.join(base_dir, category)
        if not os.path.isdir(category_path):
            continue

        for filename in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, filename)
            # Nested directories are not documents; skip them quietly.
            if not os.path.isfile(file_path):
                continue

            try:
                with open(file_path, "rb") as f:
                    raw = f.read()
            except OSError as e:
                # Best effort: report the unreadable file and carry on.
                print(f"Error reading {file_path}: {e}")
                continue

            try:
                text = raw.decode("utf-8").strip()
            except UnicodeDecodeError:
                # latin-1 maps every byte, so this fallback cannot fail.
                text = raw.decode("latin-1").strip()

            records.append({"label": category, "document": text})

    return pd.DataFrame(records, columns=["label", "document"])

In [None]:
def load_multidomain(base_dir):
    """Load line-oriented ``feature:count`` review files into a DataFrame.

    Every immediate subdirectory of ``base_dir`` is treated as a domain and
    every regular file inside it is read line by line. A line has the form
    ``tok:count tok:count ... #label#:<label>``; the part of each token
    before its first ``:`` is kept and the kept parts are joined with spaces.

    Args:
        base_dir: Path of the directory that contains the domain folders.

    Returns:
        A ``pandas.DataFrame`` with the columns ``document`` (space-joined
        tokens) and ``label`` (text after the last ``#label#:`` marker, or
        ``None`` when the line has no marker). The columns are present even
        when nothing was loaded, and rows follow sorted domain / file order.
    """
    label_marker = "#label#:"
    records = []

    # Sorted listings keep the row order reproducible across platforms.
    for domain in sorted(os.listdir(base_dir)):
        domain_path = os.path.join(base_dir, domain)
        if not os.path.isdir(domain_path):
            continue

        for filename in sorted(os.listdir(domain_path)):
            file_path = os.path.join(domain_path, filename)
            if not os.path.isfile(file_path):
                continue

            try:
                with open(file_path, "rb") as f:
                    raw = f.read()
            except OSError as e:
                # Best effort: report the unreadable file and carry on.
                print(f"Error reading {file_path}: {e}")
                continue

            try:
                content = raw.decode("utf-8").strip()
            except UnicodeDecodeError:
                content = raw.decode("latin-1").strip()

            for line in content.splitlines():
                line = line.strip()
                if not line:
                    continue

                # rpartition splits on the LAST marker only. A plain
                # two-way split() raised ValueError when the marker appeared
                # twice, which discarded the remainder of the file.
                text_part, marker, label_part = line.rpartition(label_marker)
                if marker:
                    label = label_part.strip()
                else:
                    text_part = line
                    label = None

                tokens = [tok.split(":")[0] for tok in text_part.split()]
                records.append({
                    "document": " ".join(tokens),
                    "label": label,
                })

    return pd.DataFrame(records, columns=["document", "label"])


In [None]:
# Read both extracted datasets from disk into DataFrames used by the rest of the notebook.
NEWS_DATAFRAME = load_20_news(NEWS_DIR)
MULTIDOMAIN_DATAFRAME = load_multidomain(MULTIDOMAIN_DIR)

In [None]:
NEWS_DATAFRAME

In [None]:
MULTIDOMAIN_DATAFRAME

# 3. Section Marker

In [None]:
import re

In [None]:
def clean_20_news(text):
    """Strip newsgroup boilerplate from a raw post and flatten its whitespace.

    Removes, in this order: common header lines, e-mail addresses, URLs,
    trailing dash separators, every run of underscores, and lines that start
    with ``>`` or ``:``. All remaining whitespace runs (including newlines)
    are then collapsed to a single space.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text on a single line, without leading/trailing spaces.
    """
    per_line = re.MULTILINE

    # (pattern, flags) pairs whose matches are deleted, applied top to bottom.
    removals = (
        # Common e-mail / news headers
        (r'^(From|Subject|Lines|Organization|Reply-To|NNTP-Posting-Host|Keywords|Summary):.*$', per_line),
        # E-mail addresses
        (r'\S+@\S+', 0),
        # URLs
        (r'http\S+|www\S+', 0),
        # Signature / separator dashes at the end of a line
        (r'--+\s*$', per_line),
        # Underscore runs
        (r'_+', 0),
        # Quoted lines (starting with > or :)
        (r'(^>.*$|^:.*$)', per_line),
    )
    for pattern, flags in removals:
        text = re.sub(pattern, '', text, flags=flags)

    # Squeeze blank lines first, then every whitespace run down to one space.
    text = re.sub(r'\n{2,}', '\n', text)
    flattened = re.sub(r'\s+', ' ', text)
    return flattened.strip()


In [None]:
def clean_multidomain(text):
    """Normalise a multi-domain review string.

    Underscores become spaces, the literal ``<num>`` placeholder is dropped,
    and whitespace runs are collapsed to a single space.

    Args:
        text: Space-joined token string.

    Returns:
        The cleaned text without leading/trailing or repeated whitespace.
    """
    # Replace underscores with spaces
    text = text.replace("_", " ")

    # Remove special tokens like <num>
    text = re.sub(r"<num>", "", text)

    # Remove multiple spaces. This step was announced by a comment but never
    # implemented, so dropping <num> used to leave double spaces behind.
    text = re.sub(r"\s+", " ", text)

    return text.strip()

In [None]:
# Store the cleaned text next to the raw text in a new "document_clean" column.
NEWS_DATAFRAME["document_clean"] = NEWS_DATAFRAME["document"].apply(clean_20_news)
MULTIDOMAIN_DATAFRAME["document_clean"] = MULTIDOMAIN_DATAFRAME["document"].apply(clean_multidomain)

In [None]:
NEWS_DATAFRAME

In [None]:
MULTIDOMAIN_DATAFRAME

# 4. Section Marker

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit an encoder on the news "label" column so category names can be turned into integer ids.
label_encoder_news = LabelEncoder()
label_encoder_news.fit(NEWS_DATAFRAME["label"])

In [None]:
# Integer-encoded labels; this "label_id" column is what split_dataframe reads.
NEWS_DATAFRAME["label_id"] = label_encoder_news.transform(NEWS_DATAFRAME["label"])

In [None]:
# Fit a separate encoder on the multi-domain "label" column.
# NOTE(review): load_multidomain stores label=None for lines without a
# "#label#:" marker — confirm no such rows exist before fitting.
label_encoder_multidomain = LabelEncoder()
label_encoder_multidomain.fit(MULTIDOMAIN_DATAFRAME["label"])

In [None]:
# Integer-encoded labels; this "label_id" column is what split_dataframe reads.
MULTIDOMAIN_DATAFRAME["label_id"] = label_encoder_multidomain.transform(MULTIDOMAIN_DATAFRAME["label"])

# 5. Section Marker

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Seed passed as random_state to both train_test_split calls in split_dataframe.
RANDOM_SEED = 42

In [None]:
def split_dataframe(dataframe):
    """Split a prepared frame into stratified 60/10/30 train/val/test frames.

    Reads the ``document_clean`` and ``label_id`` columns, performs two
    stratified splits seeded with ``RANDOM_SEED`` and prints the size of each
    resulting partition.

    Args:
        dataframe: Frame holding ``document_clean`` and ``label_id`` columns.

    Returns:
        ``(train_df, val_df, test_df)``, each with the columns ``text`` and
        ``label`` and the row index of the source frame.
    """

    def _to_frame(texts, labels):
        # Pair a text Series with its labels in a two-column frame.
        frame = texts.to_frame("text")
        frame["label"] = labels.values
        return frame

    features = dataframe["document_clean"]
    targets = dataframe["label_id"]

    # First cut: 60% training, 40% held back for the second cut.
    X_train, X_rest, y_train, y_rest = train_test_split(
        features,
        targets,
        test_size=0.4,
        random_state=RANDOM_SEED,
        stratify=targets,
    )

    # Second cut of the held-back 40%: one quarter (10% overall) becomes
    # validation, three quarters (30% overall) become test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest,
        y_rest,
        test_size=0.75,
        random_state=RANDOM_SEED,
        stratify=y_rest,
    )

    train_df = _to_frame(X_train, y_train)
    val_df = _to_frame(X_val, y_val)
    test_df = _to_frame(X_test, y_test)

    print(f"Train size: {len(train_df)}")
    print(f"Val size:   {len(val_df)}")
    print(f"Test size:  {len(test_df)}")

    return train_df, val_df, test_df

In [None]:
# Train / validation / test frames for the 20 Newsgroups data.
train_df_news, val_df_news, test_df_news = split_dataframe(NEWS_DATAFRAME)

In [None]:
# Train / validation / test frames for the multi-domain sentiment data.
train_df_multidomain, val_df_multidomain, test_df_multidomain = split_dataframe(MULTIDOMAIN_DATAFRAME)

# 6. Section Marker

In [None]:
from transformers import AutoTokenizer

In [None]:
# Tokenizer for the "bert-base-uncased" checkpoint; shared by every CustomDataset built below.
bert_base_uncased_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case = True)

# 7. Section Marker

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes a whole text frame once, up front.

    Args:
        data: Frame with a ``text`` column (strings) and a ``label`` column
            (integer class ids).
        tokenizer: Callable invoked as ``tokenizer(texts, padding=...,
            truncation=..., max_length=..., return_tensors='pt')`` that
            returns a mapping from field name to a tensor whose first
            dimension indexes the samples.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 256.
    """

    def __init__(self, data, tokenizer, max_length=256):
        self.embeddings = tokenizer(
            data["text"].values.tolist(),
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )
        self.labels = torch.tensor(data['label'].values).long()

    def __getitem__(self, idx):
        """Return every tokenizer field for sample ``idx`` plus its ``labels``."""
        # Forward whatever fields the tokenizer produced instead of a fixed
        # key list: not every tokenizer emits "token_type_ids", and indexing
        # a missing key raised KeyError.
        item = {name: values[idx] for name, values in self.embeddings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

In [None]:
# Tokenize each 20 Newsgroups split once and wrap it in a DataLoader.
train_news_dataset = CustomDataset(train_df_news, bert_base_uncased_tokenizer)
val_news_dataset = CustomDataset(val_df_news, bert_base_uncased_tokenizer)
test_news_dataset = CustomDataset(test_df_news, bert_base_uncased_tokenizer)

# Only the training loader is shuffled. Validation and test metrics do not
# depend on sample order, so a fixed order keeps evaluation runs repeatable.
train_news_loader = DataLoader(train_news_dataset, batch_size = 32, shuffle = True)
val_news_loader = DataLoader(val_news_dataset, batch_size = 32, shuffle = False)
test_news_loader = DataLoader(test_news_dataset, batch_size = 32, shuffle = False)

In [None]:
# Tokenize each multi-domain split once and wrap it in a DataLoader.
train_multidomain_dataset = CustomDataset(train_df_multidomain, bert_base_uncased_tokenizer)
val_multidomain_dataset = CustomDataset(val_df_multidomain, bert_base_uncased_tokenizer)
test_multidomain_dataset = CustomDataset(test_df_multidomain, bert_base_uncased_tokenizer)

# Only the training loader is shuffled. Validation and test metrics do not
# depend on sample order, so a fixed order keeps evaluation runs repeatable.
train_multidomain_loader = DataLoader(train_multidomain_dataset, batch_size = 32, shuffle = True)
val_multidomain_loader = DataLoader(val_multidomain_dataset, batch_size = 32, shuffle = False)
test_multidomain_loader = DataLoader(test_multidomain_dataset, batch_size = 32, shuffle = False)

# 9. Section Marker

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoModelForSequenceClassification
from tqdm import tqdm

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# train() and evaluate() below read this module-level name.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
def get_accuracy(y, output, size):
    """Return the number of correct predictions divided by ``size``.

    Args:
        y: Tensor of true class ids (flattened before comparison).
        output: Tensor of class scores; the predicted class is the argmax
            over the last dimension.
        size: Denominator. The callers pass the full dataset length, so
            summing the per-batch results gives the epoch accuracy.

    Returns:
        ``correct / size`` as a Python float.
    """
    predicted = output.argmax(dim=-1).reshape(-1)
    expected = y.reshape(-1)
    correct = (expected == predicted).sum().item()
    return correct / size

In [None]:
def train(dataloader, model, loss_function, optimizer):
    """Run one training epoch and report its mean loss and accuracy.

    Each batch is moved to the module-level ``device``, passed to the model
    as keyword arguments, and the loss is computed from ``output.logits``
    and the batch's ``labels`` entry.

    Args:
        dataloader: Loader yielding dict batches that include ``labels``.
        model: Model whose output exposes ``.logits``.
        loss_function: Callable ``(logits, labels) -> loss tensor``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        ``(loss, accuracy)``: loss averaged over batches, and accuracy over
        ``len(dataloader.dataset)`` samples.
    """
    total_loss = 0
    epoch_accuracy = 0
    steps = 0

    model.train()

    for steps, batch in enumerate(tqdm(dataloader), start=1):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        output = model(**batch)

        optimizer.zero_grad()
        step_loss = loss_function(output.logits, batch["labels"])
        step_loss.backward()
        optimizer.step()

        total_loss += step_loss.item()
        # Each batch contributes its share of the whole-dataset accuracy.
        epoch_accuracy += get_accuracy(
            batch["labels"], output.logits.detach(), len(dataloader.dataset)
        )

    mean_loss = total_loss / steps
    print(f"Loss: {mean_loss}, Accuracy: {epoch_accuracy}")

    return mean_loss, epoch_accuracy


In [None]:
def evaluate(dataloader, model, loss_function):
    """Compute mean loss and accuracy over a loader without updating weights.

    The model is put in eval mode and run under ``torch.no_grad()``. Batches
    are moved to the module-level ``device`` first.

    Args:
        dataloader: Loader yielding dict batches that include ``labels``.
        model: Model whose output exposes ``.logits``.
        loss_function: Callable ``(logits, labels) -> loss tensor``.

    Returns:
        ``(loss, accuracy)``: loss averaged over batches, and accuracy over
        ``len(dataloader.dataset)`` samples.
    """
    total_loss = 0
    epoch_accuracy = 0
    steps = 0

    model.eval()

    with torch.no_grad():
        for steps, batch in enumerate(tqdm(dataloader), start=1):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            output = model(**batch)
            total_loss += loss_function(output.logits, batch["labels"]).item()
            # Each batch contributes its share of the whole-dataset accuracy.
            epoch_accuracy += get_accuracy(
                batch["labels"], output.logits.detach(), len(dataloader.dataset)
            )

    mean_loss = total_loss / steps

    print(f"Loss: {mean_loss}, Accuracy: {epoch_accuracy}")

    return mean_loss, epoch_accuracy

In [None]:
def evaluate_model(model, dataloader, device, print_name, checkpoint_path = None, num_labels = None, model_name = None):
    """Evaluate a classifier on a loader and print its metrics.

    Args:
        model: Model to evaluate. May be ``None`` when ``checkpoint_path``,
            ``model_name`` and ``num_labels`` are all given, in which case a
            fresh model is instantiated.
        dataloader: Loader yielding dict batches that include ``labels``.
        device: Device the model and the batches are moved to.
        print_name: Label inserted into the printed results header.
        checkpoint_path: Optional path of a saved ``state_dict``. When
            ``None`` the model is evaluated with its current weights.
        num_labels: Number of classes for a freshly instantiated model.
        model_name: Pretrained model name for a freshly instantiated model.

    Raises:
        ValueError: If no model is passed and none can be built from
            ``checkpoint_path``, ``model_name`` and ``num_labels``.
    """
    if checkpoint_path is not None and model_name and num_labels:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = num_labels)

    if model is None:
        raise ValueError(
            "evaluate_model needs either a model instance or "
            "checkpoint_path together with model_name and num_labels"
        )

    # Loading used to sit outside this guard, so calling the function
    # without a checkpoint ran torch.load(None) and failed.
    if checkpoint_path is not None:
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        state_dict = torch.load(checkpoint_path, map_location=device)
        # Drop the "_orig_mod." marker from the keys when present so they
        # match the parameter names of an unwrapped model.
        if any(k.startswith("_orig_mod.") for k in state_dict.keys()):
            state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    model.to(device)
    model.eval()

    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc = "Evaluating model"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim = -1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    precision_micro = precision_score(all_labels, all_preds, average = 'micro')
    recall_micro = recall_score(all_labels, all_preds, average = 'micro')
    f1_micro = f1_score(all_labels, all_preds, average = 'micro')

    precision_macro = precision_score(all_labels, all_preds, average = 'macro')
    recall_macro = recall_score(all_labels, all_preds, average = 'macro')
    f1_macro = f1_score(all_labels, all_preds, average = 'macro')

    report = classification_report(all_labels, all_preds, digits = 4)

    print(f"\nFinal results {print_name} in test")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision (micro): {precision_micro:.4f}")
    print(f"Recall (micro): {recall_micro:.4f}")
    print(f"F1 (micro): {f1_micro:.4f}")
    print(f"Precision (macro): {precision_macro:.4f}")
    print(f"Recall (macro): {recall_macro:.4f}")
    print(f"F1 (macro): {f1_macro:.4f}")
    print("\nClassification Report:")
    print(report)

# 10. Section Marker

In [None]:
from torch.optim import Adam
import torch.nn as nn
import torch

In [None]:
# "bert-base-uncased" with a 20-class classification head, moved to the selected device.
model_20_news = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 20).to(device)

In [None]:
model_20_news

In [None]:
# Wrap the model with torch.compile using the "eager" backend.
# NOTE(review): evaluate_model strips "_orig_mod." from checkpoint keys,
# presumably the prefix this wrapper adds to state_dict keys — confirm.
model_20_news = torch.compile(model_20_news.to(device), backend = "eager")
loss_function = nn.CrossEntropyLoss()
optimizer = Adam(model_20_news.parameters(), lr = 1e-5)
# The model was already moved to `device` before compiling on the first line of this cell.
model_20_news = model_20_news.to(device)

In [None]:
num_epochs = 10
best_val_acc = 0.0

# torch.save does not create missing parent directories. Create the
# checkpoint folder up front so a missing "../models" cannot fail the run
# after a full epoch has already been trained.
os.makedirs("../models", exist_ok = True)

for epoch in range(num_epochs):
    print(f"\n  Epoch {epoch+1}/{num_epochs}")

    train_loss, train_acc = train(train_news_loader, model_20_news, loss_function, optimizer)
    val_loss, val_acc = evaluate(val_news_loader, model_20_news, loss_function)

    # Keep only the weights of the epoch with the best validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model_20_news.state_dict(), "../models/best_bert_base_uncased_model_news.pt")
        print("Model saved (best acc in validation so far)")


In [None]:
# Rebuild the classifier from the pretrained name, load the best saved
# checkpoint into it and print the metrics on the news test loader.
evaluate_model(model = None, 
               dataloader = test_news_loader, 
               device = device, 
               checkpoint_path = "../models/best_bert_base_uncased_model_news.pt", 
               num_labels = 20, 
               model_name = "bert-base-uncased", 
               print_name = "BERT BASE UNCASED (20 NEWS)")

# 11. Section Marker

In [None]:
from torch.optim import Adam
import torch.nn as nn
import torch

In [None]:
# "bert-base-uncased" with a 2-class classification head, moved to the selected device.
model_multidomain = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2).to(device)

In [None]:
model_multidomain

In [None]:
# Wrap the model with torch.compile using the "eager" backend.
# NOTE(review): evaluate_model strips "_orig_mod." from checkpoint keys,
# presumably the prefix this wrapper adds to state_dict keys — confirm.
model_multidomain = torch.compile(model_multidomain.to(device), backend = "eager")
# loss_function and optimizer are rebound here for the multi-domain model.
loss_function = nn.CrossEntropyLoss()
optimizer = Adam(model_multidomain.parameters(), lr = 1e-5)
# The model was already moved to `device` before compiling on the first line of this cell.
model_multidomain = model_multidomain.to(device)

In [None]:
num_epochs = 10
best_val_acc = 0.0

# torch.save does not create missing parent directories. Create the
# checkpoint folder up front so a missing "../models" cannot fail the run
# after a full epoch has already been trained.
os.makedirs("../models", exist_ok = True)

for epoch in range(num_epochs):
    print(f"\n  Epoch {epoch+1}/{num_epochs}")

    train_loss, train_acc = train(train_multidomain_loader, model_multidomain, loss_function, optimizer)
    val_loss, val_acc = evaluate(val_multidomain_loader, model_multidomain, loss_function)

    # Keep only the weights of the epoch with the best validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model_multidomain.state_dict(), "../models/best_bert_base_uncased_model_multidomain.pt")
        print("Model saved (best acc in validation so far)")


In [None]:
# Rebuild the classifier from the pretrained name, load the best saved
# checkpoint into it and print the metrics on the multi-domain test loader.
evaluate_model(model = None, 
               dataloader = test_multidomain_loader, 
               device = device, 
               checkpoint_path = "../models/best_bert_base_uncased_model_multidomain.pt", 
               num_labels = 2, 
               model_name = "bert-base-uncased", 
               print_name = "BERT BASE UNCASED (MULTIDOMAIN)")