# 1. Section Marker

In [1]:
import tempfile
import tarfile
import shutil
import os

In [2]:
BASE_DATA_DIR = "../data"

datasets = {
    "20_news_dataset.tar.gz": "20_news",
    "multi_domain_sentiment_dataset.tar.gz": "multi_domain_sentiment"
}

In [3]:
os.makedirs(BASE_DATA_DIR, exist_ok = True)

In [4]:
for gz_file, folder_name in datasets.items():
	gz_path = os.path.join(BASE_DATA_DIR, gz_file)
	target_dir = os.path.join(BASE_DATA_DIR, folder_name)

	if os.path.exists(target_dir) and len(os.listdir(target_dir)) > 0:
		print(f"Folder {folder_name} already extracted")
		continue

	if os.path.exists(gz_path):
		print(f"Extracting {gz_file} into '{folder_name}'...")
		os.makedirs(target_dir, exist_ok=True)

		with tempfile.TemporaryDirectory() as tmp_dir:
			with tarfile.open(gz_path, "r:gz") as tar:
				tar.extractall(path = tmp_dir)

			for item in os.listdir(tmp_dir):
				src_path = os.path.join(tmp_dir, item)
				if os.path.isdir(src_path):
					for sub_item in os.listdir(src_path):
						shutil.move(os.path.join(src_path, sub_item), target_dir)
				else:
					shutil.move(src_path, target_dir)
	else:
		print(f"File not found: {gz_path}")


Folder 20_news already extracted
Folder multi_domain_sentiment already extracted


# 2. Section Marker

In [5]:
import pandas as pd
import os

In [6]:
NEWS_DIR = os.path.join(BASE_DATA_DIR, "20_news")
MULTIDOMAIN_DIR = os.path.join(BASE_DATA_DIR, "multi_domain_sentiment")

In [7]:
def load_20_news(base_dir):
	data = []

	for category in os.listdir(base_dir):
		category_path = os.path.join(base_dir, category)
		if not os.path.isdir(category_path):
			continue
		
		for filename in os.listdir(category_path):
			file_path = os.path.join(category_path, filename)
			try:
				with open(file_path, "rb") as f:
					raw = f.read()
				try:
					text = raw.decode("utf-8").strip()
				except UnicodeDecodeError:
					text = raw.decode("latin-1").strip()

				data.append({
					"label": category, 
					"document": text
				})

			except Exception as e:
				print(f"Error reading {file_path}: {e}")
	
	return pd.DataFrame(data)

In [8]:
def load_multidomain(base_dir):
	data = []

	for domain in os.listdir(base_dir):
		domain_path = os.path.join(base_dir, domain)
		if not os.path.isdir(domain_path):
			continue
	
		for filename in os.listdir(domain_path):
			file_path = os.path.join(domain_path, filename)
			try:
				with open(file_path, "rb") as f:
					raw = f.read()
				try:
					content = raw.decode("utf-8").strip()
				except UnicodeDecodeError:
					content = raw.decode("latin-1").strip()

				for line in content.splitlines():
					line = line.strip()
					if not line:
						continue

					if "#label#:" in line:
						text_part, label_part = line.split("#label#:")
						label = label_part.strip()
					else:
						text_part = line
						label = None

					tokens = [tok.split(":")[0] for tok in text_part.split()]
					text = " ".join(tokens)

					data.append({
						"document": text,
						"label": label,
					})

			except Exception as e:
				print(f"Error reading {file_path}: {e}")

	return pd.DataFrame(data)


In [9]:
NEWS_DATAFRAME = load_20_news(NEWS_DIR)
MULTIDOMAIN_DATAFRAME = load_multidomain(MULTIDOMAIN_DIR)

In [10]:
NEWS_DATAFRAME

Unnamed: 0,label,document
0,sci.crypt,From: tcmay@netcom.com (Timothy C. May)\nSubje...
1,sci.crypt,"From: ""Jon \\lnes"" <jon@ifi.uio.no>\nSubject: ..."
2,sci.crypt,From: hooper@ccs.QueensU.CA (Andy Hooper)\nSub...
3,sci.crypt,From: warlord@MIT.EDU (Derek Atkins)\nSubject:...
4,sci.crypt,From: pmetzger@snark.shearson.com (Perry E. Me...
...,...,...
18823,comp.windows.x,From: dev@hollywood.acsc.com ()\nSubject: Circ...
18824,comp.windows.x,From: jra@wti.com (Jim Atkinson)\nSubject: How...
18825,comp.windows.x,From: dealy@narya.gsfc.nasa.gov (Brian Dealy -...
18826,comp.windows.x,From: vinod@sommerfeld.WPI.EDU (Vinod K Nair)\...


In [11]:
MULTIDOMAIN_DATAFRAME

Unnamed: 0,document,label
0,i movie_could movies_i in_only minutes_and bor...,negative
1,your by_disney many_drug can't_even classic_ru...,negative
2,old complicated fun_to moves breaking we_tried...,negative
3,enjoy_what find_that add_some and_when add sum...,negative
4,holes movie_however shooting_fish not_sure lat...,negative
...,...,...
27672,akin go barely mornings_because annoyed i_like...,positive
27673,vibrant anyone well a_rather overheated relate...,positive
27674,very_hefty you'd put_it for_chicken cut when_y...,positive
27675,am_very flatware_set needless_to &amp;quot;non...,positive


# 3. Section marker

In [12]:
import re

In [13]:
def clean_20_news(text):
    # Remove common email headers
    text = re.sub(r'^(From|Subject|Lines|Organization|Reply-To|NNTP-Posting-Host|Keywords|Summary):.*$', '', text, flags = re.MULTILINE)

    # Remove email addresses and URLs
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove lines of signatures or separators
    text = re.sub(r'--+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'_+', '', text)

    # Remove quoted lines (beginning with > or :)
    text = re.sub(r'(^>.*$|^:.*$)', '', text, flags=re.MULTILINE)

    # Collapse multiple newlines and spaces
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


In [14]:
def clean_multidomain(text):
    # Replace underscores with spaces
    text = text.replace("_", " ")

    # Remove special tokens like <num>
    text = re.sub(r"<num>", "", text)

    # Remove multiple spaces

    return text.strip()

In [15]:
NEWS_DATAFRAME["document_clean"] = NEWS_DATAFRAME["document"].apply(clean_20_news)
MULTIDOMAIN_DATAFRAME["document_clean"] = MULTIDOMAIN_DATAFRAME["document"].apply(clean_multidomain)

In [16]:
NEWS_DATAFRAME

Unnamed: 0,label,document,document_clean
0,sci.crypt,From: tcmay@netcom.com (Timothy C. May)\nSubje...,David Sternlight wrote: ...cascades elided to ...
1,sci.crypt,"From: ""Jon \\lnes"" <jon@ifi.uio.no>\nSubject: ...",acceptance of the wiretap chip) In article (Da...
2,sci.crypt,From: hooper@ccs.QueensU.CA (Andy Hooper)\nSub...,Isn't Clipper a trademark of Fairchild Semicon...
3,sci.crypt,From: warlord@MIT.EDU (Derek Atkins)\nSubject:...,-----BEGIN PGP SIGNED MESSAGE I find this a ve...
4,sci.crypt,From: pmetzger@snark.shearson.com (Perry E. Me...,(Stephen R. Tate) writes: Even if they somehow...
...,...,...,...
18823,comp.windows.x,From: dev@hollywood.acsc.com ()\nSubject: Circ...,Will there be any support for round or circula...
18824,comp.windows.x,From: jra@wti.com (Jim Atkinson)\nSubject: How...,I am trying to find out if my application is r...
18825,comp.windows.x,From: dealy@narya.gsfc.nasa.gov (Brian Dealy -...,The Only directory I know of that lists commer...
18826,comp.windows.x,From: vinod@sommerfeld.WPI.EDU (Vinod K Nair)\...,"Hello, I am writing a program which forks of a..."


In [17]:
MULTIDOMAIN_DATAFRAME

Unnamed: 0,document,label,document_clean
0,i movie_could movies_i in_only minutes_and bor...,negative,i movie could movies i in only minutes and bor...
1,your by_disney many_drug can't_even classic_ru...,negative,your by disney many drug can't even classic ru...
2,old complicated fun_to moves breaking we_tried...,negative,old complicated fun to moves breaking we tried...
3,enjoy_what find_that add_some and_when add sum...,negative,enjoy what find that add some and when add sum...
4,holes movie_however shooting_fish not_sure lat...,negative,holes movie however shooting fish not sure lat...
...,...,...,...
27672,akin go barely mornings_because annoyed i_like...,positive,akin go barely mornings because annoyed i like...
27673,vibrant anyone well a_rather overheated relate...,positive,vibrant anyone well a rather overheated relate...
27674,very_hefty you'd put_it for_chicken cut when_y...,positive,very hefty you'd put it for chicken cut when y...
27675,am_very flatware_set needless_to &amp;quot;non...,positive,am very flatware set needless to &amp;quot;non...


# 4. Section Marker

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
label_encoder_news = LabelEncoder()
label_encoder_news.fit(NEWS_DATAFRAME["label"])

In [20]:
NEWS_DATAFRAME["label_id"] = label_encoder_news.transform(NEWS_DATAFRAME["label"])

In [21]:
label_encoder_multidomain = LabelEncoder()
label_encoder_multidomain.fit(MULTIDOMAIN_DATAFRAME["label"])

In [22]:
MULTIDOMAIN_DATAFRAME["label_id"] = label_encoder_multidomain.transform(MULTIDOMAIN_DATAFRAME["label"])

# 5. Section Marker

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
RANDOM_SEED = 42

In [25]:
def split_dataframe(dataframe):
    
	X = dataframe["document_clean"]
	y = dataframe["label_id"]

	# 60% for training, 40% for second split
	X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.4, random_state = RANDOM_SEED, stratify = y)

	# 10% for validation, 30% for test
	X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.75, random_state = RANDOM_SEED, stratify = y_temp)

	train_df = X_train.to_frame("text")
	train_df["original_text"] = X_train.values
	train_df["label"] = y_train.values

	val_df = X_val.to_frame("text")
	val_df["original_text"] = X_val.values
	val_df["label"] = y_val.values

	test_df = X_test.to_frame("text")
	test_df["original_text"] = X_test.values
	test_df["label"] = y_test.values

	print(f"Train size: {len(train_df)}")
	print(f"Val size:   {len(val_df)}")
	print(f"Test size:  {len(test_df)}")

	return train_df, val_df, test_df

In [26]:
train_df_news, val_df_news, test_df_news = split_dataframe(NEWS_DATAFRAME)

Train size: 11296
Val size:   1883
Test size:  5649


In [27]:
train_df_multidomain, val_df_multidomain, test_df_multidomain = split_dataframe(MULTIDOMAIN_DATAFRAME)

Train size: 16606
Val size:   2767
Test size:  8304


# 6. Section Marker

In [28]:
train_news = train_df_news.copy()
val_news = val_df_news.copy()
test_news = test_df_news.copy()

In [29]:
train_multidomain = train_df_multidomain.copy()
val_multidomain = val_df_multidomain.copy()
test_multidomain = test_df_multidomain.copy()

# 7. Section Marker

In [30]:
from transformers import AutoTokenizer

In [31]:
bert_base_uncased_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case = True)

In [32]:
train_news["text"] = train_news["text"].apply(lambda x: bert_base_uncased_tokenizer(x, padding = 'max_length', truncation = True, max_length = 512, return_tensors = 'pt'))
val_news["text"] = val_news["text"].apply(lambda x: bert_base_uncased_tokenizer(x, padding = 'max_length', truncation = True, max_length = 512, return_tensors = 'pt'))
test_news["text"] = test_news["text"].apply(lambda x: bert_base_uncased_tokenizer(x, padding = 'max_length', truncation = True, max_length = 512, return_tensors = 'pt'))

In [33]:
train_multidomain["text"] = train_multidomain["text"].apply(lambda x: bert_base_uncased_tokenizer(x, padding = 'max_length', truncation = True, max_length = 512, return_tensors = 'pt'))
val_multidomain["text"] = val_multidomain["text"].apply(lambda x: bert_base_uncased_tokenizer(x, padding = 'max_length', truncation = True, max_length = 512, return_tensors = 'pt'))
test_multidomain["text"] = test_multidomain["text"].apply(lambda x: bert_base_uncased_tokenizer(x, padding = 'max_length', truncation = True, max_length = 512, return_tensors = 'pt'))

# 8. Section Marker

In [34]:
from torch.utils.data import Dataset, DataLoader
import torch

In [35]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [36]:
class CustomDataset(Dataset):
    
	def __init__(self, data):
		self.x = torch.tensor(data["text"].apply(lambda x: x["input_ids"].tolist()[0]).tolist()).to(device)
		self.y = torch.tensor(data['label'].values).to(device)
	

	def __getitem__(self, idx):
		return self.x[idx], self.y[idx]


	def __len__(self):
		return len(self.x)

In [37]:
train_news_dataset = CustomDataset(train_news)
val_news_dataset = CustomDataset(val_news)
test_news_dataset = CustomDataset(test_news)

train_news_loader = DataLoader(train_news_dataset, batch_size = 32, shuffle = True)
val_news_loader = DataLoader(val_news_dataset, batch_size = 32, shuffle = True)
test_news_loader = DataLoader(test_news_dataset, batch_size = 32, shuffle = True)

In [38]:
train_multidomain_dataset = CustomDataset(train_multidomain)
val_multidomain_dataset = CustomDataset(val_multidomain)
test_multidomain_dataset = CustomDataset(test_multidomain)

train_multidomain_loader = DataLoader(train_multidomain_dataset, batch_size = 32, shuffle = True)
val_multidomain_loader = DataLoader(val_multidomain_dataset, batch_size = 32, shuffle = True)
test_multidomain_loader = DataLoader(test_multidomain_dataset, batch_size = 32, shuffle = True)