In [2]:
import os
import math
import pandas as pd
import re
from collections import Counter
import kagglehub
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class NaiveBayesClassifier:
	def __init__(self, binary=False):
		self.dict = pd.DataFrame()
		self.documents_count = {}

	def train(self, document, label) :
		all_words = re.findall(r'\w+', document.lower())
  
		word_counts = Counter(all_words)
		
		if label not in self.dict.columns:
			self.dict[label] = 0
		
		for word, count in word_counts.items():
			if word not in self.dict.index:
				self.dict.loc[word] = [0] * len(self.dict.columns)
				self.dict.index.name = 'word'
			self.dict.at[word, label] += count

		if label not in self.documents_count:
			self.documents_count[label] = 1
		else:
			self.documents_count[label] += 1

	def predict(self, document, binary=False):
		best_label, best_score  = None, float("-inf")
		all_words = re.findall(r'\w+', document.lower())
		word_counts = Counter(all_words)

		total_docs = sum(self.documents_count.values())

		for label in self.dict.columns:
			class_prob = math.log(self.documents_count[label] / total_docs)

			total_words_for_label = self.dict[label].sum() + len(self.dict)

			for word, count in word_counts.items():
				if binary and count > 0:
					count = 1

				word_freq = self.dict.at[word, label] if word in self.dict.index else 0
				class_prob += count * math.log((word_freq + 1) / total_words_for_label)
			
			if class_prob > best_score:
				best_score = class_prob
				best_label = label
		return best_label

In [12]:
path = kagglehub.dataset_download("saurabhshahane/ecommerce-text-classification")
print(path)

csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
csv_path = os.path.join(path, csv_files[0])

dataset = pd.read_csv(csv_path, names=['label', 'text'])
dataset = dataset.dropna()
dataset = dataset.sample(frac=1, random_state=42)
print(dataset.head())

dataset_x, dataset_y = dataset.shape
train_cut = int(0.8 * dataset_x)

train_data, test_data = dataset[:train_cut], dataset[train_cut:]
classifier = NaiveBayesClassifier()

for index, row in tqdm(train_data.iterrows(), total=len(train_data), desc="Training"):
	classifier.train(row['text'], row['label'])

correct_predictions = 0
for index, row in tqdm(test_data.iterrows(), total=len(test_data), desc="Testing"):
    predicted_label = classifier.predict(row['text'])
    #print(f"Predicted: {predicted_label}, Actual: {row['label']}")
    if predicted_label == row['label']:
        correct_predictions += 1
print(f"Accuracy: {correct_predictions / len(test_data) * 100:.2f}%")

/home/giacomo/.cache/kagglehub/datasets/saurabhshahane/ecommerce-text-classification/versions/76
                        label  \
35847  Clothing & Accessories   
13005               Household   
26164                   Books   
38330  Clothing & Accessories   
45344             Electronics   

                                                    text  
35847  BREGEO Men's Cotton Casual Blazer This one but...  
13005  HealthSense Chef-Mate KS 50 Digital Kitchen Sc...  
26164  Think & Grow Rich About the Author NAPOLEON HI...  
38330  ayushicreationa Women's Cotton Sports Padded B...  
45344  BlueRigger High Speed Micro HDMI to HDMI Cable...  


Training: 100%|██████████| 40339/40339 [08:20<00:00, 80.62it/s] 
Testing: 100%|██████████| 10085/10085 [00:21<00:00, 468.00it/s]

Accuracy: 94.94%





In [13]:
correct_predictions_binary = 0
for index, row in tqdm(test_data.iterrows(), total=len(test_data), desc="Testing"):
    predicted_label = classifier.predict(row['text'], binary=True)
    
    if predicted_label == row['label']:
        correct_predictions_binary += 1
print(f"Accuracy: {correct_predictions_binary / len(test_data) * 100:.2f}%")

Testing: 100%|██████████| 10085/10085 [00:19<00:00, 522.14it/s]

Accuracy: 94.89%



