In [None]:

# `nltk` must be imported as a module before `nltk.download` can be called;
# the notebook's import cell only pulls in `nltk.corpus` / `nltk.stem` names.
import nltk

# Fetch the corpora needed for preprocessing: the stop-word lists and the
# WordNet data used by the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
import re
import string
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity




In [None]:

class TextProcessor:
    """Normalise a collection of free-text documents before vectorisation.

    Each document is lower-cased, has ASCII punctuation and digits replaced by
    spaces, is split on whitespace, filtered against a stop-word list and
    lemmatised word by word.

    Args:
        text_data: Object exposing ``.apply`` (e.g. a pandas Series) whose
            elements are the documents to clean.
        stop_words: Optional iterable of words to drop. When omitted, NLTK's
            English stop-word list is loaded the first time it is needed.
        lemmatizer: Optional object with a ``lemmatize(word)`` method. When
            omitted, a ``WordNetLemmatizer`` is created the first time it is
            needed.
    """

    # `re.escape` is required: inside a character class the raw punctuation
    # string contains `\]`, which the regex engine reads as an escaped `]`,
    # so a literal backslash would otherwise never be matched.
    _PUNCT_DIGIT_RE = re.compile(f"[{re.escape(string.punctuation)}0-9]")

    def __init__(self, text_data, stop_words=None, lemmatizer=None):
        self.text_data = text_data
        self._stop_words = None if stop_words is None else set(stop_words)
        self._lemmatizer = lemmatizer

    def _get_stop_words(self):
        """Return the stop-word set, loading NLTK's English list on first use."""
        if self._stop_words is None:
            self._stop_words = set(stopwords.words('english'))
        return self._stop_words

    def _get_lemmatizer(self):
        """Return the lemmatizer, creating a WordNetLemmatizer on first use."""
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()
        return self._lemmatizer

    def preprocess_text(self, text):
        """Clean a single document.

        Args:
            text: The document. Missing values (``None``/NaN) are treated as
                an empty document; other non-string values are converted
                with ``str``.

        Returns:
            The surviving lemmatised tokens joined by single spaces.
        """
        if not isinstance(text, str):
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ""
            text = str(text)

        text = self._PUNCT_DIGIT_RE.sub(" ", text.lower())

        # Resolved once per call (and cached on the instance) rather than
        # rebuilt for every document.
        stop_words = self._get_stop_words()
        lemmatizer = self._get_lemmatizer()

        return " ".join(
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in stop_words
        )

    def process(self):
        """Apply :meth:`preprocess_text` to every element of ``text_data``."""
        return self.text_data.apply(self.preprocess_text)


In [None]:


class DataHandler:
    """Load a CSV dataset and expose its text column and encoded labels.

    Args:
        filepath: Path of the CSV file, passed to ``pd.read_csv``.
        target_column: Name of the column holding the class labels.
        text_column: Name of the column holding the text to classify.

    Attributes are set for the loaded frame (``data``), the two column names
    and a ``LabelEncoder`` (``label_encoder``) used to map labels to integers.
    """

    def __init__(self, filepath, target_column='SubCategory', text_column='Description'):
        self.data = pd.read_csv(filepath)
        self.target_column = target_column
        self.text_column = text_column
        self.label_encoder = LabelEncoder()

    def encode_labels(self):
        """Fit the label encoder on the current target column.

        Returns:
            The integer-encoded labels for the rows currently in ``data``.
        """
        y = self.data[self.target_column]
        y_encoded = self.label_encoder.fit_transform(y)
        return y_encoded

    def filter_rare_classes(self, min_class_size=2):
        """Drop every row whose class has fewer than ``min_class_size`` rows.

        Classes are counted directly on the raw label column, so the shared
        label encoder is not fitted on classes that are about to be removed.
        ``data`` is replaced by the filtered frame; the original row index is
        kept as-is.

        Note: ``value_counts`` skips missing labels by default, so rows with
        a missing label are presumably left in place — verify if the dataset
        can contain them.
        """
        labels = self.data[self.target_column]
        class_counts = labels.value_counts()
        rare_classes = class_counts[class_counts < min_class_size].index
        self.data = self.data[~labels.isin(rare_classes)]

    def get_features_and_labels(self):
        """Return the text column and freshly encoded labels.

        The label encoder is re-fitted on the rows currently in ``data``.

        Returns:
            Tuple ``(X, y)`` of the text column and the encoded labels.
        """
        X = self.data[self.text_column]
        y = self.label_encoder.fit_transform(self.data[self.target_column])
        return X, y


class FeatureExtractor:
    """Small wrapper that owns a ``TfidfVectorizer`` instance."""

    def __init__(self, max_features=5000):
        # ``max_features`` is forwarded unchanged to the vectorizer.
        tfidf = TfidfVectorizer(max_features=max_features)
        self.vectorizer = tfidf

    def fit_transform(self, text_data):
        """Fit the wrapped vectorizer on ``text_data`` and return its output."""
        tfidf_matrix = self.vectorizer.fit_transform(text_data)
        return tfidf_matrix


class ModelPipeline:
    """Train and evaluate a scikit-learn style classifier.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. When omitted, a
            new ``LogisticRegression(random_state=42, max_iter=1000)`` is
            created for this instance.
    """

    def __init__(self, model=None):
        # The default estimator is built per instance. Using an estimator
        # object as the default argument would create it once at definition
        # time and share it (and its fitted state) between all pipelines.
        if model is None:
            model = LogisticRegression(random_state=42, max_iter=1000)
        self.model = model

    def train(self, X_train, y_train):
        """Fit the wrapped model on the training features and labels."""
        self.model.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and score the predictions against ``y_test``.

        Returns:
            Tuple ``(accuracy, conf_matrix)`` as computed by
            ``accuracy_score`` and ``confusion_matrix``.
        """
        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        return accuracy, conf_matrix



In [None]:

class RecommendationSystem:
    """Recommend products whose description embeddings are most similar.

    Args:
        model_name: Hugging Face model identifier used to load both the
            tokenizer and the encoder via ``from_pretrained``.

    ``product_embeddings`` maps each product name to the NumPy embedding of
    its description; it is empty until :meth:`create_embeddings` is called.
    """

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.product_embeddings = {}

    def generate_embedding(self, text):
        """Encode ``text`` into one vector per input sequence.

        Token states from the encoder's last hidden layer are averaged over
        the sequence dimension. When the tokenizer supplies an attention
        mask, only unmasked positions contribute, so padding added for a
        batch of unequal-length texts does not dilute the average. Without a
        mask the plain mean over all positions is used.

        Returns:
            A torch tensor with the pooled embedding(s).
        """
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            token_states = self.model(**inputs).last_hidden_state
            attention_mask = inputs.get('attention_mask')
            if attention_mask is None:
                return token_states.mean(dim=1)

            # Broadcast the mask over the hidden dimension, then take the
            # mean over the positions that are real tokens.
            mask = attention_mask.unsqueeze(-1).to(token_states.dtype)
            summed = (token_states * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid division by zero
            return summed / counts

    def create_embeddings(self, products, descriptions):
        """Embed every description and store it under its product name.

        ``products`` and ``descriptions`` are paired positionally with
        ``zip``. Because the result is a dict keyed by product name, a
        repeated name keeps only the embedding of its last description.
        Any previously stored embeddings are replaced.
        """
        self.product_embeddings = {
            product: self.generate_embedding(desc).numpy()
            for product, desc in zip(products, descriptions)
        }

    def recommend(self, product_name, top_n=5):
        """Return the names of the ``top_n`` products most similar to one.

        Similarity is the cosine similarity between stored embeddings; the
        query product itself is excluded.

        Returns:
            A list of product names ordered from most to least similar, or —
            when ``product_name`` has no stored embedding — a message string
            saying so (callers must handle both return types).
        """
        if product_name not in self.product_embeddings:
            return f"Product '{product_name}' not found in database."

        candidates = [
            (name, embedding)
            for name, embedding in self.product_embeddings.items()
            if name != product_name
        ]
        if not candidates:
            return []

        # Score every candidate with a single vectorised call instead of one
        # cosine_similarity call per product.
        query_embedding = self.product_embeddings[product_name]
        candidate_matrix = np.vstack([embedding for _, embedding in candidates])
        scores = cosine_similarity(query_embedding, candidate_matrix)[0]

        ranked = sorted(
            zip((name for name, _ in candidates), scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return [name for name, _ in ranked[:top_n]]




In [None]:
# Pipeline Execution
# SMOTE(k_neighbors=1) needs at least two training rows per class. With an
# 80/20 stratified split, keeping only classes that have at least three rows
# overall is intended to guarantee that.
MIN_CLASS_SIZE = 3

data_handler = DataHandler(filepath='/content/NLP_Task_Dataset.csv')
data_handler.filter_rare_classes(min_class_size=MIN_CLASS_SIZE)
X_raw, y = data_handler.get_features_and_labels()

text_processor = TextProcessor(X_raw)
X_processed = text_processor.process()

# Split BEFORE fitting TF-IDF and BEFORE oversampling. Resampling the whole
# dataset first lets synthetic points derived from test rows leak into the
# training set (and puts synthetic rows in the test set), which inflates the
# reported accuracy.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)

feature_extractor = FeatureExtractor()
X_train_tfidf = feature_extractor.fit_transform(X_train_text)
# Reuse the vocabulary/IDF weights learned on the training split only.
X_test_tfidf = feature_extractor.vectorizer.transform(X_test_text)

# Check the stored sparse values directly instead of densifying the matrix.
if np.isnan(X_train_tfidf.data).any() or np.isnan(X_test_tfidf.data).any():
    print("TF-IDF features contain NaN values; skipping model training.")
else:
    # Oversample the training split only; the test split keeps its original
    # (real, imbalanced) class distribution.
    smote = SMOTE(random_state=42, k_neighbors=1)
    X_train, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

    model_pipeline = ModelPipeline()
    model_pipeline.train(X_train, y_train_resampled)
    accuracy, conf_matrix = model_pipeline.evaluate(X_test_tfidf, y_test)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Confusion Matrix:")
    print(conf_matrix)

# Recommendations
recommender = RecommendationSystem()
recommender.create_embeddings(data_handler.data['ProductName'], data_handler.data['Description'])
# The surrounding double quotes are part of the name being looked up.
product_name = '"peak lapel tuxedo suit jacket"'
top_similar_products = recommender.recommend(product_name, top_n=5)
print(f"Top 5 products similar to '{product_name}': {top_similar_products}")

Accuracy: 95.61%
Confusion Matrix:
[[58  0  0 ...  0  0  0]
 [ 0 68  1 ...  0  0  0]
 [ 0  0 68 ...  0  0  0]
 ...
 [ 0  0  0 ... 68  0  0]
 [ 0  0  0 ...  0 69  0]
 [ 0  0  0 ...  0  0 69]]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Top 5 products similar to '"peak lapel tuxedo suit jacket"': ['"fitted tuxedo jacket"', '"velvet tuxedo jacket"', '"classic tuxedo jacket"', '"Men\'s Silk Tuxedo Jacket"', '"Men\'s G-Line Two-Piece Tuxedo Suit"']
