In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score


In [2]:

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jayes\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:

# === STEP 1: Load Dataset ===
# Update with actual path
df = pd.read_csv("../Dataset/processed_data/final_dataset.csv")
df.dropna(inplace=True)  # Remove missing values

In [4]:

# === STEP 2: Text Preprocessing ===
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans(
        '', '', string.punctuation))  # Remove punctuation
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word)
              for word in tokens if word not in stop_words]
    return " ".join(tokens)


df['processed_text'] = df['description'].apply(preprocess_text)


In [5]:

# === STEP 3: Encode Labels (Domain & Subdomain) ===
domain_encoder = LabelEncoder()
subdomain_encoder = LabelEncoder()

df['domain_encoded'] = domain_encoder.fit_transform(df['domain'])
df['subdomain_encoded'] = subdomain_encoder.fit_transform(df['sub_domain'])


In [6]:

# === STEP 4: Convert Text to Features ===
vectorizer = TfidfVectorizer(max_features=5000)  # Keep only top 5000 words
X = vectorizer.fit_transform(df['processed_text'])


In [22]:

# === STEP 5: Split Data into Train/Test ===
from sklearn.preprocessing import MultiLabelBinarizer
X_train, X_test, y_train, y_test = train_test_split(
    X, df[['domain_encoded', 'subdomain_encoded']], test_size=0.2, random_state=42)

y_train, y_test = y_train.values, y_test.values

# If y_train/y_test is a list of sets (e.g., [{'tech', 'finance'}, {'sports'}])
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [23]:

# === STEP 6: Train Multi-Output SVM Model ===
svm_model = SVC(kernel='linear', C=1.0)
# Disable parallelization
multi_output_model = MultiOutputClassifier(svm_model, n_jobs=1)
multi_output_model.fit(X_train, y_train)

In [24]:

from sklearn.metrics import hamming_loss

# === STEP 7: Evaluate Model ===
y_pred = multi_output_model.predict(X_test)

print("Hamming Loss:", hamming_loss(y_test, y_pred))

Hamming Loss: 0.010292676412739154


In [33]:


# === STEP 8: Make Predictions on New Text ===


def predict_domain_and_subdomain(text):
    text = preprocess_text(text)
    text_vec = vectorizer.transform([text])

    # Get predictions
    # This returns a 2D array of shape (1, 2)
    predictions = multi_output_model.predict(text_vec)

    # Extract domain and subdomain correctly
    domain_pred = predictions[0][0]  # First label (Domain)
    subdomain_pred = predictions[0][1]  # Second label (Subdomain)

    # Convert back to original labels
    domain = domain_encoder.inverse_transform([domain_pred])[0]
    subdomain = subdomain_encoder.inverse_transform([subdomain_pred])[0]

    return domain, subdomain


# Example Usage:
new_text = '''✅Modern Chesterfield Design: Designed with the traditional modern Chesterfield style in mind decorative elements, scrolled arms, and fashionable legs, this piece has all the important features of the classic modern Chesterfield style.
✅Button Tufted Diamonds Stitch: The chaise's button-tufted stitching adds an additional level of refinement to its elegant form. The diamond stitch design provides and wooden leg some roughness without compromising comfort. PRODUCT DIMENSION: 220 Lx 90 Dx 78H-(W x D x H).
✅ The seat, back and armrests are thickly padded, which makes the chaise lounge very comfortable, and the sturdy wooden feet contribute to the stability of the construction.
✅ DURABLE DESIGN - A naturally strong frame is wrapped in supportive foam padding and durable polyester fabric, it has a maximum weight capacity of 498.2 lbs; the cushions are secured to the frame and are not removable.
✅MULTIPURPOSE DESIGN: An attractive, multipurpose design makes it perfect for various spaces such as your living room, college dorm, home office, and more.
✅Our delivery service is very fast, we deliver the order to the customer with 3 days guarantee after dispatch.
✅[Customer Guarantee] We want all of our customers to feel 100% satisfied. If you have any questions, please email us in time, we guarantee to reply within 24 hours and give you a satisfactory reply.'''

domain, subdomain = predict_domain_and_subdomain(new_text)
print("Predicted Domain:", domain)
print("Predicted Subdomain:", subdomain)

Predicted Domain: Medical
Predicted Subdomain: Allergy & Immunology


In [28]:
import joblib

# === STEP 1: Save the model and required components ===


def save_model(model, vectorizer, domain_encoder, subdomain_encoder, model_filename, vectorizer_filename, domain_encoder_filename, subdomain_encoder_filename):
    # Save the trained model
    joblib.dump(model, model_filename)
    # Save the TfidfVectorizer
    joblib.dump(vectorizer, vectorizer_filename)
    # Save the LabelEncoders
    joblib.dump(domain_encoder, domain_encoder_filename)
    joblib.dump(subdomain_encoder, subdomain_encoder_filename)
    print("Model and components saved successfully!")


# Example usage
save_model(multi_output_model, vectorizer, domain_encoder, subdomain_encoder,
           '../models/svm/svm_domain_subdomain_model.joblib',
           '../models/svm/tfidf_vectorizer.joblib',
           '../models/svm/domain_encoder.joblib',
           '../models/svm/subdomain_encoder.joblib')

Model and components saved successfully!


In [None]:
# === STEP 2: Load the saved model and components ===
def load_model(model_filename, vectorizer_filename, domain_encoder_filename, subdomain_encoder_filename):
    # Load the trained model
    model = joblib.load(model_filename)
    # Load the TfidfVectorizer
    vectorizer = joblib.load(vectorizer_filename)
    # Load the LabelEncoders
    domain_encoder = joblib.load(domain_encoder_filename)
    subdomain_encoder = joblib.load(subdomain_encoder_filename)
    print("Model and components loaded successfully!")
    return model, vectorizer, domain_encoder, subdomain_encoder


# Example usage
loaded_model, loaded_vectorizer, loaded_domain_encoder, loaded_subdomain_encoder = load_model(
    '../models/svm/svm_domain_subdomain_model.joblib',
    '../models/svm/tfidf_vectorizer.joblib',
    '../models/svm/domain_encoder.joblib',
    '../models/svm/subdomain_encoder.joblib')