In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import WordNetLemmatizer

In [2]:

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jayes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:

# Load dataset
df = pd.read_csv("../Dataset/processed_data/final_dataset.csv")  # Ensure the dataset exists


In [4]:

# Combine domain and subdomain into a single label (multi-label classification)
df["labels"] = df["domain"] + " | " + df["sub_domain"]


In [5]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


# Text Preprocessing Function

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans(
        '', '', string.punctuation))  # Remove punctuation
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word)
              for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [6]:

# Apply text preprocessing
df["cleaned_description"] = df["description"].apply(preprocess_text)


In [7]:

# Convert text into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limit vocabulary size
X = vectorizer.fit_transform(df["cleaned_description"])


In [8]:

# Encode labels (convert categorical labels into numbers)
y = df["labels"]


In [9]:

# Convert labels into numerical format
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


In [10]:

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


### Logistic Regression

In [11]:

# Train Logistic Regression Model
model = LogisticRegression(max_iter=500)


In [None]:
model.fit(X_train, y_train)


In [14]:

# Predictions
y_pred = model.predict(X_test)


In [15]:

# Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Model Accuracy: 0.7738
Classification Report:
                                     precision    recall  f1-score   support

                 Ecommerce | Books       0.88      0.88      0.88      2380
Ecommerce | Clothing & Accessories       0.97      0.97      0.97      1748
           Ecommerce | Electronics       0.95      0.93      0.94      2185
             Ecommerce | Household       0.94      0.97      0.95      3884
    Medical | Allergy & Immunology       0.00      0.00      0.00         3
          Medical | Anesthesiology       1.00      0.30      0.46        10
              Medical | Cardiology       0.80      0.36      0.50        11
             Medical | Dermatology       0.00      0.00      0.00        10
                     Medical | ENT       0.00      0.00      0.00         1
      Medical | Emergency Medicine       0.50      0.25      0.33         8
           Medical | Endocrinology       0.00      0.00      0.00         8
         Medical | General Surgery      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:

# Testing the model with a new description
def predict_domain(text):
    cleaned_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([cleaned_text])
    predicted_label = model.predict(vectorized_text)
    return label_encoder.inverse_transform(predicted_label)[0]


In [25]:

# Example
sample_text = """Modern Chesterfield Design: Designed with the traditional modern Chesterfield style in mind decorative elements, scrolled arms, and fashionable legs, this piece has all the important features of the classic modern Chesterfield style.
Button Tufted Diamonds Stitch: The chaise's button-tufted stitching adds an additional level of refinement to its elegant form. The diamond stitch design provides and wooden leg some roughness without compromising comfort. PRODUCT DIMENSION: 220 Lx 90 Dx 78H-(W x D x H).
The seat, back and armrests are thickly padded, which makes the chaise lounge very comfortable, and the sturdy wooden feet contribute to the stability of the construction.
DURABLE DESIGN - A naturally strong frame is wrapped in supportive foam padding and durable polyester fabric, it has a maximum weight capacity of 498.2 lbs; the cushions are secured to the frame and are not removable.
MULTIPURPOSE DESIGN: An attractive, multipurpose design makes it perfect for various spaces such as your living room, college dorm, home office, and more.
Our delivery service is very fast, we deliver the order to the customer with 3 days guarantee after dispatch.
[Customer Guarantee] We want all of our customers to feel 100% satisfied. If you have any questions, please email us in time, we guarantee to reply within 24 hours and give you a satisfactory reply."""
print("Predicted Domain:", predict_domain(sample_text))

Predicted Domain: Ecommerce | Household


In [27]:
import joblib

# Save the trained model
joblib.dump(model, "../models/logistic_regression/logistic_regression_model.pkl")

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, "../models/logistic_regression/tfidf_vectorizer.pkl")

# Save the Label Encoder
joblib.dump(label_encoder, "../models/logistic_regression/label_encoder.pkl")

print("Model, Vectorizer, and Label Encoder saved successfully!")

Model, Vectorizer, and Label Encoder saved successfully!


In [None]:
# Load the saved model
loaded_model = joblib.load(
    "../models/logistic_regression/logistic_regression_model.pkl")

# Load the TF-IDF vectorizer
loaded_vectorizer = joblib.load(
    "../models/logistic_regression/tfidf_vectorizer.pkl")

# Load the Label Encoder
loaded_label_encoder = joblib.load(
    "../models/logistic_regression/label_encoder.pkl")

# Function to predict domain from new text


def predict_domain(text):
    cleaned_text = preprocess_text(text)  # Apply same preprocessing
    vectorized_text = loaded_vectorizer.transform(
        [cleaned_text])  # Convert text to features
    predicted_label = loaded_model.predict(vectorized_text)  # Predict label
    # Convert back to text
    return loaded_label_encoder.inverse_transform(predicted_label)[0]


# Test the saved model
sample_text = "This article discusses convolutional neural networks and AI."
print("Predicted Domain:", predict_domain(sample_text))

### Hyperparameter tuning

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Define the hyperparameter grid
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', None],  # Regularization types
    'C': np.logspace(-4, 4, 10),  # Inverse of regularization strength
    # Solvers for optimization
    'solver': ['lbfgs', 'saga', 'liblinear', 'newton-cg'],
    'multi_class': ['auto', 'ovr', 'multinomial'],  # Strategy for multi-class
    'l1_ratio': [0, 0.5, 1]  # Only applicable for 'elasticnet'
}

In [14]:
# Grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5,
                           scoring='accuracy', n_jobs=-1, verbose=2)


In [15]:

# Run Grid Search
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


KeyboardInterrupt: 

In [None]:

# Best parameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


In [None]:

# Evaluate on test data
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Accuracy:", test_score)

In [None]:
# Save the best model
best_model = grid_search.best_estimator_
joblib.dump(best_model, "../models/logistic_regression/best_logistic_regression.pkl")
print("Best model saved as 'best_logistic_regression.pkl'")