# 4. Data Modeling
* [4 Training Data](#2_Data_training_introduction)
  * [4.1 Dummy Variables/One Hot Encoding for Categorical](#3.1_one_hot_encoding)
  * [4.2 Standardize Numerical Data](#3.2_standardize)
  * [4.3 Testing Training](#3.3_testing_training)
 * [4.2 Summary](#3.7_Summary)

In [1]:
# import sys
# sys.executable

# !/opt/homebrew/anaconda3/bin/python -m pip install xgboost fasttext nltk spacy

In [2]:
%store -r products
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test
#print(products) 
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
Index: 2661 entries, 9454 to 4138
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Brand           2661 non-null   category
 1   Description     2661 non-null   string  
 2   Keyword         2661 non-null   object  
 3   UPC             2661 non-null   object  
 4   MSRP            2661 non-null   float64 
 5   Quantity        2661 non-null   int64   
 6   SKU             2661 non-null   object  
 7   Color           2661 non-null   string  
 8   Size            2661 non-null   string  
 9   StyleNumber     2661 non-null   object  
 10  StyleName       2661 non-null   object  
 11  ParentCategory  2661 non-null   string  
dtypes: category(1), float64(1), int64(1), object(5), string(4)
memory usage: 259.3+ KB
None


  from pandas.core import (


Resources: 
- [blog post](https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d)
- [textcategorizer](https://spacy.io/api/textcategorizer)
- [doc2vec](https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py)
- [vec regression](https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4)

In [4]:
from gensim.models.doc2vec import TaggedDocument
# Example creation of a Doc2Vec model instance
from gensim.models.doc2vec import Doc2Vec

doc2vec_model = Doc2Vec(vector_size=100, min_count=2, epochs=40)

tokenized_descriptions = [description.split() for description in products['Description']]

# Convert each document (product description) to a TaggedDocument object
tagged_data = [TaggedDocument(words=words, tags=[str(i)]) for i, words in enumerate(tokenized_descriptions)]

# Now tagged_data contains a list of TaggedDocument objects

# Build vocabulary from the tagged data
doc2vec_model.build_vocab(tagged_data)

# Train the Doc2Vec model
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
import numpy as np
import gensim
from nltk.corpus import stopwords
from xgboost import XGBClassifier

# Load data
data = products

# Define a list of stopwords
stop_words = stopwords.words('english')

# Define a CountVectorizer instance with the list of stopwords for BoW
vectorizer_bow = CountVectorizer(stop_words=stop_words)

# Define a TfidfVectorizer instance with the list of stopwords for TF-IDF
vectorizer_tfidf = TfidfVectorizer(stop_words=stop_words)

# Define an OneHotEncoder instance for encoding categorical features
encoder = OneHotEncoder(sparse_output=False)

# Text preprocessing for 'Description' using BoW
description_bow = vectorizer_bow.fit_transform(data['Description'])

# Text preprocessing for 'Description' using TF-IDF
description_tfidf = vectorizer_tfidf.fit_transform(data['Description'])

# Encode categorical features 'Brand' and 'Category'
encoded_brand = encoder.fit_transform(data[['Brand']])
encoded_category = encoder.fit_transform(data[['Category']])

# Combine features for BoW
combined_features_bow = np.hstack((description_bow.toarray(), encoded_brand, encoded_category))

# Combine features for TF-IDF
combined_features_tfidf = np.hstack((description_tfidf.toarray(), encoded_brand, encoded_category))

# Split data into train and test sets
X_train_bow, X_test_bow, y_train, y_test = train_test_split(combined_features_bow, data['Category'], test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(combined_features_tfidf, data['Category'], test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "Doc2Vec": gensim.models.Doc2Vec(vector_size=100, min_count=2, epochs=40),
    # "FastText": fasttext.train_supervised(input=description_tfidf, epoch=25, wordNgrams=2),
    "XGBoost": XGBClassifier()
}

# Evaluate each model
for name, model in models.items():
    if name == "Doc2Vec":
        # Train Doc2Vec model
        model.build_vocab(data['Description'])
        X_train_doc2vec = [model.infer_vector(doc.split()) for doc in data['Description'][:int(0.8 * len(data))]]
        X_test_doc2vec = [model.infer_vector(doc.split()) for doc in data['Description'][int(0.8 * len(data)):]]

        # Train the model
        model.fit(X_train_doc2vec, y_train)

        # Make predictions
        y_pred = model.predict(X_test_doc2vec)
    # elif name == "FastText":
    #     # Train FastText model
    #     model.fit(X_train_tfidf, y_train)

    #     # Make predictions
    #     y_pred = model.predict(X_test_tfidf)
    else:
        # Train other models
        if "Naive Bayes" in name:
            model.fit(X_train_tfidf, y_train)
            y_pred = model.predict(X_test_tfidf)
        else:
            model.fit(X_train_bow, y_train)
            y_pred = model.predict(X_test_bow)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Calculate precision
    precision = precision_score(y_test, y_pred, average='weighted')

    # Print results
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{cm}\n")


  _warn_prf(average, modifier, msg_start, len(result))


Model: Logistic Regression
Accuracy: 0.99
Precision: 0.98
Confusion Matrix:
[[ 2  0  0 ...  0  0  0]
 [ 0  5  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 17  0  0]
 [ 0  0  0 ...  0  4  0]
 [ 0  0  0 ...  0  0  1]]



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: Decision Tree
Accuracy: 0.99
Precision: 0.99
Confusion Matrix:
[[ 2  0  0 ...  0  0  0]
 [ 0  5  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 17  0  0]
 [ 0  0  0 ...  0  4  0]
 [ 0  0  0 ...  0  0  1]]

Model: Naive Bayes
Accuracy: 0.92
Precision: 0.88
Confusion Matrix:
[[ 1  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 17  0  0]
 [ 0  0  0 ...  0  4  0]
 [ 0  0  0 ...  0  0  1]]



  _warn_prf(average, modifier, msg_start, len(result))


Model: Random Forest
Accuracy: 0.99
Precision: 0.98
Confusion Matrix:
[[ 2  0  0 ...  0  0  0]
 [ 0  5  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 17  0  0]
 [ 0  0  0 ...  0  4  0]
 [ 0  0  0 ...  0  0  1]]

Model: Support Vector Machine
Accuracy: 0.98
Precision: 0.97
Confusion Matrix:
[[ 2  0  0 ...  0  0  0]
 [ 0  5  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 17  0  0]
 [ 0  0  0 ...  0  4  0]
 [ 0  0  0 ...  0  0  1]]



  _warn_prf(average, modifier, msg_start, len(result))


AttributeError: 'str' object has no attribute 'words'

In [6]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
import numpy as np
import gensim
import spacy
print("loaded")

loaded


In [10]:
# Load data
data = products

# Define a list of stopwords
stop_words = stopwords.words('english')

# Bag of Words (BoW) method
vectorizer_bow = CountVectorizer(stop_words=stop_words)
description_bow = vectorizer_bow.fit_transform(data['Description'])

# TF-IDF (Term Frequency-Inverse Document Frequency) method
vectorizer_tfidf = TfidfVectorizer(stop_words=stop_words)
description_tfidf = vectorizer_tfidf.fit_transform(data['Description'])

# Word Embeddings (Word2Vec) method
word2vec_model = gensim.models.Word2Vec(sentences=data['Description'], vector_size=100, window=5, min_count=1, workers=4)

word_embeddings = []
for word in data['Description']:
    if word in word2vec_model.wv:
        word_embeddings.append(word2vec_model.wv[word])
    else:
        word_embeddings.append(np.zeros(100))  # If word not present, use zero vector

# Doc2Vec method
# doc2vec_model = gensim.models.Doc2Vec(vector_size=100, min_count=2, epochs=40)
# doc2vec_model.build_vocab(data['Description'])
# doc2vec_model.train(data['Description'], total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# TextCategorizer (spaCy) method
nlp = spacy.load("en_core_web_sm")

# Add the text categorizer component to the pipeline using its string name
nlp.add_pipe('textcat')

# Machine Learning Classification Algorithms
X_train, X_test, y_train, y_test = train_test_split(data['Description'], data['Category'], test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC()
}

# Evaluate each model
for name, model in models.items():
    if "Naive Bayes" in name:
        model.fit(vectorizer_tfidf.transform(X_train), y_train)
        y_pred = model.predict(vectorizer_tfidf.transform(X_test))
    else:
        model.fit(vectorizer_bow.transform(X_train), y_train)
        y_pred = model.predict(vectorizer_bow.transform(X_test))

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Calculate precision
    precision = precision_score(y_test, y_pred, average='weighted')

    # Print results
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{cm}\n")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: Logistic Regression
Accuracy: 0.93
Precision: 0.92
Confusion Matrix:
[[ 2  0  0 ...  0  0  0]
 [ 2  3  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 16  0  0]
 [ 0  0  0 ...  0  4  0]
 [ 0  0  0 ...  0  0  1]]

Model: Decision Tree
Accuracy: 0.92
Precision: 0.93
Confusion Matrix:
[[ 1  0  0 ...  0  0  0]
 [ 2  3  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 16  0  0]
 [ 0  0  0 ...  0  4  0]
 [ 0  0  0 ...  0  0  1]]

Model: Naive Bayes
Accuracy: 0.75
Precision: 0.72
Confusion Matrix:
[[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  1 ...  0  0  0]
 ...
 [ 0  0  0 ... 16  0  0]
 [ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0  0]]



  _warn_prf(average, modifier, msg_start, len(result))


Model: Random Forest
Accuracy: 0.95
Precision: 0.95
Confusion Matrix:
[[ 2  0  0 ...  0  0  0]
 [ 2  3  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 17  0  0]
 [ 0  0  0 ...  0  4  0]
 [ 0  0  0 ...  0  0  1]]

Model: Support Vector Machine
Accuracy: 0.92
Precision: 0.90
Confusion Matrix:
[[ 1  0  0 ...  0  0  0]
 [ 2  3  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 16  0  0]
 [ 0  0  0 ...  0  4  0]
 [ 0  0  0 ...  0  0  1]]



  _warn_prf(average, modifier, msg_start, len(result))
