In [2]:
import torch
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np
import text_cleaner as tc
# Load your data into a pandas dataframe
df = pd.read_csv("./buckets_dataset_5k.csv")  # replace with the path to your data file
for row in range(df.shape[0]):
    df.loc[row, 'raw_text'] = tc.raw_content(df.loc[row, 'title'], None, df.loc[row, 'description'], None)
# Extract input and target data
X = df["raw_text"].values  # input data
# y_labels = df.drop("text", axis=1).values.tolist()  # target labels as list of lists
y_labels = []
for i in range(len(df)):
    labels = []
    if not pd.isna(df['b1'][i]) and df['b1'][i] != 'undefined':
        labels.append(df['b1'][i])
    if not pd.isna(df['b2'][i]) and df['b2'][i] != 'undefined':
        labels.append(df['b2'][i])
    if not pd.isna(df['b3'][i]) and df['b3'][i] != 'undefined':
        labels.append(df['b3'][i])
    y_labels.append(labels)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_labels)  # target data as binary matrix

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

def embed(texts, model_name='bert-base-uncased'):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Add a padding token to the tokenizer
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Tokenize your text data and generate document embeddings using BERT
    embeddings = []
    for text in texts:
        encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)[0]
            doc_embedding = model_output.mean(dim=1).squeeze().tolist()
            embeddings.append(doc_embedding)

    return embeddings



In [7]:
X_embeddings = embed(X)
# Split your data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit a MultiOutputClassifier with a LinearSVC estimator on your training data
model = MultiOutputClassifier(LinearSVC())
model.fit(X_train, y_train)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
y_val_pred = model.predict(X_val)
val_scores = []
for i in range(y_val.shape[1]):
    precision = precision_score(y_val[:, i], y_val_pred[:, i], average='micro')
    recall = recall_score(y_val[:, i], y_val_pred[:, i], average='micro')
    f1 = f1_score(y_val[:, i], y_val_pred[:, i], average='micro')
    accuracy = accuracy_score(y_val[:, i], y_val_pred[:, i])
    val_scores.append([precision, recall, f1, accuracy])
val_scores = pd.DataFrame(val_scores, columns=["precision", "recall", "f1-score", "accuracy"])

# Make predictions on your testing data and calculate evaluation metrics
y_test_pred = model.predict(X_test)
test_scores = []
for i in range(y_test.shape[1]):
    precision = precision_score(y_test[:, i], y_test_pred[:, i], average='micro')
    recall = recall_score(y_test[:, i], y_test_pred[:, i], average='micro')
    f1 = f1_score(y_test[:, i], y_test_pred[:, i], average='micro')
    accuracy = accuracy_score(y_test[:, i], y_test_pred[:, i])
    test_scores.append([precision, recall, f1, accuracy])
    
test_scores = pd.DataFrame(test_scores, columns=["precision", "recall", "f1-score", "accuracy"])

test_scores

Unnamed: 0,precision,recall,f1-score,accuracy
0,0.879686,0.879686,0.879686,0.879686
1,0.829119,0.829119,0.829119,0.829119
2,0.915432,0.915432,0.915432,0.915432
3,0.904098,0.904098,0.904098,0.904098
4,0.96687,0.96687,0.96687,0.96687
5,0.896251,0.896251,0.896251,0.896251
6,1.0,1.0,1.0,1.0
7,0.690497,0.690497,0.690497,0.690497
8,0.912816,0.912816,0.912816,0.912816
9,0.737576,0.737576,0.737576,0.737576


In [9]:
#Check each class represents
label=mlb.classes_
i=0
for l in label:
    print(i, l)
    i = i+1

0 Accessories
1 ActiveWear
2 Beachwear
3 Bohemian
4 BusinessCasual
5 Casual
6 Coaster
7 StreetWear
8 Swimwear
9 Trendy
10 Vintage


In [13]:
def predict_labels(text):
    # Generate embedding for input text
    embedding = embed(text)

    # Predict labels using the trained model
    y_pred = model.predict(embedding)
    
    # Convert predicted binary matrix to label strings
    predicted_labels = mlb.inverse_transform(y_pred)
    
    # Sort predicted labels in descending order based on predicted values
    sorted_labels = sorted(zip(predicted_labels[0], y_pred[0]), key=lambda x: x[1], reverse=True)
    
    # Return top 3 label strings
    top_labels = [label for label, value in sorted_labels[:3]]
    
    return top_labels

In [14]:
clothing_descriptions = [    
    "A cozy knit sweater in a rich shade of burgundy.",   
    "A sleek black leather jacket with silver hardware.",    
    "A flowy midi dress in a colorful floral print.",    
    "A classic white button-up shirt with a tailored fit.",    
    "A trendy pair of high-waisted wide leg pants.",    
    "A comfortable cotton t-shirt with a vintage graphic print.",    
    "A chic camel-colored trench coat for a timeless look.",    
    "A bold statement coat in a vibrant shade of cobalt blue.",    
    "A versatile denim jacket with distressed details.",    
    "A sophisticated silk blouse with a subtle print."
]
for description in clothing_descriptions:
    labels = predict_labels(description)
    print(f"Labels for '{description}': {labels}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A cozy knit sweater in a rich shade of burgundy.': ['Accessories', 'Casual', 'Trendy']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A sleek black leather jacket with silver hardware.': ['Accessories', 'Casual', 'Trendy']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A flowy midi dress in a colorful floral print.': ['Accessories', 'Casual', 'Trendy']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A classic white button-up shirt with a tailored fit.': ['Accessories', 'Casual', 'Trendy']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A trendy pair of high-waisted wide leg pants.': ['Accessories', 'Casual', 'Trendy']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A comfortable cotton t-shirt with a vintage graphic print.': ['Accessories', 'Casual', 'Trendy']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A chic camel-colored trench coat for a timeless look.': ['Accessories', 'Casual', 'Trendy']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A bold statement coat in a vibrant shade of cobalt blue.': ['Accessories', 'Casual', 'Trendy']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A versatile denim jacket with distressed details.': ['Accessories', 'Casual', 'Trendy']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels for 'A sophisticated silk blouse with a subtle print.': ['Accessories', 'Casual', 'Trendy']


In [2]:
from transformers import BertModel, BertTokenizer
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Define a list of text data to embed
text_data = ['This is a sentence.', 'Another sentence.', 'Yet another sentence.']

# Tokenize the text data and convert to tensors
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in text_data]
print(tokenized_texts)
input_ids = torch.tensor(tokenized_texts)

# Generate BERT embeddings for the input tokens
with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs[0][:, 0, :].numpy()

# Print the resulting embeddings
print(embeddings)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[[101, 2023, 2003, 1037, 6251, 1012, 102], [101, 2178, 6251, 1012, 102], [101, 2664, 2178, 6251, 1012, 102]]


ValueError: expected sequence of length 7 at dim 1 (got 5)

In [None]:
import fasttext
# only for windows not macOS
# Load the pre-trained FastText model
model_path = 'cc.en.300.bin'  # path to the pre-trained model
model = fasttext.load_model(model_path)

# Define a list of text data to embed
text_data = ['This is a sentence.', 'Another sentence.', 'Yet another sentence.']

# Embed the text data using the FastText model
embeddings = [model.get_sentence_vector(text) for text in text_data]

# Print the resulting embeddings
print(embeddings)
