In [16]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np
import text_cleaner as tc
# Load your data into a pandas dataframe
df = pd.read_csv("./buckets_dataset_5k.csv")  # replace with the path to your data file
for row in range(df.shape[0]):
    df.loc[row, 'raw_text'] = tc.raw_content(df.loc[row, 'title'], None, df.loc[row, 'description'], None)
# Extract input and target data
X = df["raw_text"].values  # input data
# y_labels = df.drop("text", axis=1).values.tolist()  # target labels as list of lists
y_labels = []
for i in range(len(df)):
    labels = []
    if not pd.isna(df['b1'][i]) and df['b1'][i] != 'undefined':
        labels.append(df['b1'][i])
    if not pd.isna(df['b2'][i]) and df['b2'][i] != 'undefined':
        labels.append(df['b2'][i])
    if not pd.isna(df['b3'][i]) and df['b3'][i] != 'undefined':
        labels.append(df['b3'][i])
    y_labels.append(labels)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_labels)  # target data as binary matrix

In [33]:
df.to_csv("1_temp.csv")

In [17]:
y_labels[0:5]

[['Casual', 'StreetWear'],
 ['Casual', 'ActiveWear'],
 ['Casual', 'Trendy', 'Bohemian'],
 ['Casual', 'ActiveWear'],
 ['Casual', 'ActiveWear', 'StreetWear']]

In [18]:
X[0:5]

array(['Daniella Blouse Coral Whisp Linen Bamboo. Description An unstructured top in cool, breathableBamboo/Linen fabric. The Daniella has a ladylike shape, with a relaxed fit that makes it equally as boho. A box pleat in the centre-front of the top gives extra room where needed, and a slight sharkbite hemline is fluid across high-thigh. Wear the Daniella with slim or loose pants, depending on your preferences. Features -Box pleat in centre-front -Sharkbite hemline -Loose, relaxed fit -Full, 3/4 length sleeves Blue Sky fit guide - true to size. Fabric - 50% Bamboo, 50% Linen',
       'Harvest Moon Plus2. SweetLegs is excited to introduce Harvest Moon SweetLegs to our 2019 leggings collection! Twirl into fall in this limited edition solid colour, perfect for boots and sweater weather! Harvest Moon SweetLegs shine when paired with brown over-the-knee boots, a white ruffle blouse, a royal blue knit cardigan, a wheat coloured panama hat, and a matching Harvest Moon SweetLegs Scrunchie .',


In [19]:
print(y[1], X[1])

[0 1 0 0 0 1 0 0 0 0 0] Harvest Moon Plus2. SweetLegs is excited to introduce Harvest Moon SweetLegs to our 2019 leggings collection! Twirl into fall in this limited edition solid colour, perfect for boots and sweater weather! Harvest Moon SweetLegs shine when paired with brown over-the-knee boots, a white ruffle blouse, a royal blue knit cardigan, a wheat coloured panama hat, and a matching Harvest Moon SweetLegs Scrunchie .


In [31]:

def embed(texts, model_name='gpt2'):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Tokenize text data and generate document embeddings using GPT-2
    embeddings = []
    for text in texts:
        encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)[0]
            doc_embedding = model_output.mean(dim=1).squeeze().tolist()
            embeddings.append(doc_embedding)
    
    return embeddings


In [32]:
# # Load the GPT-2 tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained('gpt2')
# model = AutoModel.from_pretrained('gpt2')
# # Set the pad_token for the tokenizer
# tokenizer.pad_token = tokenizer.eos_token
# # Tokenize your text data and generate document embeddings using GPT-2
# X_embeddings = []
# for text in X:
#     encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
#     with torch.no_grad():
#         model_output = model(**encoded_input)[0]
#         embeddings = model_output.mean(dim=1).squeeze().tolist()
#         X_embeddings.append(embeddings)

X_embeddings = embed(X)
# Split your data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit a MultiOutputClassifier with a LinearSVC estimator on your training data
model = MultiOutputClassifier(LinearSVC())
model.fit(X_train, y_train)




In [34]:
# Make predictions on your validation data and calculate evaluation metrics
y_val_pred = model.predict(X_val)
val_scores = []
for i in range(y_val.shape[1]):
    precision = precision_score(y_val[:, i], y_val_pred[:, i], average='micro')
    recall = recall_score(y_val[:, i], y_val_pred[:, i], average='micro')
    f1 = f1_score(y_val[:, i], y_val_pred[:, i], average='micro')
    accuracy = accuracy_score(y_val[:, i], y_val_pred[:, i])
    val_scores.append([precision, recall, f1, accuracy])
val_scores = pd.DataFrame(val_scores, columns=["precision", "recall", "f1-score", "accuracy"])

# Make predictions on your testing data and calculate evaluation metrics
y_test_pred = model.predict(X_test)
test_scores = []
for i in range(y_test.shape[1]):
    precision = precision_score(y_test[:, i], y_test_pred[:, i], average='micro')
    recall = recall_score(y_test[:, i], y_test_pred[:, i], average='micro')
    f1 = f1_score(y_test[:, i], y_test_pred[:, i], average='micro')
    accuracy = accuracy_score(y_test[:, i], y_test_pred[:, i])
    test_scores.append([precision, recall, f1, accuracy])
    
test_scores = pd.DataFrame(test_scores, columns=["precision", "recall", "f1-score", "accuracy"])

test_scores

Unnamed: 0,precision,recall,f1-score,accuracy
0,0.882302,0.882302,0.882302,0.882302
1,0.852659,0.852659,0.852659,0.852659
2,0.921534,0.921534,0.921534,0.921534
3,0.897995,0.897995,0.897995,0.897995
4,0.97646,0.97646,0.97646,0.97646
5,0.889276,0.889276,0.889276,0.889276
6,1.0,1.0,1.0,1.0
7,0.506539,0.506539,0.506539,0.506539
8,0.904969,0.904969,0.904969,0.904969
9,0.653008,0.653008,0.653008,0.653008


In [21]:
#Check each class represents
label=mlb.classes_
i=0
for l in label:
    print(i, l)
    i = i+1

0 Accessories
1 ActiveWear
2 Beachwear
3 Bohemian
4 BusinessCasual
5 Casual
6 Coaster
7 StreetWear
8 Swimwear
9 Trendy
10 Vintage


In [35]:

def predict_labels(text):
    # Generate embedding for input text
    embedding = embed(text)

    # Predict labels using the trained model
    y_pred = model.predict(embedding)
    
    # Convert predicted binary matrix to label strings
    predicted_labels = mlb.inverse_transform(y_pred)
    
    # Sort predicted labels in descending order based on predicted values
    sorted_labels = sorted(zip(predicted_labels[0], y_pred[0]), key=lambda x: x[1], reverse=True)
    
    # Return top 3 label strings
    top_labels = [label for label, value in sorted_labels[:3]]
    
    return top_labels


In [24]:
text = 'Harvest Moon Plus2. SweetLegs is excited to introduce Harvest Moon SweetLegs to our 2019 leggings collection! Twirl into fall in this limited edition solid colour, perfect for boots and sweater weather! Harvest Moon SweetLegs shine when paired with brown over-the-knee boots, a white ruffle blouse, a royal blue knit cardigan, a wheat coloured panama hat, and a matching Harvest Moon SweetLegs Scrunchie .'
top_labels = predict_labels(text)
print(top_labels)

['Casual', 'Swimwear']


In [36]:
clothing_descriptions = [    
    "Cozy knit sweater in a rich shade of burgundy.",   
    "ASleek black leather jacket with silver hardware.",    
    "A flowy midi dress in a colorful floral print.",    
    "A classic white button-up shirt with a tailored fit.",    
    "A trendy pair of high-waisted wide leg pants.",    
    "A comfortable cotton t-shirt with a vintage graphic print.",    
    "A chic camel-colored trench coat for a timeless look.",    
    "A bold statement coat in a vibrant shade of cobalt blue.",    
    "A versatile denim jacket with distressed details.",    
    "A sophisticated silk blouse with a subtle print."
]
for description in clothing_descriptions:
    labels = predict_labels(description)
    print(f"Labels for '{description}': {labels}")


Labels for 'Cozy knit sweater in a rich shade of burgundy.': ['Casual', 'Swimwear']
Labels for 'ASleek black leather jacket with silver hardware.': ['Casual', 'Swimwear']
Labels for 'A flowy midi dress in a colorful floral print.': ['Casual', 'Swimwear']
Labels for 'A classic white button-up shirt with a tailored fit.': ['Casual', 'Swimwear']
Labels for 'A trendy pair of high-waisted wide leg pants.': ['Casual', 'Swimwear']
Labels for 'A comfortable cotton t-shirt with a vintage graphic print.': ['Casual', 'Swimwear']
Labels for 'A chic camel-colored trench coat for a timeless look.': ['Casual', 'Swimwear']
Labels for 'A bold statement coat in a vibrant shade of cobalt blue.': ['Casual', 'Swimwear']
Labels for 'A versatile denim jacket with distressed details.': ['Casual', 'Swimwear']
Labels for 'A sophisticated silk blouse with a subtle print.': ['Casual', 'Swimwear']
