In [2]:
from google.cloud import bigquery
# BigQuery client built from the service-account key file kept next to the notebook;
# every query in this notebook goes through this client.
client_bq = bigquery.Client.from_service_account_json("./credentials.json", project='charged-dialect-824')

# Loading data from BQ

In [3]:
def load_bq_data(_sql):
    """Run ``_sql`` through the notebook's BigQuery client and return the result as a DataFrame.

    Args:
        _sql: Query text passed unchanged to ``client_bq.query``.

    Returns:
        Whatever ``to_dataframe()`` yields for the finished query job.
    """
    query_job = client_bq.query(_sql)
    return query_job.to_dataframe()

In [4]:
# Query text: selects every column of the training table.
sql = """
SELECT *
FROM RicardoInterview.product_detection_training_data
"""

# Run the query; df_load is the frame all later cells read from.
df_load = load_bq_data(sql)
print("Number of entries in the dataset: ", len(df_load))
# Last expression of the cell, so the first rows are rendered as the cell output.
df_load.head()

Number of entries in the dataset:  37567


Unnamed: 0,articleId,title,subtitle,productType
0,-3393892867263864215,Gone Is Gone - Echolocation,Gone Is Gone - Echolocation,cd
1,-2831118468819601923,Ekseption-The Lost Last Concert Tapes...,The Lost Last Concert Tapes (Box-Set),cd
2,6835807414137785977,PORTISHEAD CD-Sammlung 3 CDs #16,Kollektion für Fans der Trip-Hop-Pioniere aus ...,cd
3,-4912458353746230865,Primeon CD-R Rohlinge (bunt),39 Stück auf der Spindel,cd
4,1496864991301777371,Adonia - Zmittst im Füür,Markus Hottiger / Markus Heusser,cd


# To be implemented

In [19]:
import os

import joblib
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
# Fetch the NLTK stop-word lists; the stopwords.words(...) calls further down read them.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
print("Firstly we are going to look at the number of possible class and their distribution")
# Number of rows per productType value.
df_count = df_load["productType"].value_counts()
# Calculate summary statistics of those per-class counts (rendered as the cell output)
df_count.describe()

Firstly we are going to look at the number of possible class and their distribution


count    383.000000
mean      98.086162
std        5.492059
min       48.000000
25%       99.000000
50%       99.000000
75%       99.000000
max       99.000000
Name: productType, dtype: float64

In [7]:
print("We are going to look at the length of the text we have to train our models")
# Character count of each text field, stored as length_<column> next to the raw columns
for text_col in ("title", "subtitle"):
    df_load["length_" + text_col] = df_load[text_col].apply(len)
# Rows with the longest titles first
df_load.sort_values(by="length_title", ascending=False).head()

We are going to look at the length of the text we have to train our models


Unnamed: 0,articleId,title,subtitle,productType,length_title,length_subtitle
27300,-5186182111564884197,Autotransporter / Universaltransporter TPV / B...,,trailer,60,0
8533,-6183040234950825017,Speditionsanhänger mit Hebebühne D'Hollandia m...,schöner Profi Anhänger,trailer,60,22
1211,4782662448248258064,5.5 KR-Seekreuzer Yachtwerft Stäheli CH-8595 A...,Einmalige Gelegenheit für Liebhaber klassische...,boat,60,62
8561,-6437718805549010031,"Kipper, 3-Seitenkipper, 3000kg, Pongratz 3-SKS...",Heckkipper Alukipper M.B. Stahlkipper Wenk MB ...,trailer,60,75
2175,-2063386134032828754,Anhänger für Schneemobil Anhänger Anhänger ...,Garag Jann Graf Trimmis,quad,60,23


As the titles and subtitles are very short and carry little information, we are going to look at different methods to encode our textual data. First we will do some light preprocessing on the data and then encode it as vectors that keep some of the relations between the words.


## Model 1: CountVectorizer
First, since the titles and subtitles seem to have few semantic relations and consist mostly of keywords and brand names in multiple languages, we will use CountVectorizer, which converts text into a numerical vector representation based on word frequencies. We will then use the result to train a Random Forest classifier and look at its accuracy on the test data.


In [8]:
# Model 1 features: bag-of-words counts over title + subtitle
# Stratified 70/30 split on productType with a fixed seed
text_columns = df_load[['title', 'subtitle']]
labels = df_load['productType']
X_train, X_test, y_train, y_test = train_test_split(
    text_columns,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42,
)

# One document per row: title and subtitle joined by a single space
X_train_combined = X_train['title'] + " " + X_train['subtitle']
X_test_combined = X_test['title'] + " " + X_test['subtitle']

# Vocabulary is learned from the training documents only, then reused for the test documents
count_vectorizer = CountVectorizer(stop_words="english")
X_train_count = count_vectorizer.fit_transform(X_train_combined)
X_test_count = count_vectorizer.transform(X_test_combined)

In [15]:
# Train a classifier model on the count-based features.
# random_state pins the forest's randomness so the accuracy printed below is
# reproducible after a kernel restart; n_jobs=-1 only parallelises the work and
# does not change the fitted model.
classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
classifier.fit(X_train_count, y_train)

# Make predictions on the test set
y_pred_countVect = classifier.predict(X_test_count)

# Evaluate the classifier performance
print(accuracy_score(y_test, y_pred_countVect))

0.6657794339455239


## Model 2: Word2Vec

To try to improve on the previous score, we will look at another method rather than using a CountVectorizer. For this we would like to create a Word2Vec model, which creates word embeddings that capture semantic relationships in a continuous vector space. From this word embedding we will get numerical features using TF-IDF, which weights the importance of a word in a text relative to its importance across all texts.

In [16]:
# Stop-word list used by the preprocessing below: NLTK's English and German lists merged
german_stopwords_set = set(stopwords.words('german'))
stopwords_set = set(stopwords.words('english')) | german_stopwords_set

def preprocess_text(text):
    """Lowercase ``text`` and drop every whitespace-separated token found in ``stopwords_set``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = [token for token in text.lower().split() if token not in stopwords_set]
    return ' '.join(kept_tokens)

# Apply the same cleaning to both splits
X_train_processed = X_train_combined.apply(preprocess_text)
X_test_processed = X_test_combined.apply(preprocess_text)

# Fit Word2Vec on the whitespace-tokenised training documents only
tokenized_text = list(map(str.split, X_train_processed))
word2vec_model = Word2Vec(
    tokenized_text,
    vector_size=100,
    window=5,
    min_count=1,
)

# Keep only the tokens that the trained Word2Vec model has in its vocabulary
def encode_text(text):
    """Return ``text`` with every token that is missing from ``word2vec_model.wv`` removed.

    The result is still a space-joined string of words, not a numeric vector.
    """
    vocabulary = word2vec_model.wv
    return " ".join(token for token in text.split() if token in vocabulary)

# Filter both splits down to in-vocabulary tokens (encode_text returns strings, not vectors).
X_train_word2vec = X_train_processed.apply(encode_text)
X_test_word2vec = X_test_processed.apply(encode_text)

# Convert the encoded features into numerical vectors using TF-IDF
# NOTE(review): up to this point only vocabulary membership in word2vec_model is used;
# the embedding vectors themselves never enter the TF-IDF matrices.
# The vectorizer is fitted on the training split and reused unchanged for the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf_word2vec = tfidf_vectorizer.fit_transform(X_train_word2vec)
X_test_tfidf_word2vec = tfidf_vectorizer.transform(X_test_word2vec)



In [12]:
# Train a separate classifier on the TF-IDF features.
# A fresh estimator is used instead of refitting `classifier`: that object is the
# Model 1 forest that the last cell saves together with count_vectorizer, and
# refitting it here would make the saved classifier expect TF-IDF features when
# the notebook is run top to bottom.
classifier_word2vec = RandomForestClassifier(random_state=42, n_jobs=-1)
classifier_word2vec.fit(X_train_tfidf_word2vec, y_train)

# Make predictions on the test set
y_pred_word2vec = classifier_word2vec.predict(X_test_tfidf_word2vec)

# Evaluate the classifier performance
print(accuracy_score(y_test, y_pred_word2vec))


0.6548664714754681


## Model 3: Using Distilled Bert Multi-lingual model

It seems that for this specific problem the semantics of the words do not improve our model; this could come from the fact that the descriptions are short and multilingual. We could thus improve on this solution with more preprocessing and by bringing the semantics of the different languages together, for example with models from spaCy, in a future task.


If more computational power and longer training time were available, we could use a multilingual pretrained model such as distilbert-base-multilingual-cased to encode the data and train the classifier. For this case the code would look as follows:

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# Load the pre-trained multilingual DistilBERT model and its tokenizer.
# The checkpoint is a DistilBERT one, so it is loaded through the Auto* classes,
# which pick the architecture from the checkpoint's own config, rather than
# through the BERT-specific classes.
model_name ='distilbert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # inference only

# Define batch size for processing in smaller batches
batch_size = 32


def embed_texts(texts, batch_size=batch_size):
    """Encode ``texts`` with the model and return one embedding per text.

    Args:
        texts: Iterable of strings (e.g. a pandas Series); consumed in iteration
            order, so the index labels of a shuffled Series play no role.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A list with one list of floats per input text: the hidden state at
        position 0 of each sequence.
    """
    documents = list(texts)
    embeddings = []
    for start in range(0, len(documents), batch_size):
        batch_texts = documents[start : start + batch_size]

        # The tokenizer pads the batch to its longest sequence and returns the
        # matching attention mask, so padded positions are masked in the model.
        batch_encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        # Forward pass without gradient tracking; output [0] is the per-token
        # hidden states, of which the first position of every sequence is kept.
        with torch.no_grad():
            batch_embeddings = model(**batch_encoded)[0][:, 0, :]
        embeddings.extend(batch_embeddings.tolist())
    return embeddings


# Encode both splits with the same routine
X_train_embeddings = embed_texts(X_train_combined)
X_test_embeddings = embed_texts(X_test_combined)

# Convert the encoded embeddings to tensors
X_train_tensors = torch.tensor(X_train_embeddings)
X_test_tensors = torch.tensor(X_test_embeddings)

# We can then use the same classifier for the prediction

# We can then use the same classifier for the prediction

In [23]:
# Persist Model 1 (CountVectorizer + random forest).
# open(..., 'wb') does not create missing directories, so make sure the target exists.
os.makedirs("models", exist_ok=True)

# The two artifacts are only usable as a pair: refuse to save a classifier that was
# fitted on a different number of features than the vectorizer produces.
if classifier.n_features_in_ != len(count_vectorizer.vocabulary_):
    raise ValueError(
        "classifier was fitted on %d features but count_vectorizer produces %d; "
        "refit the classifier on the CountVectorizer features before saving"
        % (classifier.n_features_in_, len(count_vectorizer.vocabulary_))
    )

# Save the classification model
with open('models/classifier_model_countVectorizer.joblib', 'wb') as file:
    joblib.dump(classifier, file, compress=('gzip', 3))

# Save the CountVectorizer model
with open("models/count_vectorizer.joblib", "wb") as file:
    joblib.dump(count_vectorizer, file)