In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# 1. TF-IDF

**Algorithm**: TF-IDF is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus).

TF-IDF has a number of applications. It can be used as a weighting factor for:

**Information retrieval**: Variations of TF-IDF are used as a weighting factor by search engines to help understand the relevance of a page to a user’s search query

**Text mining**: TF-IDF can help quantify what a document is about, which is a central question in text mining

**User modeling**: Another application of TF-IDF involves assisting in the creation of models for user behavior and interests, which can then be used by product and content recommendation engines


# Text-Mining : TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
document1 = "It is going to rain today."
document2 = "Today I am not going outside."
document3 = "I am going to watch the season premiere."
documents = [document1, document2, document3]

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Get feature names of final words that we will use to tag documents
analyze_text = tfidf_vectorizer.build_analyzer()
print(f"Tokenized words in Document 1: {analyze_text(document1)}")

# Vectorize the documents and find feature names
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print(f"\nTF-IDF Matrix representation of documents:\n{tfidf_matrix.toarray()}")

# Get vocabulary (word indexes)
word_indexes = tfidf_vectorizer.vocabulary_
print(f"\nWord indexes: {word_indexes}")

# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\nFeature names (terms): {feature_names}")


Tokenized words in Document 1: ['it', 'is', 'going', 'to', 'rain', 'today']

TF-IDF Matrix representation of documents:
[[0.         0.27824521 0.4711101  0.4711101  0.         0.
  0.         0.4711101  0.         0.         0.35829137 0.35829137
  0.        ]
 [0.40619178 0.31544415 0.         0.         0.53409337 0.53409337
  0.         0.         0.         0.         0.         0.40619178
  0.        ]
 [0.32412354 0.25171084 0.         0.         0.         0.
  0.4261835  0.         0.4261835  0.4261835  0.32412354 0.
  0.4261835 ]]

Word indexes: {'it': 3, 'is': 2, 'going': 1, 'to': 10, 'rain': 7, 'today': 11, 'am': 0, 'not': 4, 'outside': 5, 'watch': 12, 'the': 9, 'season': 8, 'premiere': 6}

Feature names (terms): ['am' 'going' 'is' 'it' 'not' 'outside' 'premiere' 'rain' 'season' 'the'
 'to' 'today' 'watch']


# Information Retrieval : TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Web pages
documents = [
    "TF-IDF is a numerical statistic used to reflect the importance of a word to a document in a collection or corpus.",
    "Search engines often use TF-IDF to determine the relevance of a document to a search query.",
    "The TF-IDF value increases proportionally to the number of times a word appears in the document and is offset by the frequency of the word in the corpus.",
    "In information retrieval, TF-IDF is a common technique used to weigh the importance of words in a document.",
    "A higher TF-IDF score indicates that a word is more important to a document compared to other words.",
]

# User's search query
query = "information retrieval TF-IDF relevance"

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer to the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Transform the search query into TF-IDF representation
query_tfidf = tfidf_vectorizer.transform([query])

# Calculate cosine similarity between the query and documents
cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

# Sort documents by relevance
sorted_indices = cosine_similarities.argsort()[::-1]

# Print the most relevant documents
print("Search results:")
for i, idx in enumerate(sorted_indices):
    print(f"Document {i + 1}: Similarity Score: {round(cosine_similarities[idx],2)}, Content: {documents[idx]}")


Search results:
Document 1: Similarity Score: 0.41, Content: In information retrieval, TF-IDF is a common technique used to weigh the importance of words in a document.
Document 2: Similarity Score: 0.22, Content: Search engines often use TF-IDF to determine the relevance of a document to a search query.
Document 3: Similarity Score: 0.08, Content: TF-IDF is a numerical statistic used to reflect the importance of a word to a document in a collection or corpus.
Document 4: Similarity Score: 0.07, Content: A higher TF-IDF score indicates that a word is more important to a document compared to other words.
Document 5: Similarity Score: 0.05, Content: The TF-IDF value increases proportionally to the number of times a word appears in the document and is offset by the frequency of the word in the corpus.


#GloVe (Global Vectors for Word Representation)


**Algorithm**: GloVe is an unsupervised learning algorithm for obtaining vector representations for words. It combines the global statistics of word co-occurrences in a corpus with a neural network model.

https://nlp.stanford.edu/projects/glove/

    Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download): glove.6B.zip
    Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): glove.42B.300d.zip
    Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): glove.840B.300d.zip
    Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB download): glove.twitter.27B.zip

**NER**: GloVe embeddings can be used as features in NER systems to identify and classify named entities within text documents. By leveraging the contextual information encoded in GloVe embeddings, NER models can achieve improved accuracy.

**Text Generation**: In text generation tasks, such as language modeling and dialogue generation, GloVe embeddings can be used as input features to generate coherent and contextually relevant text. By leveraging the semantic relationships captured in GloVe embeddings, text generation models can produce more meaningful and fluent output.

**Semantic Similarity**: GloVe embeddings enable computing semantic similarity between words, phrases, or sentences. This is useful in tasks such as duplicate detection, paraphrase identification, and question answering, where understanding the semantic similarity between text units is crucial.

**Recommendation Systems**: GloVe embeddings can be used to represent textual content in recommendation systems, such as content-based filtering. By capturing the semantic meaning of items and user preferences, GloVe embeddings can help improve the accuracy and relevance of recommendations.



# NER : GloVe

In [None]:
import spacy

# Load the spaCy English model with GloVe embeddings
nlp = spacy.load("en_core_web_md")

# Sample text with named entities
text = "Apple Inc. is an American multinational technology company headquartered in Cupertino, California. " \
       "It designs, manufactures, and sells consumer electronics, computer software, and online services."

# Process the text using spaCy
doc = nlp(text)

# Print named entities and their labels
print("Named Entities:")
for ent in doc.ents:
    print(ent.text, "-", ent.label_)


Named Entities:
Apple Inc. - ORG
American - NORP
Cupertino - GPE
California - GPE


In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
path="/content/drive/MyDrive/NLP/glove.6B.100d.txt"

# Paraphrase Detection Semantic Similarity : GloVe


In [None]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from spacy.matcher import Matcher

# Load SpaCy with GloVe embeddings
nlp = spacy.load("en_core_web_md")
matcher = Matcher(nlp.vocab)

patterns = [
    [{'POS':'ADJ'}, {'POS':'NOUN'}],
    ]
matcher.add("demo", patterns)


# Function to tokenize sentence into phrases
def extract_phrases(sentence):
    doc = nlp(sentence)
    matches = matcher(doc)
    phrases = []
    for match_id,start,end in matches:
      span = doc[start:end]
      phrases.append(span.text)
    phrases.extend(chunk.text for chunk in doc.noun_chunks)
    print(phrases,type(phrases))
    return phrases

# Function to calculate GloVe embedding for a phrase
def phrase_embedding(phrase):
    return np.mean([token.vector for token in nlp(phrase)], axis=0)

# Function to find most similar paraphrase pair
def find_paraphrase(sentence_list):
    max_similarity = 0
    paraphrase_similarity = dict()

    for i in range(len(sentence_list)):
        for j in range(i+1, len(sentence_list)):
            phrases1 = extract_phrases(sentence_list[i])
            phrases2 = extract_phrases(sentence_list[j])

            for phrase1 in phrases1:
                for phrase2 in phrases2:
                    emb1 = phrase_embedding(phrase1)
                    emb2 = phrase_embedding(phrase2)
                    similarity = cosine_similarity([emb1], [emb2])[0][0]

                    paraphrase_pair = (phrase1, phrase2)
                    paraphrase_similarity[paraphrase_pair] = similarity

    paraphrase_similarity = dict(sorted(paraphrase_similarity.items(),
                                        key=lambda item: item[1],reverse=True))
    return paraphrase_similarity

# Example sentences
sentences = [
    "The cat sat on the mat and licked its paws.",
    "A dog chased the ball and barked loudly.",
    "The sun set behind the mountains, casting a golden glow."
]

# Find paraphrase pair and similarity
paraphrase_similarity = find_paraphrase(sentences)

# Print results
# print("Sentences:")
# for i, sentence in enumerate(sentences):
#     print(f"{i+1}. {sentence}")
i = 0
if len(paraphrase_similarity):
    for key,val in paraphrase_similarity.items():
      print(f"Paraphrse : {key} Similarity : {val}")
      i+=1
      if i>5: break
else:
    print("\nNo paraphrase pair found.")


['The cat', 'the mat', 'its paws'] <class 'list'>
['A dog', 'the ball'] <class 'list'>
['The cat', 'the mat', 'its paws'] <class 'list'>
['golden glow', 'The sun', 'the mountains', 'a golden glow'] <class 'list'>
['A dog', 'the ball'] <class 'list'>
['golden glow', 'The sun', 'the mountains', 'a golden glow'] <class 'list'>
Paraphrse : ('the mat', 'the mountains') Similarity : 0.7576532363891602
Paraphrse : ('the mat', 'the ball') Similarity : 0.7441527843475342
Paraphrse : ('the ball', 'the mountains') Similarity : 0.7378790378570557
Paraphrse : ('The cat', 'A dog') Similarity : 0.5905861854553223
Paraphrse : ('The cat', 'The sun') Similarity : 0.5646107196807861
Paraphrse : ('its paws', 'the mountains') Similarity : 0.56061851978302


#Document Clustering : GloVe

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Function to load GloVe embeddings
glove_embeddings = {}
with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            glove_embeddings[word] = coefs

# Function to calculate document embeddings
def document_embedding(doc, embeddings_index):
    words = doc.split()
    word_embeddings = [embeddings_index.get(word, np.zeros(embeddings_index['a'].shape)) for word in words]
    doc_embedding = np.mean(word_embeddings, axis=0)
    return doc_embedding


# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Calculate document embeddings
document_vectors = [document_embedding(doc, glove_embeddings) for doc in documents]

# Perform K-means clustering
num_clusters = 2  # Number of clusters
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(document_vectors)
clusters = kmeans.labels_

# Print clusters
for i in range(num_clusters):
    cluster_docs = [documents[j] for j in range(len(documents)) if clusters[j] == i]
    print(f'Cluster {i+1}:')
    for doc in cluster_docs:
        print(doc)
    print()

# Optionally, print documents in each cluster
for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")
    for i, doc in enumerate(documents):
        if clusters[i] == cluster_id:
            print(doc)

# Reduce dimensionality for visualization
pca = PCA(n_components=2)  # 2 principal components for 2D visualization
document_embeddings_2d = pca.fit_transform(document_vectors)

# Plot document clusters
plt.figure(figsize=(4,3))
for i in range(num_clusters):
    cluster_docs = np.array([document_embeddings_2d[j] for j in range(len(documents)) if clusters[j] == i])
    plt.scatter(cluster_docs[:, 0], cluster_docs[:, 1], label=f'Cluster {i+1}')
plt.title('Document Clusters (2D PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()


# Debiasing Word Vectors : GloVe

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def read_glove_vecs(path):
  words = set()
  word_to_vec_map = {}
  with open(path,'r', encoding='utf-8') as f:
    for line in f:
      values = line.split()
      word = values[0]
      coefs = np.asarray(values[1:], dtype='float32')
      words.add(word)
      word_to_vec_map[word] = coefs

  i = 1
  words_to_index = {}
  index_to_words = {}
  for word in words:
    words_to_index[word] = i
    index_to_words[i] = word

  return words,word_to_vec_map,words_to_index, index_to_words,

words, word_to_vec_map,word_to_index, index_to_word = read_glove_vecs(path)

Female first names tend to have a positive cosine similarity with our constructed vector
, while male first names tend to have a negative cosine similarity.

We see “computer” is negative and is closer in value to male first names, while “literature” is positive and is closer to female first names.

In [None]:
g = word_to_vec_map['woman'] - word_to_vec_map['man']
print(f"g shape : {g.shape}")

print ('List of names and their similarities with constructed vector:')

# girls and boys name
name_list = ['john', 'marie', 'sophie', 'ronaldo', 'priya', 'rahul', 'danielle', 'reza', 'katy', 'yasmin']

for w in name_list:
    # Reshape vectors to be 2D arrays with a single row
    similarity = cosine_similarity(word_to_vec_map[w].reshape(1, -1), g.reshape(1, -1))[0][0]
    print(w, similarity)

print("="*300)
print ('List of names and their similarities with constructed vector:')

# girls and boys name
name_list = ['lipstick', 'guns', 'science', 'arts', 'literature', 'warrior','doctor', 'tree', 'receptionist',
             'technology',  'fashion', 'teacher', 'engineer', 'pilot', 'computer', 'singer']
for w in name_list:
    # Reshape vectors to be 2D arrays with a single row
    similarity = cosine_similarity(word_to_vec_map[w].reshape(1, -1), g.reshape(1, -1))[0][0]
    print(w, similarity)

g shape : (100,)
List of names and their similarities with constructed vector:
john -0.22835018
marie 0.2453734
sophie 0.20268358
ronaldo -0.3328965
priya 0.13922855
rahul -0.06390728
danielle 0.14913149
reza -0.081926554
katy 0.18688588
yasmin 0.21136825
List of names and their similarities with constructed vector:
lipstick 0.18037248
guns -0.099644475
science -0.021475762
arts 0.014846751
literature 0.08261855
warrior -0.15634203
doctor 0.10942282
tree -0.0886836
receptionist 0.2806876
technology -0.14474528
fashion 0.080974355
teacher 0.15233697
engineer -0.123000115
pilot -0.041133933
computer -0.115457155
singer 0.113726415



Given an input embedding $e$, you can use the following formulas to compute $e^{debiased}$:

$$e^{bias\_component} = \frac{e \cdot g}{||g||_2^2} * g\tag{2}$$
$$e^{debiased} = e - e^{bias\_component}\tag{3}$$

recognize $e^{bias\_component}$ is the projection of $e$ onto the direction $g$ ensuring that gender neutral words are zero in the gender subspace.

In [None]:
def neutralize(word, g, word_to_vec_map):
    e = word_to_vec_map[word]
    e_biascomponent = np.dot(e, g) / np.square(np.linalg.norm(g, ord=2)) * g
    e_debiased = e - e_biascomponent
    return e_debiased

e = "receptionist"
print("cosine similarity between " + e + " and g, before neutralizing: ", cosine_similarity(word_to_vec_map["receptionist"].reshape(1,-1), g.reshape(1,-1)))

e_debiased = neutralize("receptionist", g, word_to_vec_map)
print("cosine similarity between " + e + " and g, after neutralizing: ", cosine_similarity(e_debiased.reshape(1,-1), g.reshape(1,-1)))

cosine similarity between receptionist and g, before neutralizing:  [[0.2806876]]
cosine similarity between receptionist and g, after neutralizing:  [[-8.1490725e-09]]



The derivation of the linear algebra to do this is a bit more complex. (See Bolukbasi et al., 2016 in the References for details.) Here are the key equations:


$$ \mu = \frac{e_{w1} + e_{w2}}{2}\tag{4}$$

$$ \mu_{B} = \frac {\mu \cdot \text{bias_axis}}{||\text{bias_axis}||_2^2} *\text{bias_axis}
\tag{5}$$

$$\mu_{\perp} = \mu - \mu_{B} \tag{6}$$

$$ e_{w1B} = \frac {e_{w1} \cdot \text{bias_axis}}{||\text{bias_axis}||_2^2} *\text{bias_axis}
\tag{7}$$
$$ e_{w2B} = \frac {e_{w2} \cdot \text{bias_axis}}{||\text{bias_axis}||_2^2} *\text{bias_axis}
\tag{8}$$


$$e_{w1B}^{corrected} = \sqrt{ |{1 - ||\mu_{\perp} ||^2_2} |} * \frac{e_{\text{w1B}} - \mu_B} {||(e_{w1} - \mu_{\perp}) - \mu_B||_2} \tag{9}$$


$$e_{w2B}^{corrected} = \sqrt{ |{1 - ||\mu_{\perp} ||^2_2} |} * \frac{e_{\text{w2B}} - \mu_B} {||(e_{w2} - \mu_{\perp}) - \mu_B||_2} \tag{10}$$

$$e_1 = e_{w1B}^{corrected} + \mu_{\perp} \tag{11}$$
$$e_2 = e_{w2B}^{corrected} + \mu_{\perp} \tag{12}$$


In [None]:
def equalize(pair, bias_axis, word_to_vec_map):
    w1, w2 = pair
    e_w1, e_w2 = [word_to_vec_map[w] for w in [w1, w2]]
    mu = np.mean(np.concatenate((e_w1, e_w2), axis=0))
    mu_B = np.dot(mu, bias_axis) / np.square(np.linalg.norm(bias_axis, ord=2)) * bias_axis
    mu_orth = mu - mu_B
    e_w1B = np.dot(e_w1, bias_axis) / np.square(np.linalg.norm(bias_axis, ord=2)) * bias_axis
    e_w2B = np.dot(e_w2, bias_axis) / np.square(np.linalg.norm(bias_axis, ord=2)) * bias_axis
    corrected_e_w1B = np.sqrt(np.abs(1 - np.square(np.linalg.norm(mu_orth)))) * (e_w1B - mu_B) / np.linalg.norm(e_w1 - mu_orth - mu_B)
    corrected_e_w2B = np.sqrt(np.abs(1 - np.square(np.linalg.norm(mu_orth)))) * (e_w2B - mu_B) / np.linalg.norm(e_w2 - mu_orth - mu_B)
    e1 = corrected_e_w1B + mu_orth
    e2 = corrected_e_w2B + mu_orth
    return e1, e2

print("cosine similarities before equalizing:")
print("cosine_similarity(word_to_vec_map[\"man\"], gender) = ", cosine_similarity(word_to_vec_map["man"].reshape(1,-1), g.reshape(1,-1)))
print("cosine_similarity(word_to_vec_map[\"woman\"], gender) = ", cosine_similarity(word_to_vec_map["woman"].reshape(1,-1), g.reshape(1,-1)))
print("="*300)
e1, e2 = equalize(("man", "woman"), g, word_to_vec_map)
print("cosine similarities after equalizing:")
print("cosine_similarity(e1, gender) = ", cosine_similarity(e1.reshape(1,-1), g.reshape(1,-1)))
print("cosine_similarity(e2, gender) = ", cosine_similarity(e2.reshape(1,-1), g.reshape(1,-1)))

cosine similarities before equalizing:
cosine_similarity(word_to_vec_map["man"], gender) =  [[-0.18769068]]
cosine_similarity(word_to_vec_map["woman"], gender) =  [[0.388177]]
cosine similarities after equalizing:
cosine_similarity(e1, gender) =  [[-0.13731493]]
cosine_similarity(e2, gender) =  [[0.55300444]]


# Question-Answer : GloVe

# FastText Embeddings

**Document Clustering**: FastText embeddings facilitate document clustering tasks by capturing the semantic similarity between documents. This enables grouping similar documents together, which is useful in various applications such as organizing news articles, customer reviews, or academic papers.

**Machine Translation**: FastText embeddings are employed in machine translation systems to improve translation quality, especially for translating languages with complex morphology or a high degree of inflection. By capturing subword information, FastText embeddings help handle rare or unseen words effectively.

**Recommendation Systems**: FastText embeddings can be utilized in recommendation systems to model user preferences and item similarities. They enable the system to understand the semantics of user-item interactions, leading to more personalized and accurate recommendations.


In [6]:
!pip install -q fasttext

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/68.8 kB[0m [31m939.4 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m933.3 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone


In [3]:
import pandas as pd

df= pd.read_csv("/content/drive/MyDrive/NLP/ecommerce_dataset.csv", names=["category", "description"], header=None)
df.dropna(inplace=True)
df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)
print(df.shape)
df.head(5)

(50424, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


When you train a fasttext model, it expects labels to be specified with label prefix. We will just create a third column in the dataframe that has label as well as the product description

In [4]:
df['category'] = '__label__' + df['category'].astype(str)
df.head(5)

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [5]:
df['category_description'] = df['category'] + ' ' + df['description']
df.head(3)

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...


In [8]:
import re

def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text) # remove punctuation,special character
    text = re.sub(' +', ' ', text) # remove extra white space
    text = re.sub(r'[ \n]+', ' ', text) # remove newline - each line should represent each document(fasttext format)
    return text.strip().lower()

df['category_description'] = df['category_description'].map(preprocess)

text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . \nhi"
print(preprocess(text))

viki's bookcase bookshelf 3 shelf shelve white hi


In [9]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)
train.shape, test.shape

((40339, 3), (10085, 3))

In [10]:
train.to_csv("ecommerce.train", columns=["category_description"], index=False, header=False)
test.to_csv("ecommerce.test", columns=["category_description"], index=False, header=False)

In [11]:
import fasttext

In [13]:
model = fasttext.train_supervised(input="ecommerce.train")


In [14]:
model.test("ecommerce.test")

(10085, 0.9696579077838374, 0.9696579077838374)

In [15]:
print(model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3"))
print(model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric"))
print(model.predict("think and grow rich deluxe edition"))

(('__label__electronics',), array([0.98976058]))
(('__label__clothing_accessories',), array([1.00001001]))
(('__label__books',), array([1.00000978]))


In [16]:
model.get_nearest_neighbors("painting")

[(0.9985939860343933, '30x30cm'),
 (0.9985939860343933, 'chillum'),
 (0.998585045337677, 'darkpyros'),
 (0.998585045337677, 'primium'),
 (0.9985846281051636, 'hpk'),
 (0.9985840320587158, 'phm'),
 (0.9985836744308472, 'castor'),
 (0.9985835552215576, 'seemingly'),
 (0.998580276966095, "griiham's"),
 (0.9985737204551697, 'exhibition')]

In [17]:
model.get_nearest_neighbors("apple")

[(0.9971563816070557, 'ecommerce'),
 (0.9971433281898499, '50x'),
 (0.9968802332878113, 'luscious'),
 (0.9965807199478149, 'acceleration'),
 (0.9965444207191467, 'eachbid'),
 (0.9962717890739441, 'fw'),
 (0.9962035417556763, 'lxoice'),
 (0.9960581064224243, 'bazar'),
 (0.9956415891647339, 'invented'),
 (0.9956316351890564, 'replacements')]

In [None]:
# unsupervised training

# df.to_csv("food_receipes.txt", columns=["TranslatedInstructions"], header=None, index=False)
# model = fasttext.train_unsupervised("food_receipes.txt")

In [None]:
#