# NER model project

In this project I tried to see how the products extracted from the websites can be clustered 

## Creating the model

In [1]:
import pandas as pd
import requests
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
# Load the CSV into a pandas DataFrame
df = pd.read_csv('furniture stores pages.csv')

In [3]:
# Load the NER model in spacy
nlp = spacy.load("en_core_web_sm")

In [4]:
texts = []
products = []
for website in df['website'][:100]:
    try:
        response = requests.get(website)
        text = response.text
        texts.append(text)
        
        doc = nlp(text)
        
        product_list = [ent.text for ent in doc.ents if ent.label_ == "PRODUCT"]
        products.append(", ".join(product_list))
    except:
        texts.append(None)
        products.append(None)

In [5]:
# Add the extracted text and products back to the DataFrame
df['text'] = pd.Series(texts)
df['products'] = pd.Series(products)

In [6]:
# Drop rows with missing values
df = df.dropna()

In [7]:
# Create a TfidfVectorizer
vectorizer = TfidfVectorizer()

In [8]:
# Fit the vectorizer on the products
X = vectorizer.fit_transform(df['products'])

In [9]:
# Train a KMeans clustering model on the TF-IDF features
kmeans = KMeans(n_clusters=5)
kmeans.fit(X)

KMeans(n_clusters=5)

In [10]:
# Assign each product to a cluster
df['cluster'] = kmeans.predict(X)

In [11]:
# Display the number of products in each cluster
print(df['cluster'].value_counts())

0    25
1    23
2    17
4    12
3     4
Name: cluster, dtype: int64


## New values

In [12]:
# Extract the products from some new, unseen pages (random unused websites from the excel)
new_texts = []
new_products = []
new_websites = ['https://decorum-shop.co.uk/products/gift-card-10-25-50-100','https://lostine.com/products/jack-chairs','https://designkollective.com/stores/woodlandcreek/products/76623']

for website in new_websites:
    try:
        response = requests.get(website)
        text = response.text
        new_texts.append(text)
        
        doc = nlp(text)
        
        product_list = [ent.text for ent in doc.ents if ent.label_ == "PRODUCT"]
        new_products.append(', '.join(product_list))
    except:
        new_texts.append(None)
        new_products.append(None)

In [13]:
# Create a new DataFrame with the extracted products from the new pages
new_df = pd.DataFrame({'website': new_websites, 'products': new_products})

In [14]:
# Drop rows with missing values
new_df = new_df.dropna()

In [15]:
# Transform the new products using the TfidfVectorizer
new_X = vectorizer.transform(new_df['products'])

In [16]:
# Assign each new product to a cluster
new_df['cluster'] = kmeans.predict(new_X)

In [17]:
# Display the number of new products in each cluster
print(new_df['cluster'].value_counts())

2    1
3    1
Name: cluster, dtype: int64
