In [1]:
# Mount Google Drive at /content/drive; every data path used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import important libraries

In [4]:
import pandas as pd

# Read link-category data

In [None]:
# Load the link table from Drive; the preview below shows a `links` column (product URLs) and a `labels` column (category).
links = pd.read_csv('/content/drive/MyDrive/Query-Search-LLM-Model/link_category.csv')

In [None]:
# Preview the first rows to confirm the file loaded with the expected columns.
links.head()

Unnamed: 0,links,labels
0,https://www.flannels.com/cp-company-stretch-sa...,trousers
1,https://www.flannels.com/cp-company-ergonomic-...,trousers
2,https://www.flannels.com/stone-island-raso-car...,trousers
3,https://www.flannels.com/cp-company-lens-joggi...,trousers
4,https://www.flannels.com/fear-of-god-essential...,trousers


# Label values

In [None]:
# Class-balance check: number of product links per category label.
links.labels.value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
shoes,4031
jackets_coats,3688
hoodies_sweatshirts,3194
trousers,2109
polo_shirts,1965
shirts,1835
jeans,1084
knitwear,918
leather_jackets,59


In [None]:
# Spot-check one full, untruncated product URL (the table preview above elides them).
links['links'][60]

'https://www.flannels.com/polo-ralph-lauren-cuffed-logo-tech-jogging-bottoms-482110#colcode=48211021'

In [None]:
# (rows, columns) of the link table; the row count is the number of pages to scrape.
links.shape

(18883, 2)

# Define the scraping insights class

In [3]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import requests
from PIL import Image
import os
import time
import threading
from io import BytesIO

In [None]:
class ScrappingInsights:
    """Scrape product details and product images for a table of categorised links.

    Parameters
    ----------
    links : pandas.DataFrame
        Table with a ``links`` column (product page URLs) and a ``labels``
        column (category name of each URL).
    image_save_dir : str
        Directory that downloaded product images are written to. It is
        created on construction if it does not exist yet.
    """

    # Browser-like User-Agent sent with every product-page request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    # Seconds before an HTTP request is abandoned; without a timeout a stalled
    # connection would block its worker thread (and the join on it) forever.
    REQUEST_TIMEOUT = 30
    # Column order of every frame produced by get_insights / threaded_processing.
    COLUMNS = ['links', 'title', 'brand', 'color', 'description', 'images']

    def __init__(self, links, image_save_dir="/content/drive/MyDrive/Query-Search-LLM-Model/images"):
        self.links = links
        self.image_save_dir = image_save_dir
        os.makedirs(image_save_dir, exist_ok=True)

    def get_soup(self, url):
        """Download ``url`` and return it parsed with ``html.parser``.

        Returns ``None`` (after printing the reason) when the request itself
        fails, so that one unreachable page does not abort a whole batch.
        """
        try:
            response = requests.get(url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Failed to fetch {url}: {exc}")
            return None
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _find_text(soup, tag, attrs, strip=True):
        """Return the text of the first matching element, or NaN if there is none."""
        element = soup.find(tag, attrs=attrs)
        if element is None:
            return np.nan
        return element.text.strip() if strip else element.text

    def get_title(self, soup):
        """Product name from ``span#lblProductName`` or NaN.

        Unlike the other getters the text is returned unstripped; this is kept
        so that titles stay identical to previously scraped data.
        """
        return self._find_text(soup, 'span', {'id': 'lblProductName'}, strip=False)

    def get_brand(self, soup):
        """Stripped brand name from ``span#lblProductBrand`` or NaN."""
        return self._find_text(soup, 'span', {'id': 'lblProductBrand'})

    def get_color(self, soup):
        """Stripped colour name from ``span#colourName`` or NaN."""
        return self._find_text(soup, 'span', {'id': 'colourName'})

    def get_description(self, soup):
        """Stripped text of the product description ``div`` or NaN."""
        return self._find_text(soup, 'div', {'class': 'infoPageDescription DisplayAttributesActive'})

    def get_image_matrices(self, soup, category, idx):
        """Download the product images of a page and return their saved paths.

        Each image is written to ``image_save_dir`` as
        ``{category}_{idx}_image_{i}.png`` where ``i`` is the position of the
        ``<img>`` tag inside the image container.

        Returns a list of saved file paths, or NaN when the page has no image
        container. Images that lack a ``src`` attribute, fail to download or
        cannot be decoded/saved are skipped with a printed message instead of
        raising.
        """
        container = soup.find('div', attrs={'class': 'innerImageContainer swiper-wrapper'})
        if container is None:
            return np.nan

        image_paths = []
        for i, img_tag in enumerate(container.find_all('img')):
            image_link = img_tag.get('src')
            if not image_link:
                # <img> without a usable src; indexing it would raise KeyError.
                continue

            image_path = os.path.join(self.image_save_dir, f"{category}_{idx}_image_{i}.png")
            try:
                response = requests.get(image_link, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                # Context manager releases the decoder's resources after saving.
                with Image.open(BytesIO(response.content)) as img:
                    img.save(image_path)
            except (requests.RequestException, OSError) as exc:
                # OSError also covers PIL's "cannot identify image" and save errors.
                print(f"Skipping image {image_link}: {exc}")
                continue
            image_paths.append(image_path)
        return image_paths

    def get_insights(self, category, rows):
        """Scrape every URL in ``rows`` and return one result row per URL.

        Parameters
        ----------
        category : str
            Category name, used only to name the saved image files.
        rows : pandas.DataFrame
            Slice of the link table; each row's index label is used as the
            ``idx`` part of its image file names.

        Returns
        -------
        pandas.DataFrame
            Columns ``COLUMNS``. Pages that could not be fetched still get a
            row, with every field except ``links`` set to NaN.
        """
        results = []
        for _, row in rows.iterrows():
            url = row['links']
            soup = self.get_soup(url)

            if soup is None:
                record = dict.fromkeys(self.COLUMNS, np.nan)
                record['links'] = url
            else:
                record = {
                    'links': url,
                    'title': self.get_title(soup),
                    'brand': self.get_brand(soup),
                    'color': self.get_color(soup),
                    'description': self.get_description(soup),
                    'images': self.get_image_matrices(soup, category, idx=row.name),
                }
            results.append(record)
        return pd.DataFrame(results, columns=self.COLUMNS)

    def threaded_processing(self, category, batch_size=10, max_threads=10):
        """Scrape all links of ``category`` using one thread per batch.

        Parameters
        ----------
        category : str
            Value of the ``labels`` column to select.
        batch_size : int
            Number of links handled by one thread.
        max_threads : int
            Number of threads started before waiting for all of them to finish.

        Returns
        -------
        pandas.DataFrame
            Concatenated batch results in the original link order. An empty
            frame with the ``COLUMNS`` columns is returned when the category
            has no links or no batch succeeded.

        Raises
        ------
        ValueError
            If ``batch_size`` or ``max_threads`` is smaller than 1.
        """
        if batch_size < 1 or max_threads < 1:
            raise ValueError("batch_size and max_threads must be >= 1")

        # Fresh 0..n-1 index: it becomes the per-category id in image file names.
        data = self.links[self.links['labels'] == category].reset_index(drop=True)
        num_batches = (data.shape[0] + batch_size - 1) // batch_size
        # Keyed by batch number so the output order does not depend on which
        # thread finishes first; each thread writes only its own key.
        results = {}

        def thread_function(batch, batch_num):
            results[batch_num] = self.get_insights(category, batch)
            print(f"Processed batch {batch_num + 1}/{num_batches}")

        threads = []
        for i in range(num_batches):
            batch = data.iloc[i * batch_size:(i + 1) * batch_size]

            thread = threading.Thread(target=thread_function, args=(batch, i))
            threads.append(thread)
            thread.start()

            # Wait for the current wave before starting more threads.
            if len(threads) >= max_threads or i == num_batches - 1:
                for running in threads:
                    running.join()
                threads = []

        missing = [i + 1 for i in range(num_batches) if i not in results]
        if missing:
            # A thread that raised leaves no entry; report it instead of
            # silently returning fewer rows.
            print(f"WARNING: {len(missing)} batch(es) produced no result: {missing}")

        if not results:
            return pd.DataFrame(columns=self.COLUMNS)
        return pd.concat([results[i] for i in sorted(results)], ignore_index=True)


# Create a ScrappingInsights scraper object

In [None]:
# Build the scraper over the full link table; the constructor also creates the image directory on Drive if it is missing.
scrapping = ScrappingInsights(links)

# knitwear data

In [None]:
# Scrape every link labelled 'knitwear'; a progress line is printed as each batch finishes.
knitwear_data = scrapping.threaded_processing('knitwear')

Processed batch 2/92
Processed batch 5/92
Processed batch 3/92
Processed batch 10/92
Processed batch 1/92
Processed batch 9/92
Processed batch 6/92
Processed batch 8/92
Processed batch 4/92
Processed batch 7/92
Processed batch 18/92
Processed batch 11/92
Processed batch 20/92
Processed batch 12/92
Processed batch 13/92
Processed batch 16/92
Processed batch 19/92
Processed batch 14/92
Processed batch 17/92
Processed batch 15/92
Processed batch 30/92
Processed batch 29/92
Processed batch 26/92
Processed batch 24/92
Processed batch 27/92
Processed batch 23/92
Processed batch 22/92
Processed batch 21/92
Processed batch 28/92
Processed batch 25/92
Processed batch 32/92
Processed batch 38/92
Processed batch 37/92
Processed batch 31/92
Processed batch 35/92
Processed batch 33/92
Processed batch 34/92
Processed batch 36/92
Processed batch 40/92
Processed batch 39/92
Processed batch 48/92
Processed batch 47/92
Processed batch 41/92
Processed batch 49/92
Processed batch 42/92
Processed batch 46/

In [None]:
# Persist the knitwear results to Drive as parquet (index not written) so the scrape need not be repeated.
knitwear_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/knitwear_data', index=False)

# shoes data

In [None]:
# Scrape every link labelled 'shoes'; a progress line is printed as each batch finishes.
shoes_data = scrapping.threaded_processing('shoes')

Processed batch 6/404
Processed batch 3/404
Processed batch 5/404
Processed batch 1/404
Processed batch 8/404
Processed batch 2/404
Processed batch 4/404
Processed batch 7/404
Processed batch 10/404
Processed batch 9/404
Processed batch 13/404
Processed batch 15/404
Processed batch 18/404
Processed batch 20/404
Processed batch 14/404
Processed batch 19/404
Processed batch 16/404
Processed batch 12/404
Processed batch 11/404
Processed batch 17/404
Processed batch 23/404
Processed batch 28/404
Processed batch 26/404
Processed batch 22/404
Processed batch 27/404
Processed batch 30/404
Processed batch 29/404
Processed batch 25/404
Processed batch 21/404
Processed batch 24/404
Processed batch 37/404
Processed batch 36/404
Processed batch 35/404
Processed batch 32/404
Processed batch 39/404
Processed batch 31/404
Processed batch 40/404
Processed batch 38/404
Processed batch 33/404
Processed batch 34/404
Processed batch 47/404
Processed batch 45/404
Processed batch 48/404
Processed batch 42/4

In [None]:
# Persist the shoes results to Drive as parquet (index not written) so the scrape need not be repeated.
shoes_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/shoes_data', index=False)

# jackets_coats data

In [None]:
# Scrape every link labelled 'jackets_coats'; a progress line is printed as each batch finishes.
jackets_coats_data = scrapping.threaded_processing('jackets_coats')

Processed batch 9/369
Processed batch 10/369
Processed batch 3/369
Processed batch 4/369
Processed batch 2/369
Processed batch 5/369
Processed batch 1/369
Processed batch 6/369
Processed batch 8/369
Processed batch 7/369
Processed batch 16/369
Processed batch 12/369
Processed batch 17/369
Processed batch 19/369
Processed batch 13/369
Processed batch 14/369
Processed batch 11/369
Processed batch 18/369
Processed batch 20/369
Processed batch 15/369
Processed batch 25/369
Processed batch 21/369
Processed batch 23/369
Processed batch 27/369
Processed batch 22/369
Processed batch 29/369
Processed batch 30/369
Processed batch 24/369
Processed batch 26/369
Processed batch 28/369
Processed batch 39/369
Processed batch 32/369
Processed batch 38/369
Processed batch 34/369
Processed batch 35/369
Processed batch 36/369
Processed batch 40/369
Processed batch 31/369
Processed batch 33/369
Processed batch 37/369
Processed batch 49/369
Processed batch 50/369
Processed batch 41/369
Processed batch 47/3

In [None]:
# Persist the jackets_coats results to Drive as parquet (index not written) so the scrape need not be repeated.
jackets_coats_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/jackets_coats_data', index=False)

# hoodies_sweatshirts data

In [None]:
# Scrape every link labelled 'hoodies_sweatshirts'; a progress line is printed as each batch finishes.
hoodies_sweatshirts_data = scrapping.threaded_processing('hoodies_sweatshirts')

Processed batch 6/320
Processed batch 2/320
Processed batch 4/320
Processed batch 5/320
Processed batch 7/320
Processed batch 10/320
Processed batch 1/320
Processed batch 8/320
Processed batch 9/320
Processed batch 3/320
Processed batch 18/320
Processed batch 20/320
Processed batch 15/320
Processed batch 11/320
Processed batch 12/320
Processed batch 17/320
Processed batch 14/320
Processed batch 16/320
Processed batch 13/320
Processed batch 19/320
Processed batch 23/320
Processed batch 24/320
Processed batch 27/320
Processed batch 21/320
Processed batch 30/320
Processed batch 22/320
Processed batch 28/320
Processed batch 25/320
Processed batch 29/320
Processed batch 26/320
Processed batch 34/320
Processed batch 37/320
Processed batch 31/320
Processed batch 40/320
Processed batch 33/320
Processed batch 38/320
Processed batch 36/320
Processed batch 39/320
Processed batch 32/320
Processed batch 35/320
Processed batch 43/320
Processed batch 45/320
Processed batch 42/320
Processed batch 44/3

In [None]:
# Persist the hoodies_sweatshirts results to Drive as parquet (index not written) so the scrape need not be repeated.
hoodies_sweatshirts_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/hoodies_sweatshirts_data', index=False)

# Trousers data

In [None]:
# Scrape every link labelled 'trousers'; a progress line is printed as each batch finishes.
trousers_data = scrapping.threaded_processing('trousers')

Processed batch 9/211
Processed batch 2/211
Processed batch 3/211
Processed batch 1/211
Processed batch 4/211
Processed batch 5/211
Processed batch 10/211
Processed batch 8/211
Processed batch 7/211
Processed batch 6/211
Processed batch 14/211
Processed batch 18/211
Processed batch 11/211
Processed batch 13/211
Processed batch 12/211
Processed batch 16/211
Processed batch 19/211
Processed batch 17/211
Processed batch 15/211
Processed batch 20/211
Processed batch 28/211
Processed batch 26/211
Processed batch 23/211
Processed batch 22/211
Processed batch 25/211
Processed batch 30/211
Processed batch 21/211
Processed batch 29/211
Processed batch 27/211
Processed batch 24/211
Processed batch 32/211
Processed batch 35/211
Processed batch 40/211
Processed batch 34/211
Processed batch 37/211
Processed batch 38/211
Processed batch 31/211
Processed batch 33/211
Processed batch 36/211
Processed batch 39/211
Processed batch 41/211
Processed batch 44/211
Processed batch 49/211
Processed batch 42/2

In [None]:
# Persist the trousers results to Drive as parquet (index not written) so the scrape need not be repeated.
trousers_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/trousers_data', index=False)

# polo_shirts data

In [None]:
# Scrape every link labelled 'polo_shirts'; a progress line is printed as each batch finishes.
polo_shirts_data = scrapping.threaded_processing('polo_shirts')

Processed batch 10/197
Processed batch 1/197
Processed batch 4/197
Processed batch 9/197
Processed batch 2/197
Processed batch 5/197
Processed batch 6/197
Processed batch 3/197
Processed batch 8/197
Processed batch 7/197
Processed batch 11/197
Processed batch 13/197
Processed batch 17/197
Processed batch 12/197
Processed batch 15/197
Processed batch 19/197
Processed batch 16/197
Processed batch 20/197
Processed batch 14/197
Processed batch 18/197
Processed batch 24/197
Processed batch 28/197
Processed batch 29/197
Processed batch 21/197
Processed batch 22/197
Processed batch 26/197
Processed batch 27/197
Processed batch 23/197
Processed batch 25/197
Processed batch 30/197
Processed batch 37/197
Processed batch 31/197
Processed batch 36/197
Processed batch 34/197
Processed batch 39/197
Processed batch 32/197
Processed batch 35/197
Processed batch 40/197
Processed batch 38/197
Processed batch 33/197
Processed batch 44/197
Processed batch 43/197
Processed batch 48/197
Processed batch 49/1

In [None]:
# Persist the polo_shirts results to Drive as parquet (index not written) so the scrape need not be repeated.
polo_shirts_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/polo_shirts_data', index=False)

# shirts data

In [None]:
# Scrape every link labelled 'shirts'; a progress line is printed as each batch finishes.
shirts_data = scrapping.threaded_processing('shirts')

Processed batch 1/184
Processed batch 2/184
Processed batch 7/184
Processed batch 9/184
Processed batch 6/184
Processed batch 3/184
Processed batch 4/184
Processed batch 8/184
Processed batch 10/184
Processed batch 5/184
Processed batch 19/184
Processed batch 12/184
Processed batch 18/184
Processed batch 17/184
Processed batch 13/184
Processed batch 11/184
Processed batch 16/184
Processed batch 14/184
Processed batch 20/184
Processed batch 15/184
Processed batch 27/184
Processed batch 25/184
Processed batch 24/184
Processed batch 28/184
Processed batch 30/184
Processed batch 23/184
Processed batch 21/184
Processed batch 22/184
Processed batch 29/184
Processed batch 26/184
Processed batch 37/184
Processed batch 38/184
Processed batch 33/184
Processed batch 40/184
Processed batch 34/184
Processed batch 35/184
Processed batch 31/184
Processed batch 32/184
Processed batch 36/184
Processed batch 39/184
Processed batch 50/184
Processed batch 45/184
Processed batch 48/184
Processed batch 42/1

In [None]:
# Persist the shirts results to Drive as parquet (index not written) so the scrape need not be repeated.
shirts_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/shirts_data', index=False)

# Jeans data

In [None]:
# Scrape every link labelled 'jeans'; a progress line is printed as each batch finishes.
jeans_data = scrapping.threaded_processing('jeans')

Processed batch 7/109
Processed batch 10/109
Processed batch 9/109
Processed batch 8/109
Processed batch 1/109
Processed batch 6/109
Processed batch 3/109
Processed batch 2/109
Processed batch 5/109
Processed batch 4/109
Processed batch 13/109
Processed batch 11/109
Processed batch 14/109
Processed batch 20/109
Processed batch 17/109
Processed batch 15/109
Processed batch 16/109
Processed batch 19/109
Processed batch 18/109
Processed batch 12/109
Processed batch 27/109
Processed batch 22/109
Processed batch 21/109
Processed batch 30/109
Processed batch 28/109
Processed batch 25/109
Processed batch 29/109
Processed batch 24/109
Processed batch 23/109
Processed batch 26/109
Processed batch 35/109
Processed batch 40/109
Processed batch 36/109
Processed batch 31/109
Processed batch 38/109
Processed batch 34/109
Processed batch 37/109
Processed batch 33/109
Processed batch 32/109
Processed batch 39/109
Processed batch 46/109
Processed batch 41/109
Processed batch 43/109
Processed batch 42/1

In [None]:
# Persist the jeans results to Drive as parquet (index not written) so the scrape need not be repeated.
jeans_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/jeans_data', index=False)

# Leather_jackets data

In [None]:
# Scrape every link labelled 'leather_jackets'; a progress line is printed as each batch finishes.
leather_jackets_data = scrapping.threaded_processing('leather_jackets')

Processed batch 6/6
Processed batch 2/6
Processed batch 5/6
Processed batch 1/6
Processed batch 3/6
Processed batch 4/6


In [None]:
# Persist the leather_jackets results to Drive as parquet (index not written) so the scrape need not be repeated.
leather_jackets_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/leather_jackets_data', index=False)

# Combine all category data into one dataset

In [5]:
# Reload each category's scraped results from the parquet files written above,
# so this section only needs pandas and the mounted Drive, not a fresh scrape.
knitwear_data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/knitwear_data')
shoes_data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/shoes_data')
jackets_coats_data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/jackets_coats_data')
hoodies_sweatshirts_data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/hoodies_sweatshirts_data')
trousers_data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/trousers_data')
polo_shirts_data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/polo_shirts_data')
shirts_data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/shirts_data')
jeans_data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/jeans_data')
leather_jackets_data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/leather_jackets_data')

In [9]:
# Tag every row with its category before the frames are merged; the scraped
# frames do not carry the label themselves. This adds (or overwrites) a
# `category` column in place on each frame.
knitwear_data['category'] = 'knitwear'
shoes_data['category'] = 'shoes'
jackets_coats_data['category'] = 'jackets_coats'
hoodies_sweatshirts_data['category'] = 'hoodies_sweatshirts'
trousers_data['category'] = 'trousers'
polo_shirts_data['category'] = 'polo_shirts'
shirts_data['category'] = 'shirts'
jeans_data['category'] = 'jeans'
leather_jackets_data['category'] = 'leather_jackets'

In [11]:
# Stack the nine per-category frames into one table; ignore_index=True gives the result a fresh 0..n-1 index.
final_data = pd.concat([knitwear_data, shoes_data, jackets_coats_data, hoodies_sweatshirts_data, trousers_data, polo_shirts_data, shirts_data, jeans_data, leather_jackets_data], ignore_index=True)

In [12]:
# Sanity check: the row count should equal the number of rows in the link table, i.e. no scraped page was lost.
final_data.shape

(18883, 7)

In [13]:
# Write the combined table to Drive as a single parquet file (index not written).
final_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/all_data', index=False)