In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import important libraries

In [4]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/all_data')

In [4]:
data.head()

Unnamed: 0,links,title,brand,color,description,images,category
0,https://www.flannels.com/ami-paris-ami-de-coeu...,Ami De Coeur Wool Jumper,AMI PARIS,BLUE 4843,Add a touch of Parisian sophistication to your...,[/content/drive/MyDrive/Query-Search-LLM-Model...,knitwear
1,https://www.flannels.com/polo-ralph-lauren-pol...,Polo Knit Cardigan Sn41,Polo Ralph Lauren,Navy,,[/content/drive/MyDrive/Query-Search-LLM-Model...,knitwear
2,https://www.flannels.com/boss-boss-bono-l-1024...,Boss Bono-L 10243945 01,Boss,Black 001,,[/content/drive/MyDrive/Query-Search-LLM-Model...,knitwear
3,https://www.flannels.com/boss-ritom-half-zip-k...,Ritom Half Zip Knit Top,Boss,Blue 402,Boss Ritom Half Zip Knit Top The Boss Ritom Ha...,[/content/drive/MyDrive/Query-Search-LLM-Model...,knitwear
4,https://www.flannels.com/stone-island-ghost-cr...,Ghost Crew Knit Sweatshirt,STONE ISLAND,Nero V0029,Add an on-trend element to your everyday essen...,[/content/drive/MyDrive/Query-Search-LLM-Model...,knitwear


# Class for data cleaning

In [None]:
class DataCleaning:
    def __init__(self, data):
        self.data = data
        self.indices_1 = data[data.description == ''].index
        self.indices_2 = data[data.images.isna()].index
        self.indices_3 = data[data.description.str.len() < 100].index

    def clean_data(self):
        """Deleteing rows which do not have description or images and which has description less than 100 letter."""
        indices = self.indices_1.union(self.indices_2)
        indices = indices.union(self.indices_3)
        self.data = self.data.drop(indices)
        return self.data

    def incorporate(self):
        self.data['Description'] = self.data.apply(
            lambda row: f"Title of the product is {row.title}. Brand of the product is {row.brand}. Color of the product is {row.color}. Category of the product is {row.category}. Description of the product is: {row.description}.",
            axis=1
        )
        return self.data
    def get_clean_data(self):
        self.data = self.clean_data()
        self.data = self.incorporate()
        return self.data[['links', 'Description', 'images']]

In [None]:
cleaning = DataCleaning(data)

In [None]:
data = cleaning.get_clean_data()

In [None]:
data['Description'][19]

'Title of the product is Half Zip Sweater With Buttons. Brand of the product is STONE ISLAND. Color of the product is Noce V0075. Category of the product is knitwear. Description of the product is: Step out in style with the Half Zip Sweater with Buttons by Stone Island, the perfect choice for both days in the city and drinks with friends. Effortlessly pairing with any ensemble or wardrobe staples, this classic jumper exudes timeless style while keeping you warm and cosy in colder weather thanks to its lambswool construction. Product Highlights: - Half zip fastening with button placket: allows you to seamlessly adjust the style in seconds. - Stone Island badge: proudly adorns the left sleeve for a stamp of iconic style and luxury exclusivity. - Composition: 80% wool, 20% polyamide. - Care: hand wash..'

In [None]:
data['images'][19]

array(['/content/drive/MyDrive/Query-Search-LLM-Model/images/knitwear_49_image_0.png',
       '/content/drive/MyDrive/Query-Search-LLM-Model/images/knitwear_49_image_1.png',
       '/content/drive/MyDrive/Query-Search-LLM-Model/images/knitwear_49_image_2.png',
       '/content/drive/MyDrive/Query-Search-LLM-Model/images/knitwear_49_image_3.png'],
      dtype=object)

In [None]:
data.columns

Index(['links', 'Description', 'images'], dtype='object')

# Save data

In [None]:
data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/clean_data')

# In this data Queries are created base on description and main aim is to separate queries and images to create individual rows.

In [6]:
data = pd.read_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/clean_data')

In [7]:
data

Unnamed: 0,links,Description,images,Queries
0,https://www.flannels.com/ami-paris-ami-de-coeu...,Title of the product is Ami De Coeur Wool Jump...,[/content/drive/MyDrive/Query-Search-LLM-Model...,"Ami De Coeur Wool Jumper in blue, looking for ..."
1,https://www.flannels.com/boss-ritom-half-zip-k...,Title of the product is Ritom Half Zip Knit To...,[/content/drive/MyDrive/Query-Search-LLM-Model...,"Looking for a comfortable blue knit top, maybe..."
2,https://www.flannels.com/stone-island-ghost-cr...,Title of the product is Ghost Crew Knit Sweats...,[/content/drive/MyDrive/Query-Search-LLM-Model...,I'm looking for a stylish black sweatshirt for...
3,https://www.flannels.com/stone-island-lambswoo...,Title of the product is Lambswool Crew Neck Kn...,[/content/drive/MyDrive/Query-Search-LLM-Model...,I'm looking for a classic crew neck jumper in ...
4,https://www.flannels.com/boss-hugo-boss-pascas...,Title of the product is Hugo Boss Pascas Knit ...,[/content/drive/MyDrive/Query-Search-LLM-Model...,Looking for a black Hugo Boss jumper with a su...
...,...,...,...,...
11696,https://www.flannels.com/valentino-hooded-leat...,Title of the product is Hooded Leather Jacket....,[/content/drive/MyDrive/Query-Search-LLM-Model...,Looking for a black Valentino leather jacket w...
11697,https://www.flannels.com/ami-paris-leather-jac...,Title of the product is Leather Jacket. Brand ...,[/content/drive/MyDrive/Query-Search-LLM-Model...,"Looking for a black leather AMI PARIS jacket, ..."
11698,https://www.flannels.com/gucci-gucci-wool-lthr...,Title of the product is Gucci Wool Lthr Jkt Sn...,[/content/drive/MyDrive/Query-Search-LLM-Model...,"Gucci camel suede bomber jacket, wool details$..."
11699,https://www.flannels.com/alexander-mcqueen-lea...,Title of the product is Leather Biker Jacket. ...,[/content/drive/MyDrive/Query-Search-LLM-Model...,Looking for a black Alexander McQueen leather ...


In [8]:
import re

# Class which will take one row consisting 5 queries and images and converi into 5 different rows.

In [82]:
class CreateIndividualRows:
    def __init__(self, data):
        self.data = data
        self.processed_data = pd.DataFrame(columns=['links', 'Queries', 'images'])

    def process_individual_row(self, row):
        Queries = row.Queries.replace('\n', '').strip()
        if Queries.count('$') > 4:
          index = Queries.rindex('$')
          Queries = Queries[:index]

        Queries = re.sub(r'[^a-zA-Z0-9$!? \-]', '', Queries)

        new_Queries = Queries.split('$')
        if len(new_Queries) > 5:
          new_Queries = new_Queries[:5]

        images = row.images
        links = row.links
        new_links = [links] * 5

        if len(images) < 5 or len(images) > 5:
            new_images = np.random.choice(images, 5, replace=True)
        else:
            new_images = images

        if len(new_Queries) < 5:
            new_Queries = new_Queries + list(np.random.choice(new_Queries, 5 - len(new_Queries), replace=True))
        elif len(new_Queries) > 5:
            new_Queries = new_Queries[:5]
        else:
            new_Queries = new_Queries
        row_df = pd.DataFrame({
            'links': new_links,
            'Queries': new_Queries,
            'images': new_images
        })
        return row_df

    def process_data(self):
        for index, row in self.data.iterrows():
            rows = self.process_individual_row(row)
            self.processed_data = pd.concat([self.processed_data, rows], ignore_index=True)
            print(f'Processed row {index + 1}/{self.data.shape[0]}')
        self.processed_data = self.processed_data.reset_index(drop=True)
        self.processed_data = self.processed_data.apply(lambda col: col.map(lambda x: x.strip() if isinstance(x, str) else x))

In [83]:
individual_rows = CreateIndividualRows(data)

In [84]:
individual_rows.process_data()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed row 6702/11701
Processed row 6703/11701
Processed row 6704/11701
Processed row 6705/11701
Processed row 6706/11701
Processed row 6707/11701
Processed row 6708/11701
Processed row 6709/11701
Processed row 6710/11701
Processed row 6711/11701
Processed row 6712/11701
Processed row 6713/11701
Processed row 6714/11701
Processed row 6715/11701
Processed row 6716/11701
Processed row 6717/11701
Processed row 6718/11701
Processed row 6719/11701
Processed row 6720/11701
Processed row 6721/11701
Processed row 6722/11701
Processed row 6723/11701
Processed row 6724/11701
Processed row 6725/11701
Processed row 6726/11701
Processed row 6727/11701
Processed row 6728/11701
Processed row 6729/11701
Processed row 6730/11701
Processed row 6731/11701
Processed row 6732/11701
Processed row 6733/11701
Processed row 6734/11701
Processed row 6735/11701
Processed row 6736/11701
Processed row 6737/11701
Processed row 6738/11701
Processed 

In [85]:
processed_data = individual_rows.processed_data

# Save processed data

In [100]:
processed_data.to_parquet('/content/drive/MyDrive/Query-Search-LLM-Model/individual_rows')