### Authenticate

If you are using Colab, you will need to authenticate yourself first. The next cell will check if you are currently using Colab, and will start the authentication process.

In [None]:
import sys
# The `google.colab` package is only loaded inside a Colab runtime;
# anywhere else this cell does nothing.
running_in_colab = 'google.colab' in sys.modules
if running_in_colab:
    from google.colab import auth as google_auth
    google_auth.authenticate_user()

In [None]:
!which python

/usr/local/bin/python


In [None]:
!python --version

Python 3.10.12


## Installation & Configurations

In [None]:
!pip install google-cloud-storage



In [None]:
!python -m pip install openpyxl



# Dataset

[This](https://www.kaggle.com/datasets/PromptCloudHQ/flipkart-products) is a pre-crawled dataset, a subset of a larger dataset (more than 5.8 million products) that was created by extracting data from [Flipkart](https://www.flipkart.com/), a leading Indian eCommerce store.


In [None]:
import pandas as pd
full_ds = pd.read_csv('gs://gke-dataprocessing-mvp-karajendran/flipkart_com-ecommerce_sample.csv')

In [None]:
full_ds.head()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati..."
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""..."
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",..."


In [None]:
full_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   uniq_id                  20000 non-null  object 
 1   crawl_timestamp          20000 non-null  object 
 2   product_url              20000 non-null  object 
 3   product_name             20000 non-null  object 
 4   product_category_tree    20000 non-null  object 
 5   pid                      20000 non-null  object 
 6   retail_price             19922 non-null  float64
 7   discounted_price         19922 non-null  float64
 8   image                    19997 non-null  object 
 9   is_FK_Advantage_product  20000 non-null  bool   
 10  description              19998 non-null  object 
 11  product_rating           20000 non-null  object 
 12  overall_rating           20000 non-null  object 
 13  brand                    14136 non-null  object 
 14  product_specifications

In [None]:
# Keep only the columns used by the rest of the notebook.
# `.copy()` makes `df` an independent frame, so assigning to its columns later
# does not raise SettingWithCopyWarning.
df = full_ds[[
    'uniq_id',
    'product_name',
    'description',
    'brand',
    'product_category_tree',
    'product_specifications',
    'image',
]].copy()

In [None]:
# check the values of each row for each column
n = df.nunique(axis=0)
print("No.of.unique values in each column : \n", n)

No.of.unique values in each column : 
 uniq_id                   20000
product_name              12676
description               17539
brand                      3499
product_category_tree      6466
product_specifications    18825
image                     18589
dtype: int64


In [None]:
pd.options.display.max_rows
#pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

In [None]:
df.head()

Unnamed: 0,uniq_id,product_name,description,brand,product_category_tree,product_specifications,image
0,c2d766ca982eca8304150849735ffef9,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Alisha,"[""Clothing >> Women's Clothing >> Lingerie, Sl...","{""product_specification""=>[{""key""=>""Number of ...","[""http://img5a.flixcart.com/image/short/u/4/a/..."
1,7f7036a6d550aaa89d34c77bd39a5e48,FabHomeDecor Fabric Double Sofa Bed,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,FabHomeDecor,"[""Furniture >> Living Room Furniture >> Sofa B...","{""product_specification""=>[{""key""=>""Installati...","[""http://img6a.flixcart.com/image/sofa-bed/j/f..."
2,f449ec65dcbc041b6ae5e6a32717d01b,AW Bellies,Key Features of AW Bellies Sandals Wedges Heel...,AW,"[""Footwear >> Women's Footwear >> Ballerinas >...","{""product_specification""=>[{""key""=>""Ideal For""...","[""http://img5a.flixcart.com/image/shoe/7/z/z/r..."
3,0973b37acd0c664e3de26e97e5571454,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Alisha,"[""Clothing >> Women's Clothing >> Lingerie, Sl...","{""product_specification""=>[{""key""=>""Number of ...","[""http://img5a.flixcart.com/image/short/6/2/h/..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,Sicons All Purpose Arnica Dog Shampoo,Specifications of Sicons All Purpose Arnica Do...,Sicons,"[""Pet Supplies >> Grooming >> Skin & Coat Care...","{""product_specification""=>[{""key""=>""Pet Type"",...","[""http://img5a.flixcart.com/image/pet-shampoo/..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   uniq_id                 20000 non-null  object
 1   product_name            20000 non-null  object
 2   description             19998 non-null  object
 3   brand                   14136 non-null  object
 4   product_category_tree   20000 non-null  object
 5   product_specifications  19986 non-null  object
 6   image                   19997 non-null  object
dtypes: object(7)
memory usage: 1.1+ MB


# Category Analysis

In [None]:
# Helper that strips the list/quote wrapper characters from a raw category-tree string
def reformat(text: str) -> str:
  """Return `text` with every '[', ']' and '"' character removed."""
  return text.translate(str.maketrans('', '', '[]"'))

# Build a new frame instead of assigning into `df` in place: `df` was selected
# from `full_ds`, and in-place column assignment on it raises SettingWithCopyWarning.
df = df.assign(product_category_tree=df['product_category_tree'].map(reformat))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['product_category_tree'] = df['product_category_tree'].apply(lambda x: reformat(x))


In [None]:
# Tally products by category-tree depth, i.e. the number of '>>'-separated
# levels in each tree. `cat_len` maps depth -> number of products.
cat_len = {}
for cat_tree in df.product_category_tree:
  number_of_categories = len(cat_tree.split(">>"))
  cat_len[number_of_categories] = cat_len.get(number_of_categories, 0) + 1
print(cat_len)

{6: 3640, 4: 4765, 5: 4911, 1: 328, 3: 4419, 7: 778, 2: 1129, 8: 30}


**The category tree has at most 8 levels.**

In [None]:
# Split each category tree into one column per level (c0_name is the first level).
# The column names are generated from the number of levels actually produced,
# so the cell keeps working if the deepest tree does not have exactly 8 levels.
temp_df = df['product_category_tree'].str.split('>>', expand=True)
temp_df.columns = [f'c{level}_name' for level in range(temp_df.shape[1])]
# Trim the whitespace left around each '>>' separator; empty/missing cells are left untouched.
for col in temp_df.columns:
  temp_df[col] = temp_df[col].apply(lambda x: x.strip() if x else x)

**Consider only the first 4 levels of the category tree**

In [None]:
# Keep only the first four category levels (c0_name .. c3_name)
temp_df = temp_df[[f'c{level}_name' for level in range(4)]]
temp_df

Unnamed: 0,c0_name,c1_name,c2_name,c3_name
0,Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts
1,Furniture,Living Room Furniture,Sofa Beds & Futons,FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2,Footwear,Women's Footwear,Ballerinas,AW Bellies
3,Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts
4,Pet Supplies,Grooming,Skin & Coat Care,Shampoo
...,...,...,...,...
19995,Baby Care,Baby & Kids Gifts,Stickers,WallDesign Stickers
19996,Baby Care,Baby & Kids Gifts,Stickers,Wallmantra Stickers
19997,Baby Care,Baby & Kids Gifts,Stickers,Elite Collection Stickers
19998,Baby Care,Baby & Kids Gifts,Stickers,Elite Collection Stickers


In [None]:
# Place the category-level columns next to the product columns (axis=1 joins
# column-wise), then remove the raw category-tree string they were derived from
df_with_cat = pd.concat([df, temp_df], axis=1).drop(columns='product_category_tree')

In [None]:
df_with_cat.head()

Unnamed: 0,uniq_id,product_name,description,brand,product_specifications,image,c0_name,c1_name,c2_name,c3_name
0,c2d766ca982eca8304150849735ffef9,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Alisha,"{""product_specification""=>[{""key""=>""Number of ...","[""http://img5a.flixcart.com/image/short/u/4/a/...",Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts
1,7f7036a6d550aaa89d34c77bd39a5e48,FabHomeDecor Fabric Double Sofa Bed,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati...","[""http://img6a.flixcart.com/image/sofa-bed/j/f...",Furniture,Living Room Furniture,Sofa Beds & Futons,FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2,f449ec65dcbc041b6ae5e6a32717d01b,AW Bellies,Key Features of AW Bellies Sandals Wedges Heel...,AW,"{""product_specification""=>[{""key""=>""Ideal For""...","[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",Footwear,Women's Footwear,Ballerinas,AW Bellies
3,0973b37acd0c664e3de26e97e5571454,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Alisha,"{""product_specification""=>[{""key""=>""Number of ...","[""http://img5a.flixcart.com/image/short/6/2/h/...",Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,Sicons All Purpose Arnica Dog Shampoo,Specifications of Sicons All Purpose Arnica Do...,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",...","[""http://img5a.flixcart.com/image/pet-shampoo/...",Pet Supplies,Grooming,Skin & Coat Care,Shampoo


In [None]:
# Write the value counts of every category level to a local workbook,
# one sheet per level, with the sheet named after the level's column
columns = temp_df.columns
with pd.ExcelWriter('flipkart_cat_analysis_cat_depth4.xlsx') as writer:
  for col in columns:
    temp_df[col].value_counts().to_excel(excel_writer=writer, sheet_name=col)

In [None]:
df_with_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   uniq_id                 20000 non-null  object
 1   product_name            20000 non-null  object
 2   description             19998 non-null  object
 3   brand                   14136 non-null  object
 4   product_specifications  19986 non-null  object
 5   image                   19997 non-null  object
 6   c0_name                 20000 non-null  object
 7   c1_name                 19672 non-null  object
 8   c2_name                 18543 non-null  object
 9   c3_name                 14124 non-null  object
dtypes: object(10)
memory usage: 1.5+ MB


In [None]:
# Pull out the four category-level columns for the pairwise repetition check
col1, col2, col3, col4 = (
    df_with_cat[level] for level in ('c0_name', 'c1_name', 'c2_name', 'c3_name')
)

In [None]:
# Category Tree [depth 4]:
# root -> child -> sub-child -> leaf
#
# Report every row whose category name is repeated at two levels of the tree.
# Rows whose sub-child and leaf are identical are also collected in
# `duplicate_index`, so the affected rows do not have to be copied by hand
# from the printed output.

duplicate_index = []
for i in range(0,len(col1)):
    # Every test also requires both values to be truthy, so two empty/missing
    # levels are never reported as a repetition.
    if (col1[i] == col2[i] and col1[i] and col2[i]):
      print('category repeating: root & child is same')
      print(i)
      print(col1[i],col2[i], col3[i], col4[i])
    if (col2[i] == col3[i] and col2[i] and col3[i]):
      print('category repeating: child & sub-child is same')
      print(i)
      print(col1[i],col2[i], col3[i], col4[i])
    if (col3[i] == col4[i] and col3[i] and col4[i]):
      print('category repeating:  sub-child & leaf is same')
      print(i)
      print(col1[i],"'",col2[i], ",", col3[i], ",", col4[i])
      # Remember the row so its redundant leaf can be cleared afterwards.
      duplicate_index.append(i)
    if (col1[i] == col3[i] and col1[i] and col3[i]):
      print('category repeating: root & sub-child is same')
      print(i)
    if (col1[i] == col4[i] and col1[i] and col4[i]):
      print('category repeating: root & leaf is same')
      print(i)
    if (col2[i] == col4[i] and col2[i] and col4[i]):
      print('category repeating: child & leaf is same')
      print(i)

category repeating:  sub-child & leaf is same
1681
Automotive ' Accessories & Spare parts , Tyres , Tyres
category repeating:  sub-child & leaf is same
10086
Clothing ' Women's Clothing , Leggings & Jeggings , Leggings & Jeggings
category repeating:  sub-child & leaf is same
11241
Clothing ' Women's Clothing , Leggings & Jeggings , Leggings & Jeggings
category repeating:  sub-child & leaf is same
11252
Clothing ' Women's Clothing , Leggings & Jeggings , Leggings & Jeggings
category repeating:  sub-child & leaf is same
14921
Clothing ' Women's Clothing , Leggings & Jeggings , Leggings & Jeggings
category repeating:  sub-child & leaf is same
15062
Clothing ' Women's Clothing , Leggings & Jeggings , Leggings & Jeggings
category repeating:  sub-child & leaf is same
15063
Clothing ' Women's Clothing , Leggings & Jeggings , Leggings & Jeggings
category repeating:  sub-child & leaf is same
15091
Clothing ' Women's Clothing , Leggings & Jeggings , Leggings & Jeggings
category repeating:  sub-c

**Some sub-child and leaf categories are identical. The duplicated leaf category should be removed.**

*Before running the next cell, check the indices printed above and update the list in that cell accordingly.*

*The approach is to set the duplicated leaf categories to null.*

In [None]:
# Row labels whose leaf category (c3_name) repeats the sub-child category.
# Please check the index and update the list below before running this cell.
duplicate_index = [1681, 10086, 11241, 11252, 14921, 15062, 15063, 15091, 15468, 17591, 18809]
# Assign through .loc in one step: chained indexing (df[col][i] = ...) can write
# to a temporary copy and leave the frame unchanged.
df_with_cat.loc[duplicate_index, 'c3_name'] = None

# Extracting Product Attributes

In [None]:
#Extracting attributes from product specifications
import json
from typing import List, Dict

import jsonpickle
import pandas as pd
import re

import numpy as np
# Group 2 is the text between the first '[' and the last ']' that follows it
# ('.' does not cross newlines): the list of specification entries.
SPEC_MATCH_ONE = re.compile("(.*?)\\[(.*)\\](.*)")
# Groups 2 and 4 are the first two `=>"..."` quoted strings of one entry;
# parse_spec uses them as attribute name and attribute value.
SPEC_MATCH_TWO = re.compile("(.*?)=>\"(.*?)\"(.*?)=>\"(.*?)\"(.*)")

def parse_spec(specification: str):
    """Convert a raw product-specification string into a JSON object string.

    The input looks like
    ``{"product_specification"=>[{"key"=>"Color", "value"=>"Blue"}, ...]}``.
    Each ``{...}`` entry inside the square brackets contributes one
    name -> value pair; entries without two ``=>"..."`` strings are skipped,
    and a later entry with the same name overwrites an earlier one.

    Args:
        specification: The raw specification string, or a missing value.

    Returns:
        A JSON object string such as ``{"Color": "Blue"}`` (``{}`` when nothing
        could be extracted), or None when `specification` is missing.
    """
    if pd.isna(specification):
        return None
    out = {}
    m = SPEC_MATCH_ONE.match(specification)
    if m is not None:
        # Every '}' closes one entry; the text after the last '}' is not a
        # complete entry, so the final split segment is dropped.
        for phrase in m.group(2).split('}')[:-1]:
            m2 = SPEC_MATCH_TWO.match(phrase)
            if m2:
                out[m2.group(2)] = m2.group(4)
    # `out` holds only str keys and str values, so plain JSON serialisation is
    # sufficient. Nothing is printed here: this function is applied to every
    # row, and a per-row print floods the notebook output.
    return json.dumps(out)

In [None]:
!pip3 show jsonpickle

Name: jsonpickle
Version: 3.0.4
Summary: Serialize any Python object to JSON
Home-page: https://github.com/jsonpickle/jsonpickle
Author: David Aguilar
Author-email: davvid@gmail.com
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: music21


In [None]:
df_with_cat['attributes'] = df_with_cat['product_specifications'].apply(parse_spec)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{"Occasion": "Casual", "Ideal For": "Women", "Type": "Flats", "Heel Height": "1 inch", "Outer Material": "Synthetic Leather", "Color": "20,Beige"}
{"Number of Contents in Sales Package": "Pack of 5", "Brand Fit": "Slim", "Fabric": "Cotton", "Ideal For": "Men's", "Style Code": "RAC-5OFCOMBO-10"}
{"Material": "Cartoon", "Brand": "Love Baby", "Type": "Set of Towels", "Model Name": "Baby Bath Towel", "Ideal For": "Boys, Girls", "Model ID": "1907", "Color": "Blue", "Length": "60.9 cm", "Width": "91.4 cm", "Sales Package": "Bath Towel"}
{"Number of Contents in Sales Package": "Pack of 1", "Fabric": "Cotton Lycra", "Wash": "Other", "Rise": "Mid Rise", "Occasion": "Casual", "Ideal For": "Men's", "Style Code": "Momento DLBL"}
{"Closure": "Button", "Number of Contents in Sales Package": "Pack of 1", "Brand Fit": "Slim", "Fabric": "Cotton", "Rise": "Mid Rise", "Wash": "Stone Wash", "Fly": "Zipper", "Pattern": "Solid", "Ideal For": "

# Preparing Fine Tuning Dataset

In [None]:
df_with_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   uniq_id                 20000 non-null  object
 1   product_name            20000 non-null  object
 2   description             19998 non-null  object
 3   brand                   14136 non-null  object
 4   product_specifications  19986 non-null  object
 5   image                   19997 non-null  object
 6   c0_name                 20000 non-null  object
 7   c1_name                 19672 non-null  object
 8   c2_name                 18543 non-null  object
 9   c3_name                 14113 non-null  object
 10  attributes              19986 non-null  object
dtypes: object(11)
memory usage: 1.7+ MB


In [None]:
# product_specifications was parsed into the `attributes` column, so the raw column is dropped
df_with_cat = df_with_cat.drop(columns='product_specifications')

In [None]:
df_with_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   uniq_id       20000 non-null  object
 1   product_name  20000 non-null  object
 2   description   19998 non-null  object
 3   brand         14136 non-null  object
 4   image         19997 non-null  object
 5   c0_name       20000 non-null  object
 6   c1_name       19672 non-null  object
 7   c2_name       18543 non-null  object
 8   c3_name       14113 non-null  object
 9   attributes    19986 non-null  object
dtypes: object(10)
memory usage: 1.5+ MB


In [None]:
# Give the product columns their capitalised names; `attributes` becomes `Specifications`
df_with_cat = df_with_cat.rename(columns={
    'uniq_id': 'Id',
    'product_name': 'Name',
    'description': 'Description',
    'brand': 'Brand',
    'attributes': 'Specifications',
})

In [None]:
df_with_cat.head()

Unnamed: 0,Id,Name,Description,Brand,image,c0_name,c1_name,c2_name,c3_name,Specifications
0,c2d766ca982eca8304150849735ffef9,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Alisha,"[""http://img5a.flixcart.com/image/short/u/4/a/...",Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts,"{""Number of Contents in Sales Package"": ""Pack ..."
1,7f7036a6d550aaa89d34c77bd39a5e48,FabHomeDecor Fabric Double Sofa Bed,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,FabHomeDecor,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",Furniture,Living Room Furniture,Sofa Beds & Futons,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,"{""Installation & Demo Details"": ""Installation ..."
2,f449ec65dcbc041b6ae5e6a32717d01b,AW Bellies,Key Features of AW Bellies Sandals Wedges Heel...,AW,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",Footwear,Women's Footwear,Ballerinas,AW Bellies,"{""Ideal For"": ""Women"", ""Occasion"": ""Casual"", ""..."
3,0973b37acd0c664e3de26e97e5571454,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Alisha,"[""http://img5a.flixcart.com/image/short/6/2/h/...",Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts,"{""Number of Contents in Sales Package"": ""Pack ..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,Sicons All Purpose Arnica Dog Shampoo,Specifications of Sicons All Purpose Arnica Do...,Sicons,"[""http://img5a.flixcart.com/image/pet-shampoo/...",Pet Supplies,Grooming,Skin & Coat Care,Shampoo,"{""Pet Type"": ""Dog"", ""Brand"": ""Sicons"", ""Quanti..."


In [None]:
df_with_cat.c0_name.value_counts()

c0_name
Clothing                                               6198
Jewellery                                              3531
Footwear                                               1227
Mobiles & Accessories                                  1099
Automotive                                             1012
Home Decor & Festive Needs                              929
Beauty and Personal Care                                710
Home Furnishing                                         700
Kitchen & Dining                                        647
Computers                                               578
Watches                                                 530
Baby Care                                               483
Tools & Hardware                                        391
Toys & School Supplies                                  330
Pens & Stationery                                       313
Bags, Wallets & Belts                                   265
Furniture                       

In [None]:
filtered_df = df_with_cat[df_with_cat['c0_name'] == 'Clothing']

In [None]:
filtered_df.c1_name.value_counts()

c1_name
Women's Clothing    3901
Men's Clothing      1773
Kids' Clothing       520
fourgee Clothing       1
piftif Clothing        1
Clovia Clothing        1
Sonpra Clothing        1
Name: count, dtype: int64

In [None]:
# Keep only the rows whose second-level category is one of the three listed departments
values_to_filter = [
    "Women's Clothing",
    "Men's Clothing",
    "Kids' Clothing",
]
clothing_filtered_df = filtered_df.loc[filtered_df['c1_name'].isin(values_to_filter)]

In [None]:
clothing_filtered_df.c2_name.value_counts()

c2_name
Western Wear                     1981
Lingerie, Sleep & Swimwear       1208
T-Shirts                          903
Ethnic Wear                       485
Girls Wear                        287
Shirts                            234
Winter & Seasonal Wear            225
Boys Wear                         169
Accessories & Combo Sets          135
Inner Wear & Sleep Wear            75
Fusion Wear                        73
Jeans                              65
Infants Wear                       63
Sports & Gym Wear                  49
Trousers                           35
Navaksha Men's Clothing            32
Suits & Blazers                    32
Sports Wear                        26
Leggings & Jeggings                22
Cargos, Shorts & 3/4ths            21
Maternity Wear                     21
Formal Wear                        19
Accessories                        17
Combo Sets                          5
Fabrics                             2
Clovia Women's Clothing             1
TIMB

In [None]:
clothing_filtered_df.c3_name.value_counts()

c3_name
Shirts, Tops & Tunics                              1249
Bras                                               1036
Dresses & Skirts                                    588
Kurtas & Kurtis                                     202
Leggings & Jeggings                                 194
Numero Uno T-Shirts                                 135
Formal Shirts                                       128
Fabric                                              122
Casual & Party Wear Shirts                          106
Sweatshirts                                          92
Oviyon T-Shirts                                      87
Ethnic Wear                                          83
Okane T-Shirts                                       82
Sweaters                                             71
Ties                                                 70
Nimya T-Shirts                                       50
Baby Boys                                            49
Northern Lights T-Shirts                

In [None]:
import pandas as pd

def filter_low_value_count_rows(df, column_name, min_count=10):
    """
    Keep only the rows whose `column_name` value occurs at least `min_count` times.

    Args:
        df: The Pandas DataFrame to filter; it is not modified.
        column_name: The column whose value frequencies decide which rows are kept.
        min_count: The number of occurrences a value needs for its rows to be kept (default: 10).

    Returns:
        A new DataFrame with the qualifying rows of `df`, in their original order
        and with their original index labels. Rows with a missing value in
        `column_name` are never kept.
    """
    # For every row, look up how often its value occurs in the column.
    # Missing values have no entry in value_counts(), so they map to NaN
    # and fail the comparison below.
    occurrences = df[column_name].map(df[column_name].value_counts())
    return df[occurrences >= min_count]

# Filter to keep rows where 'c2_name' has count >=10
c2_filtered_df = filter_low_value_count_rows(clothing_filtered_df, 'c2_name', min_count=10)
#print(c2_filtered_df)


In [None]:
c2_filtered_df.c2_name.value_counts()

c2_name
Western Wear                  1981
Lingerie, Sleep & Swimwear    1208
T-Shirts                       903
Ethnic Wear                    485
Girls Wear                     287
Shirts                         234
Winter & Seasonal Wear         225
Boys Wear                      169
Accessories & Combo Sets       135
Inner Wear & Sleep Wear         75
Fusion Wear                     73
Jeans                           65
Infants Wear                    63
Sports & Gym Wear               49
Trousers                        35
Navaksha Men's Clothing         32
Suits & Blazers                 32
Sports Wear                     26
Leggings & Jeggings             22
Cargos, Shorts & 3/4ths         21
Maternity Wear                  21
Formal Wear                     19
Accessories                     17
Name: count, dtype: int64

In [None]:
c3_filtered_df = filter_low_value_count_rows(clothing_filtered_df, 'c3_name', min_count=10)

In [None]:
c3_filtered_df.c3_name.value_counts()

c3_name
Shirts, Tops & Tunics           1249
Bras                            1036
Dresses & Skirts                 588
Kurtas & Kurtis                  202
Leggings & Jeggings              194
Numero Uno T-Shirts              135
Formal Shirts                    128
Fabric                           122
Casual & Party Wear Shirts       106
Sweatshirts                       92
Oviyon T-Shirts                   87
Ethnic Wear                       83
Okane T-Shirts                    82
Sweaters                          71
Ties                              70
Nimya T-Shirts                    50
Baby Boys                         49
Northern Lights T-Shirts          46
Camisoles & Slips                 45
Ocean Race T-Shirts               44
Nucode T-Shirts                   44
Jeans & Shorts                    43
Winter & Seasonal Wear            41
Jackets                           40
Sarees                            40
Night Dresses & Nighties          37
Jeans                         

In [None]:
c3_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5680 entries, 0 to 19788
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Id              5680 non-null   object
 1   Name            5680 non-null   object
 2   Description     5679 non-null   object
 3   Brand           2929 non-null   object
 4   image           5680 non-null   object
 5   c0_name         5680 non-null   object
 6   c1_name         5680 non-null   object
 7   c2_name         5680 non-null   object
 8   c3_name         5680 non-null   object
 9   Specifications  5675 non-null   object
dtypes: object(10)
memory usage: 488.1+ KB


In [None]:
c3_filtered_df.to_csv('gs://gke-dataprocessing-mvp-karajendran/flipkart_category_filtered_df.csv', index=False)

In [None]:
context_df = c3_filtered_df[[
                'Name',
                'Description',
                'c1_name',
                'Specifications']]

In [None]:
context_df.head(10)

Unnamed: 0,Name,Description,c1_name,Specifications
0,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Women's Clothing,"{""Number of Contents in Sales Package"": ""Pack ..."
3,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Women's Clothing,"{""Number of Contents in Sales Package"": ""Pack ..."
6,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Women's Clothing,"{""Number of Contents in Sales Package"": ""Pack ..."
9,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Women's Clothing,"{""Number of Contents in Sales Package"": ""Pack ..."
13,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Women's Clothing,"{""Number of Contents in Sales Package"": ""Pack ..."
15,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Women's Clothing,"{""Number of Contents in Sales Package"": ""Pack ..."
21,Alisha Solid Women's Cycling Shorts,Alisha Solid Women's Cycling Shorts - Buy Blac...,Women's Clothing,
22,dongli Printed Boy's Round Neck T-Shirt,Specifications of dongli Printed Boy's Round N...,Kids' Clothing,"{""Sleeve"": ""Half Sleeve"", ""Number of Contents ..."
28,FDT Women's Leggings,FDT Women's Leggings - Buy Parrot Green FDT Wo...,Women's Clothing,"{""Number of Contents in Sales Package"": ""Pack ..."
29,Madcaps C38GR30 Men's Cargos,Madcaps C38GR30 Men's Cargos - Buy Green Madca...,Men's Clothing,"{""Number of Contents in Sales Package"": ""Pack ..."


In [None]:
# Write the context rows in JSON Lines format (one JSON object per line).
# `lines=True` is required for that: orient='records' on its own writes a
# single JSON array.
context_df.to_json('context.jsonl', orient='records', lines=True)

In [None]:
# Data Format expected for fine tuning: {"context": " ", "question": " ", "answer": " "}
# Start from an empty frame with the three expected columns; only `context` is filled in here.
finetune_ds = pd.DataFrame(columns=['context', 'question', 'answer'])
# Join name, category, attributes and description into one '<br>'-separated string per product.
# A row with a missing value in any of the four source columns gets a missing context.
finetune_ds['context'] = "Product Name: "+ context_df['Name']+ "<br> Product Category: "+ context_df['c1_name'] + "<br> Attributes: "+ context_df['Specifications'] +" <br> Description: "+ context_df['Description']


In [None]:
finetune_ds.head(10)

Unnamed: 0,context,question,answer
0,Product Name: Alisha Solid Women's Cycling Sho...,,
3,Product Name: Alisha Solid Women's Cycling Sho...,,
6,Product Name: Alisha Solid Women's Cycling Sho...,,
9,Product Name: Alisha Solid Women's Cycling Sho...,,
13,Product Name: Alisha Solid Women's Cycling Sho...,,
15,Product Name: Alisha Solid Women's Cycling Sho...,,
21,,,
22,Product Name: dongli Printed Boy's Round Neck ...,,
28,Product Name: FDT Women's Leggings<br> Product...,,
29,Product Name: Madcaps C38GR30 Men's Cargos<br>...,,


In [None]:
finetune_ds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5680 entries, 0 to 19788
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   context   5674 non-null   object
 1   question  0 non-null      object
 2   answer    0 non-null      object
dtypes: object(3)
memory usage: 306.5+ KB


In [None]:
# Discard the rows that have no context and renumber the remaining rows from 0
finetune_ds = finetune_ds.dropna(subset=['context']).reset_index(drop=True)

In [None]:
# Drop the duplicates (rows are compared across all columns by default)
finetune_ds = finetune_ds.drop_duplicates()
# The index reset below is disabled, so the row labels keep gaps after de-duplication.
#finetune_ds.reset_index(drop=True, inplace=True)

In [None]:
# Preview the first 10 rows again after removing null and duplicate contexts.
finetune_ds.head(10)

Unnamed: 0,context,question,answer
0,Product Name: Alisha Solid Women's Cycling Sho...,,
1,Product Name: Alisha Solid Women's Cycling Sho...,,
2,Product Name: Alisha Solid Women's Cycling Sho...,,
3,Product Name: Alisha Solid Women's Cycling Sho...,,
4,Product Name: Alisha Solid Women's Cycling Sho...,,
5,Product Name: Alisha Solid Women's Cycling Sho...,,
6,Product Name: dongli Printed Boy's Round Neck ...,,
7,Product Name: FDT Women's Leggings<br> Product...,,
8,Product Name: Madcaps C38GR30 Men's Cargos<br>...,,
9,Product Name: Indcrown Net Embroidered Semi-st...,,


In [None]:
# Re-check row and non-null counts after cleaning.
finetune_ds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5463 entries, 0 to 5673
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   context   5463 non-null   object
 1   question  0 non-null      object
 2   answer    0 non-null      object
dtypes: object(3)
memory usage: 299.8+ KB


In [None]:
# Persist only the 'context' column to GCS as a CSV, without the index column
finetune_ds['context'].to_csv(
    'gs://gke-dataprocessing-mvp-karajendran/fine_tuning_ds_context.csv',
    index=False,
)

In [None]:
from math import nan
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models
import re
import time
import numpy as np
import pandas as pd
from datasets import load_dataset
# Generation settings handed to model.generate_content() inside generate().
generation_config = {
    "max_output_tokens": 200,
    "temperature": 0.7
}

# Every listed harm category uses the BLOCK_MEDIUM_AND_ABOVE threshold.
# When a candidate is blocked the response carries no text, and reading
# `.text` raises (see the "finish_reason": "SAFETY" dump in this cell's output).
safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
}

# Number of question/answer pairs requested per product; interpolated into the prompt.
num_questions = 3

# Gemini client shared by all generate() calls; created on first use.
_gemini_model = None

# Matches "Question : <q> : Answer : <a>" pairs.
# - Whitespace around the colons is flexible.
# - An answer ends at the next "Question :" marker, whether it follows a newline
#   or sits on the same line (the prompt asks for single-line output), or at the
#   end of the text. Surrounding whitespace is not captured.
_QA_PATTERN = re.compile(
    r"Question\s*:\s*(.*?)\s*:\s*Answer\s*:\s*(.*?)(?=\s*Question\s*:|\s*$)",
    re.DOTALL,
)


def _get_model():
  """Initialise Vertex AI and build the Gemini model once, then reuse it."""
  global _gemini_model
  if _gemini_model is None:
    vertexai.init(project="cloud-llm-preview1", location="us-central1")
    _gemini_model = GenerativeModel(
      "gemini-1.5-flash-preview-0514",
    )
  return _gemini_model


def _parse_qa(text):
  """Turn raw model output into a DataFrame with 'Question' and 'Answer' columns."""
  return pd.DataFrame(_QA_PATTERN.findall(text), columns=["Question", "Answer"])


def generate(context):
  """Ask Gemini for search-style questions and answers about one product.

  Args:
    context: Product description text that is embedded in the prompt.

  Returns:
    A DataFrame with columns 'Question', 'Answer' and 'Context' (one row per
    parsed pair, possibly zero rows), or None when the request or the response
    handling raised. The exception is printed and swallowed so that one failing
    product does not abort a whole batch.
  """
  # Built outside the try block so that a setup/credential failure still surfaces.
  model = _get_model()

  prompt = f"Generate {num_questions} Search Queries in conversational tone and Answers for this product:\n{context}. Return the result without any formatting in a single line as Question : Answer"
  try:
    responses = model.generate_content(
        [prompt],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )
    # Streamed chunks are stitched back together. Reading `.text` raises when a
    # chunk has no text (e.g. blocked by the safety filters); the except below
    # handles that.
    qa = ''.join(response.text for response in responses)

    temp_df = _parse_qa(qa)
    temp_df['Context'] = context
    return temp_df
  except Exception as e:
    print(e)
    return None

# Generate Q&A pairs for the 2000:3000 slice of contexts.
# Per-product frames are collected in a list and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
qa_frames = []
for context in finetune_ds['context'][2000:3000]:
  #print(context)
  # NaN compares unequal to everything (itself included), so `context != np.nan`
  # is always True; pd.notna is the check that actually skips missing contexts.
  if pd.notna(context):
    temp_df = generate(context)
    if temp_df is not None:
      qa_frames.append(temp_df)
    time.sleep(1) # Add a 1 second delay to avoid API rate limiting (adjust as needed)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was generated.
result = pd.concat(qa_frames, ignore_index=True) if qa_frames else pd.DataFrame()

# Now `result` contains all generated questions and answers
print(result)

Cannot get the response text.
Cannot get the Candidate text.
Response candidate content has no parts (and thus no text). The candidate is likely blocked by the safety filters.
Content:
{}
Candidate:
{
  "finish_reason": "SAFETY",
  "safety_ratings": [
    {
      "category": "HARM_CATEGORY_HATE_SPEECH",
      "probability": "NEGLIGIBLE",
      "probability_score": 0.10141132,
      "severity": "HARM_SEVERITY_NEGLIGIBLE",
      "severity_score": 0.05942822
    },
    {
      "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
      "probability": "NEGLIGIBLE",
      "probability_score": 0.055208683,
      "severity": "HARM_SEVERITY_NEGLIGIBLE",
      "severity_score": 0.089136936
    },
    {
      "category": "HARM_CATEGORY_HARASSMENT",
      "probability": "NEGLIGIBLE",
      "probability_score": 0.14584377,
      "severity": "HARM_SEVERITY_NEGLIGIBLE",
      "severity_score": 0.090253234
    },
    {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "probability": "MEDIUM",
     

In [None]:
# Remove repeated Question/Answer/Context rows; the surviving rows keep their index labels
result = result.drop_duplicates()

In [None]:
# Display the de-duplicated question/answer table.
result

Unnamed: 0,Question,Answer,Context
0,I'm looking for comfy leggings for lounging ar...,"You might like the Rann Women's Leggings, they...",Product Name: Rann Women's Leggings<br> Produc...
1,"I need some casual leggings for working out, a...",The Rann Women's Leggings could be a good opti...,Product Name: Rann Women's Leggings<br> Produc...
2,Are there any solid colored leggings available...,"Yes, the Rann Women's Leggings are solid color...",Product Name: Rann Women's Leggings<br> Produc...
3,"I'm looking for a casual, sleeveless top for w...",You might like the FabAlley Casual Sleeveless ...,Product Name: FabAlley Casual Sleeveless Solid...
4,I need a new top for a casual outing. What ar...,The FabAlley Casual Sleeveless Solid Women's T...,Product Name: FabAlley Casual Sleeveless Solid...
5,I'm looking for a pink top to wear casually. ...,The FabAlley Casual Sleeveless Solid Women's T...,Product Name: FabAlley Casual Sleeveless Solid...
6,"I'm looking for a comfy, sleeveless top for ca...",The Urban Misty Casual Sleeveless Embellished...,Product Name: Urban Misty Casual Sleeveless Em...
7,"What's a good, stylish sleeveless top for women?",The Urban Misty Casual Sleeveless Embellished ...,Product Name: Urban Misty Casual Sleeveless Em...
8,"I need a new top for a casual occasion, someth...",The Urban Misty Casual Sleeveless Embellished ...,Product Name: Urban Misty Casual Sleeveless Em...
9,"I'm looking for a comfortable, casual top with...","Yes, it's made of lace, which is typically sof...",Product Name: Calgari Casual 3/4 Sleeve Solid ...


### Upload preprocessed data into GCS

In [None]:
# Write the generated question/answer rows to GCS as CSV, without the index column.
result.to_csv('gs://gke-dataprocessing-mvp-karajendran/fine_tuning_ds3.csv', index=False)

In [None]:
#Delete Later

In [None]:
import pandas as pd
import re

def find_questions_answers(text):
  """Extract question/answer pairs from a block of text.

  The expected layout is a "Question...:" line, a blank line, then an
  "Answer...:" line, with pairs separated from each other by blank lines.
  Anything between the keyword and its colon (such as a number) is ignored.
  No regex flags are used, so each question and each answer must fit on one line.

  Args:
    text: The text to extract questions and answers from.

  Returns:
    A pandas DataFrame with columns 'questions' and 'answers', one row per
    matched pair (empty when nothing matches).
  """
  # The quantifiers matter here: without `*` after `[^:]`, `\s` and `.`, a
  # question could be at most one character long and real text never matched.
  pattern = r'Question[^:]*:\s*(.*?)\n\nAnswer[^:]*:\s*(.*?)(?=\n\n|$)'

  # findall already yields (question, answer) tuples, one per match.
  matches = re.findall(pattern, text)

  return pd.DataFrame(matches, columns=['questions', 'answers'])