# Load the data

In [20]:
%%capture
!pip install -U fashion-clip

In [26]:
import sys
# sys.path.append("fashion-clip/")
from fashion_clip.fashion_clip import FashionCLIP
import pandas as pd
import numpy as np
from collections import Counter
from PIL import Image
import requests
from io import BytesIO
import zipfile

In [22]:
%%capture
# Instantiate FashionCLIP with the 'fashion-clip' identifier; %%capture hides this cell's output.
fclip = FashionCLIP('fashion-clip')

In [23]:
# Shareable Google Drive link to the zipped dataset
url = "https://drive.google.com/file/d/1Y-CseR1n75De_eCN9aCVEIRlqY_MmqjP/view"

# The file ID is the second-to-last path segment of the share link
file_id = url.rsplit("/", 2)[1]

# Direct-download endpoint for that file ID
download_link = "https://drive.google.com/uc?id=" + file_id

In [27]:
# Download the content of the zip file.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP failure here instead of as a confusing BadZipFile below.
response = requests.get(download_link, timeout=60)
response.raise_for_status()
zip_data = BytesIO(response.content)

# Unzip the file
with zipfile.ZipFile(zip_data, 'r') as zip_ref:
    # Get the list of files in the zip
    file_list = zip_ref.namelist()

    # Choose the Excel file from the list (the first one if there are several)
    excel_files = [file for file in file_list if file.endswith('.xlsx')]
    if not excel_files:
        raise FileNotFoundError("No .xlsx file found in the downloaded archive")
    excel_file = excel_files[0]

    # Read data from the Excel file using pandas; the member handle is closed afterwards
    with zip_ref.open(excel_file) as excel_handle:
        df = pd.read_excel(excel_handle)

In [28]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,price,description,review_count,avg_rating,scraped_at,color,brand,currency,url,availability,images
0,2572,Adilette Comfort Slides,32.0,Classic sport sandals with a lightweight feel....,11534.0,4.7,12/13/2023 20:47,Core Black / Cloud White / Cloud White,Adidas,USD,https://www.adidas.com/us/adilette-comfort-sli...,INSTOCK,"['https://assets.adidas.com/images/w_600,f_aut..."
1,52,Nike Sportswear Swoosh Puffer PrimaLoft®,225.97,Stay warm on the coldest days by bundling up i...,18.0,44.0,2023-12-13,White/Black,Nike,USD,https://www.nike.com/t/sportswear-swoosh-puffe...,InStock,https://static.nike.com/a/images/t_default/e5c...
2,500,Jordan Artist Series by Jordan Moss,66.97,Jordan Moss is a Brooklyn based illustrator an...,0.0,0.0,2023-12-14,Dark Driftwood,Nike,USD,https://www.nike.com/t/jordan-artist-series-by...,InStock,https://static.nike.com/a/images/t_default/u_1...
3,622,X_PLRPHASE Shoes Kids,80.0,Comfortable sneakers made in part with recycle...,53.0,4.6,12/14/2023 0:17,Cloud White / Magic Grey Met / Bliss Lilac,Adidas,USD,https://www.adidas.com/us/x_plrphase-shoes-kid...,INSTOCK,"['https://assets.adidas.com/images/w_600,f_aut..."
4,1681,Gazelle Bold Shoes,120.0,A platform shoe that adds modern looks to an i...,408.0,4.6,12/13/2023 19:45,Cloud White / Cloud White / Cloud White,Adidas,USD,https://www.adidas.com/us/gazelle-bold-shoes/I...,INSTOCK,"['https://assets.adidas.com/images/w_600,f_aut..."


In [29]:
df.shape

(6272, 13)

In [30]:
df.columns

Index(['Unnamed: 0', 'name', 'price', 'description', 'review_count',
       'avg_rating', 'scraped_at', 'color', 'brand', 'currency', 'url',
       'availability', 'images'],
      dtype='object')

Rename the 'Unnamed: 0' column to 'ID'.

In [31]:
# Rebind df to the renamed frame instead of mutating it in place
df = df.rename(columns={'Unnamed: 0': 'ID'})

Combine two columns (color and description) to give more information about the product.

In [32]:
# Prefix each description with the product colour. Rows with a missing colour
# get no prefix, so the text never contains the literal "Color of product: nan."
# (astype(str) would otherwise turn a missing value into the string "nan").
color_prefix = ('Color of product: ' + df['color'].astype(str) + '. ').where(df['color'].notna(), '')
df['description'] = color_prefix + df['description']

In [33]:
# Discard the columns that are not needed for the rest of the notebook
unused_columns = ['price', 'review_count', 'avg_rating', 'scraped_at',
                  'color', 'brand', 'currency', 'availability']
df = df.drop(unused_columns, axis=1)

In [34]:
# Remove rows missing a required field, then exact duplicate rows, and renumber the index
df = (
    df.dropna(subset=['images', 'description', 'name'])
      .drop_duplicates()
      .reset_index(drop=True)
)

In [35]:
df.isnull().sum()

ID             0
name           0
description    0
url            0
images         0
dtype: int64

In [36]:
def check_duplicates(df):
    """Return the rows of `df` flagged by `DataFrame.duplicated()`."""
    return df[df.duplicated()]


duplicates = check_duplicates(df)

if not duplicates.empty:
    print("Duplicates in:")
    print(duplicates)
else:
    print("No duplicates found.")

No duplicates found.


In [37]:
df.head(5)

Unnamed: 0,ID,name,description,url,images
0,2572,Adilette Comfort Slides,Color of product: Core Black / Cloud White / C...,https://www.adidas.com/us/adilette-comfort-sli...,"['https://assets.adidas.com/images/w_600,f_aut..."
1,52,Nike Sportswear Swoosh Puffer PrimaLoft®,Color of product: White/Black. Stay warm on th...,https://www.nike.com/t/sportswear-swoosh-puffe...,https://static.nike.com/a/images/t_default/e5c...
2,500,Jordan Artist Series by Jordan Moss,Color of product: Dark Driftwood. Jordan Moss ...,https://www.nike.com/t/jordan-artist-series-by...,https://static.nike.com/a/images/t_default/u_1...
3,622,X_PLRPHASE Shoes Kids,Color of product: Cloud White / Magic Grey Met...,https://www.adidas.com/us/x_plrphase-shoes-kid...,"['https://assets.adidas.com/images/w_600,f_aut..."
4,1681,Gazelle Bold Shoes,Color of product: Cloud White / Cloud White / ...,https://www.adidas.com/us/gazelle-bold-shoes/I...,"['https://assets.adidas.com/images/w_600,f_aut..."


# Text preprocessing

In [None]:
df['description'].iloc[0]

'Color of product: Medium Grey Heather. Quick-drying socks for your daily workout.Leave the sweat behind you. These training socks are cushioned at the heel and toe to keep you comfortable while you pound the pavement. They have a stretchy, moisture-wicking construction that hugs the foot at the arch.'

In [None]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK resources for the tokenizer, lemmatizer and stop-word list
# imported above.
# NOTE(review): recent NLTK releases may also need 'punkt_tab' for word_tokenize — confirm.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Replacement table used by preprocessed_description: every key found as a whole
# word in the lowercased text is replaced by its value. It mixes fashion
# shorthand (e.g. "tee" -> "t-shirt") with English contractions.
# NOTE(review): the keys are joined into a regex alternation in insertion order,
# so a short key listed before a longer key that starts with it
# (e.g. "it'd" before "it'd've") matches first — keep this in mind when editing.
con_appos = {
    "rep": "represent",
    "hoodie": "hooded sweatshirt",
    "tee": "t-shirt",
    "zip": "zip-up",
    "maxi": "maxi dress",
    "mini": "mini dress",
    "midi": "midi dress",
    "denim": "denim fabric",
    "chino": "chino pants",
    "cropped": "cropped length",
    "floral": "floral print",
    "striped": "striped pattern",
    "polka": "polka dot",
    "v-neck": "v-neckline",
    "crew-neck": "crew neckline",
    "slim": "slim fit",
    "loose": "loose fit",
    "bootcut": "bootcut style",
    "skinny": "skinny fit",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "we're": "we are",
    "to've": "to have"
}

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _stopword_pattern():
    """Build once the regex matching any English stop word except 'not'."""
    stopwords_list = [word for word in stopwords.words('english') if word != 'not']
    return re.compile(r'\b(' + r'|'.join(map(re.escape, stopwords_list)) + r')\b\s*')


def preprocessed_description(text):
    """Normalise a product description into a lowercase, lemmatized string.

    Steps, in order: lowercase, strip HTML tags, expand the abbreviations and
    contractions listed in `con_appos`, replace non-word characters and digits
    with spaces, collapse whitespace, remove English stop words (keeping
    'not'), and lemmatize each token.

    Args:
        text: The raw description; any value is accepted and passed through
            `str()` first.

    Returns:
        The cleaned description as a single space-separated string.
    """
    # Ensure the input is a string and make matching case-insensitive
    text = str(text).lower()

    # Remove HTML tags first: once the '<' and '>' characters have been
    # replaced by the special-character step below, tags can no longer be found.
    text = re.sub(r'<[^>]+>', ' ', text)

    # Change abbreviated words into full words. Longer keys are tried first so
    # that "it'd've" is not consumed as "it'd" followed by a stray "'ve", and
    # keys are escaped so they are always matched literally.
    keys = sorted(con_appos, key=len, reverse=True)
    pattern = re.compile(r'\b(' + r'|'.join(re.escape(key) for key in keys) + r')\b')
    text = pattern.sub(lambda match: con_appos[match.group()], text)

    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', ' ', text)

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove stopwords (the pattern is built once and reused across calls)
    text = _stopword_pattern().sub('', text)

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    text = ' '.join(lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text))

    return text

In [None]:
# Clean every product description with the text-preprocessing function
df['description'] = df['description'].map(preprocessed_description)

In [None]:
df['description'].iloc[0]

'color product medium grey heather quick drying sock daily workout leave sweat behind training sock cushioned heel toe keep comfortable pound pavement stretchy moisture wicking construction hug foot arch'

# Images preprocessing

In [None]:
df['images'].loc[0]

"['https://assets.adidas.com/images/w_600,f_auto,q_auto/52052736a9894fd8a968a831002d9dd0_9366/Athletic_Cushioned_No-Show_Socks_6_Pairs_Grey_BH9578_03_standard.jpg', 'https://assets.adidas.com/images/w_600,f_auto,q_auto/43fa1c06675c4a58adc5aca0018aab7a_9366/Athletic_Cushioned_No-Show_Socks_6_Pairs_Grey_BH9578_41_detail_hover.jpg', 'https://assets.adidas.com/images/w_600,f_auto,q_auto/1e46e7d880374fb7b4d6aca0018affa9_9366/Athletic_Cushioned_No-Show_Socks_6_Pairs_Grey_BH9578_42_detail.jpg', 'https://assets.adidas.com/images/w_600,f_auto,q_auto/39074b0b9d1a48dab24aaca1000046fd_9366/Athletic_Cushioned_No-Show_Socks_6_Pairs_Grey_BH9578_43_detail.jpg']"

Convert the image-link string into a list of URLs.

In [None]:
import ast

def convert_to_list(link_str):
    """Turn the raw `images` cell value into a list of URL strings.

    Accepted inputs:
      * a string holding a Python list literal, e.g. "['u1', 'u2']";
      * a string of URLs separated by "~";
      * a string holding a single bare URL (returned as a one-element list);
      * an existing list/tuple (returned cleaned, so re-running the cell is safe).

    Args:
        link_str: The cell value to convert.

    Returns:
        A list of stripped, non-empty URL strings, or None (after printing a
        message) when the value cannot be interpreted.
    """
    try:
        if isinstance(link_str, (list, tuple)):
            link_list = link_str
        elif not isinstance(link_str, str) or not link_str.strip():
            # NaN, None, numbers and blank strings carry no links
            raise ValueError("expected a non-empty string")
        elif link_str.lstrip().startswith('['):
            # Convert the string representation of a list to an actual list.
            # Checked before "~" so a "~" inside a listed URL is left intact.
            link_list = ast.literal_eval(link_str.strip())
            if not isinstance(link_list, (list, tuple)):
                raise ValueError("literal is not a list")
        else:
            # "~"-delimited links; a bare single URL yields a one-element list
            link_list = link_str.split("~")

        # Strip whitespace around each link and drop empty or non-string entries
        return [link.strip() for link in link_list
                if isinstance(link, str) and link.strip()]
    except (SyntaxError, ValueError):
        print(f"Error converting {link_str} to a list")
        return None

In [None]:
# Parse every raw image string; the function is passed directly, no wrapper needed
df['images'] = df['images'].apply(convert_to_list)

In [None]:
# Keep only the rows whose image value is not missing, then renumber the index
df = df[df['images'].notna()].reset_index(drop=True)

In [None]:
def filter_invalid_urls(image_urls):
    """Return the entries of `image_urls` that do not contain 'data:image'."""
    kept = []
    for candidate in image_urls:
        if 'data:image' in candidate:
            # Inline base64-encoded image rather than a link: skip it
            continue
        kept.append(candidate)
    return kept

# Apply the base64 filter to every row's image list
df['images'] = df['images'].apply(filter_invalid_urls)

In [None]:
import concurrent.futures
import requests

def is_valid_url(url):
    """Check that `url` is an http(s) URL that answers a HEAD request with 200.

    Args:
        url: The candidate URL. Non-string values are reported as invalid
            instead of raising.

    Returns:
        True when the URL, after following redirects, responds with status
        code 200; False for a malformed URL, any other status code, or a
        request error (a message is printed for the malformed and error cases).
    """
    # Check if the URL is a string that starts with a valid protocol
    if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
        print(f"Invalid URL format: {url}")
        return False

    try:
        # requests.head() does not follow redirects unless asked to, so a
        # redirected image would otherwise be reported as unreachable.
        # The timeout avoids long waits on unresponsive hosts.
        response = requests.head(url, timeout=5, allow_redirects=True)
        return response.status_code == 200
    except requests.RequestException as e:
        print(f"Error checking {url}: {e}")
        return False

def filter_valid_urls(image_urls):
    """Return a list holding only the first URL that passes `is_valid_url`.

    URLs are checked in order and checking stops at the first success. The
    result is an empty list when no URL in `image_urls` passes.
    """
    first_valid = next((url for url in image_urls if is_valid_url(url)), None)
    return [first_valid] if first_valid is not None else []

# Reduce every row's image list to its first reachable URL (one HEAD request per URL tried)
df['images'] = df['images'].apply(filter_valid_urls)

In [None]:
df['images'].iloc[345]

['https://static.nike.com/a/images/t_default/f727409b-e588-4825-899b-01111af7a30c/florida-back-2-school-menscrew-neck-long-sleeve-t-shirt-BzzQ7f.png']

In [None]:
# Save the preprocessed DataFrame to a pickle file (path is relative to the working directory)
df.to_pickle('preprocessed_data.pkl')

In [None]:
# Reload the DataFrame from the pickle file; only unpickle files from a trusted source
df = pd.read_pickle('preprocessed_data.pkl')

In [None]:
df.shape

(5918, 5)