In [1]:
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm
import torch
from sentence_transformers import SentenceTransformer

# Register tqdm with pandas so that .progress_apply() is available as a
# progress-bar variant of .apply() (plain .apply() calls are not affected)
tqdm.pandas()

In [2]:
# Use GPU if available
# Reference: https://pytorch.org/get-started/locally/

# torch.backends.mps is not present on every PyTorch build, so look it up
# defensively: a missing attribute should fall through to the CPU branch
# instead of raising AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)

# Check for NVIDIA GPU
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use CUDA (NVIDIA GPU)
    print("Using NVIDIA GPU (CUDA)")

# Check for Mac Silicon GPU (MPS)
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")  # Use Metal Performance Shaders (Mac Silicon GPU)
    print("Using Mac GPU (MPS)")

# Default to CPU if no GPU is available
else:
    device = torch.device("cpu")
    print("Using CPU")

Using Mac GPU (MPS)


In [6]:
# Load the sticker-description table of each source (Alexator, flaticon, freepik)
alexator_df, flaticon_df, freepik_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Alexator/alexator_stickers_desc.csv',
        '../data/flaticon/flaticon_stickers_desc.csv',
        '../data/freepik/freepik_stickers_desc.csv',
    )
)

In [7]:
# Print the column summary (non-null counts and dtypes) of the Alexator table
alexator_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4331 entries, 0 to 4330
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   filename     4331 non-null   object
 1   description  4331 non-null   object
dtypes: object(2)
memory usage: 67.8+ KB


In [8]:
# Print the column summary (non-null counts and dtypes) of the flaticon table
flaticon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30402 entries, 0 to 30401
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   image        30402 non-null  object
 1   title        30402 non-null  object
 2   tags         30402 non-null  object
 3   filename     30402 non-null  object
 4   description  30402 non-null  object
dtypes: object(5)
memory usage: 1.2+ MB


In [9]:
# Print the column summary (non-null counts and dtypes) of the freepik table
freepik_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9844 entries, 0 to 9843
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        0 non-null      float64
 1   tags         9814 non-null   object 
 2   image        9844 non-null   object 
 3   filename     9844 non-null   object 
 4   description  9844 non-null   object 
dtypes: float64(1), object(4)
memory usage: 384.7+ KB


In [10]:
# Stack the three source tables on top of each other (row-wise) and
# renumber the rows with a fresh 0..n-1 index
merged_df = pd.concat(
    objs=(alexator_df, flaticon_df, freepik_df),
    axis=0,
    ignore_index=True,
)

# Summarise the columns of the combined table
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44577 entries, 0 to 44576
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   filename     44577 non-null  object
 1   description  44577 non-null  object
 2   image        40246 non-null  object
 3   title        30402 non-null  object
 4   tags         40216 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


## Handling duplicates in description and tags combination


In [11]:
# Replace duplicate descriptions with Null (Because most of them don't make sense and are incorrect).
# keep=False flags every row whose description occurs more than once (not only the
# later copies), so all copies are blanked; the rows themselves are kept.
duplicate_descriptions = merged_df['description'].duplicated(keep=False)
merged_df.loc[duplicate_descriptions, 'description'] = None

"""Some generated descriptions have errors such as "a big boy boy boy boy boy...."
So, we replace any description that contains such consecutively repeated words with Null
"""

"""
Reference: ChatGPT-4o
Prompt:  I need to check for a certain type of error and remove those rows in my dataframe. The error is that in some description, certain words are repeated multiple time consecutively. Example: "happy man with two polar polar polar polar polar polar polar polar polar polar polar polar polar polar polar polar polar"
"""
# Helper for finding descriptions with consecutively repeated words,
# so that those descriptions can be replaced with Null

# One word followed by at least two more whitespace-separated copies of itself
# (i.e. the same word 3+ times in a row), matched case-insensitively.
_REPEATED_WORD_RE = re.compile(r'\b(\w+)(?:\s+\1){2,}\b', re.IGNORECASE)


def has_repeated_words(text):
    """Return True if ``text`` contains a word repeated 3+ times in a row.

    Example: "happy man with two polar polar polar" -> True,
    while "very very good" (only two repetitions) -> False.

    Args:
        text: The description to check. Anything that is not a ``str``
            (None, NaN, pd.NA, numbers, ...) is treated as "no text".

    Returns:
        bool: True when a consecutively repeated word is found, else False.
    """
    # Only real strings can be searched; null markers and other non-string
    # values are reported as "not repeated" instead of raising TypeError.
    if not isinstance(text, str):
        return False

    return _REPEATED_WORD_RE.search(text) is not None

# Blank out every description that has_repeated_words flags
repeated_word_mask = merged_df['description'].apply(has_repeated_words)
merged_df.loc[repeated_word_mask, 'description'] = None

# Keep only the first row of each (tags, description) pair
# (we don't want different images with the same details)
cleaned_df = merged_df.drop_duplicates(
    subset=['tags', 'description'],
    keep='first',
)

# Report how many rows are left
print(f"Number of rows after removing rows with exact same tags and descriptions: {cleaned_df.shape[0]}")

Number of rows after removing rows with exact same tags and descriptions: 38807


In [12]:
# Print the column summary (non-null counts and dtypes) of the de-duplicated table
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38807 entries, 0 to 44576
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   filename     38807 non-null  object
 1   description  26858 non-null  object
 2   image        35101 non-null  object
 3   title        27135 non-null  object
 4   tags         35091 non-null  object
dtypes: object(5)
memory usage: 1.8+ MB


## Handling Tags and Descriptions

In [13]:
def remove_duplicates(text):
    """Normalise a comma-separated tag string into sorted, unique tags.

    Example: "dog, cat, dog" -> "cat, dog".

    Args:
        text: Comma-separated tags. Non-string values (None, NaN, ...) and
            the literal string "nan" (any case, surrounding spaces ignored)
            are treated as "no tags".

    Returns:
        str: The distinct tags, stripped of surrounding whitespace, sorted
        and joined with ", ". Returns "" when there are no tags.
    """
    if not isinstance(text, str) or text.strip().lower() == "nan":
        return ""  # Return empty string for NaN or "nan" strings

    unique_tags = {tag.strip() for tag in text.split(",")}
    # Stray commas ("cat,, dog, ") produce empty tokens; those are not tags
    # and would otherwise show up as a leading ", " in the result.
    unique_tags.discard("")

    return ', '.join(sorted(unique_tags))

In [14]:
# Build final_df from cleaned_df with the tags of every row de-duplicated
# (progress bar via tqdm); cleaned_df itself is left untouched
final_df = cleaned_df.assign(
    tags=cleaned_df["tags"].progress_apply(remove_duplicates)
)

100%|██████████| 38807/38807 [00:00<00:00, 502343.56it/s]


In [None]:
# TODO: spelling correction still needs to be applied to the description column here as well

## Embedding using Sentence BERT