In [None]:
import sys, os
sys.path.append(os.path.join(os.path.dirname('__file__'), '..', 'DB_and_Azure'))
import sql_db_functions as SQLf

### Get data from server

In [None]:
# Open a database session through the project's SQL helper; it hands back a
# connection object and a cursor, which the query cell below uses.
conn, cursor = SQLf.sql_db_functions.connect_sql()

In [None]:
# Product characteristics joined with their product record
# (product_characteristics.Brand_id = Products.Brand_Prod_id).
query = (
    "SELECT product_characteristics.id, Brand_id , Detail, Summary, Brand "
    "FROM product_characteristics "
    "INNER JOIN Products ON product_characteristics.Brand_id = Products.Brand_Prod_id ;"
)
cursor.execute(query)

# Materialise the whole result set and echo it as the cell output.
rows = cursor.fetchall()
rows

In [None]:
# Release the connection opened earlier before asking for a new one: rebinding
# `conn` / `cursor` without closing them first would leave the previous
# connection open with nothing left referring to it.
SQLf.sql_db_functions.close_connection_db(conn=conn, cursor=cursor)
conn, cursor = SQLf.sql_db_functions.connect_sql()

In [None]:
# One row per stored image link: (Brand_id, image_link).
query = "SELECT Brand_id, image_link FROM product_img ;"
cursor.execute(query)

# Fetch every row of the result set; the list is not echoed here
# (only its length is shown in the next cell).
rows_img = cursor.fetchall()

In [None]:
# Number of image rows returned by the query.
len(rows_img)

In [None]:
# Both result sets are now in memory, so hand the connection and cursor back
# to the project's helper for closing.
SQLf.sql_db_functions.close_connection_db(conn=conn,cursor=cursor)

### Organize data

In [None]:
import pandas as pd

In [None]:
# Column names follow the SELECT order of the product query:
# id -> Id, Brand_id -> prod_id, Detail -> descripcion, Summary, Brand.
df = pd.DataFrame(rows ,columns= ['Id','prod_id','descripcion', 'Summary','Brand'])

In [None]:
# Preview the first rows of the product frame.
df.head()

In [None]:
# Normalise the free-text description column.
#
# `regex=` is spelled out on every call because the default of
# `Series.str.replace` differs between pandas versions, and these three
# patterns mean very different things in each mode:
#   * '\n*' as a regex also matches the empty string (so the replacement would
#     be inserted between every character); it is meant as the literal
#     two-character sequence newline + '*'.
#   * r'\s+' only collapses whitespace when it is treated as a regex; as a
#     literal it never matches and the call silently does nothing.
#   * '*' alone is not a valid regex; it is meant as a literal asterisk.
df['descripcion'] = df['descripcion'].str.replace('\n*', ' / ', regex=False)
df['descripcion'] = df['descripcion'].str.replace(r'\s+', ' ', regex=True)
df['descripcion'] = df['descripcion'].str.replace('*', ' ', regex=False)

df.head(5)

In [None]:
# One text field per product, laid out as
# "<Summary> /Brand: <Brand> /<descripcion>".
df['Complete_description'] = (
    df['Summary']
    + ' /'
    + "Brand: "
    + df['Brand']
    + " /"
    + df['descripcion']
)
df.head()

In [None]:
# Three parallel lists, one entry per DataFrame row:
#   documents - the combined description text
#   id        - the row's Id rendered as a string
#   metadata  - {"source": <id string>} for the same row
# NOTE: the module-level name `id` hides Python's built-in id() for the rest of
# the notebook; it is kept unchanged because later cells may refer to it.
documents = df['Complete_description'].tolist()
id = df['Id'].astype('str').tolist()
metadata = [{"source": row_id} for row_id in id]

In [None]:
# Column names follow the SELECT order of the image query:
# Brand_id -> prod_id, image_link -> Image.
df_image = pd.DataFrame(rows_img, columns=['prod_id', 'Image'])

In [None]:
# Make every image link an absolute URL.
# Links that already start with an http(s) scheme are kept as they are; anything
# else (e.g. a protocol-relative '//host/path') gets 'https:' prepended.
# The scheme is checked at the start of the string rather than searching for
# 'https' anywhere in it, so that
#   * a scheme-less link is not skipped just because 'https' appears later in it, and
#   * an 'http://...' link is not turned into 'https:http://...'.
df_image['Image'] = df_image['Image'].apply(
    lambda x: x if x.startswith(('http://', 'https://')) else 'https:' + x
)

In [None]:
# Preview the image frame after the URL fix-up.
df_image.head()

In [None]:
# The raw result lists have been copied into `df` and `df_image`; drop the
# names so the lists can be garbage-collected.
del rows, rows_img

### Get images 

In [None]:
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
import base64
from fake_useragent import UserAgent

import time

In [None]:
def img_to_base64(url):
    """Download an image and return it re-encoded as a base64 data-URI string.

    The request is sent with a Chrome User-Agent string taken from
    ``fake_useragent`` and is retried (after a fixed pause) whenever
    ``requests`` raises or the server answers with an error status.

    Args:
        url: Address of the image. A URL ending in ``.png`` (case-insensitive)
            is re-encoded as PNG; every other URL is re-encoded as JPEG.

    Returns:
        ``"data:image/png;base64, <payload>"`` or
        ``"data:image/jpeg;base64, <payload>"`` (the space after the comma is
        part of the established output format and is kept on purpose), or the
        sentinel string ``'Retry'`` when every download attempt failed.

    Raises:
        Whatever ``Image.open`` / ``Image.save`` raise when the downloaded
        bytes cannot be decoded or re-encoded as an image.
    """
    ua = UserAgent()
    header = {'User-Agent': str(ua.chrome)}
    max_retries = 10
    retry_delay = 15  # in seconds

    for attempt in range(max_retries + 1):
        try:
            # No stream=True: the body is downloaded inside this call, so a
            # connection that breaks mid-transfer is retried like any other
            # request error instead of escaping later when the body is read.
            response = requests.get(url, headers=header, timeout=5)
            response.raise_for_status()  # Raise an exception for bad status codes
            break
        except (requests.exceptions.RequestException, ValueError):
            if attempt < max_retries:
                print(f"Attempt {attempt+1} failed. Retrying in {retry_delay} seconds...")
                print(url)
                time.sleep(retry_delay)
            else:
                print(f"All {max_retries} attempts failed. Giving up.")
                return 'Retry'

    img = Image.open(BytesIO(response.content))

    buffered = BytesIO()
    if url.lower().endswith('.png'):
        img.save(buffered, format="PNG")
        prefix = "data:image/png;base64, "
    else:
        # The JPEG writer rejects modes such as RGBA, LA or P (palette) with
        # "cannot write mode ... as JPEG"; convert those to RGB first and leave
        # the modes it can already write untouched.
        if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            img = img.convert("RGB")
        img.save(buffered, format="JPEG")
        prefix = "data:image/jpeg;base64, "

    # b64encode returns ASCII-only bytes; decode them instead of slicing repr().
    return prefix + base64.b64encode(buffered.getvalue()).decode("ascii")



In [None]:
# Order the rows by index, then renumber them 0..n-1 (the old index is dropped).
df_image = df_image.sort_index(ascending=True).reset_index(drop=True)
df_image.head()

In [None]:
# Download and encode every image in df_image, in row order.
# The loop has to cover all len(df_image) rows (stopping at len - 1 would skip
# the last image): the list is assigned as a DataFrame column afterwards, which
# needs exactly one value per row.
data_embedded = []

for i in range(len(df_image)):

    current_image = df_image.Image.iloc[i]

    v = img_to_base64(current_image)

    # append() instead of `list + [v]`, which copies the whole list every time.
    data_embedded.append(v)

    # Lightweight progress indicator: print every 10th position.
    if i % 10 == 0: print(i)



In [None]:
# Spot-check: echo the encoded string stored at position 200.
data_embedded[200]

### Testing embeddings

In [None]:
# Store the encoded strings next to their source rows; the list is matched to
# the rows by position.
df_image['base64'] = data_embedded

In [None]:
# Show the first row's encoded string without its first 23 characters
# (23 is the length of 'data:image/jpeg;base64,').
df_image.iloc[0].base64[23:]

In [None]:
def base64_to_image(base64_string):
    """Decode a base64 string, with or without a data-URI prefix, into raw bytes.

    When the text contains ``"data:image"``, only the part between the first
    and second comma is decoded; otherwise the whole string is decoded.
    """
    has_data_uri_prefix = "data:image" in base64_string
    payload = base64_string.split(",")[1] if has_data_uri_prefix else base64_string
    return base64.b64decode(payload)

In [None]:
# Decode the first row by hand: skip the first 23 characters (the length of
# 'data:image/jpeg;base64,') and base64-decode the remainder.
image_bytes = base64.b64decode(df_image.iloc[0].base64[23:])

In [None]:
def create_image_from_bytes(image_bytes):
    """Open raw image bytes with Pillow via an in-memory stream.

    Returns the object produced by ``Image.open``.
    """
    return Image.open(BytesIO(image_bytes))

In [None]:
# Preview the frame now that it carries the `base64` column.
df_image.head()

In [None]:
# Print the index label of every row whose download ended in the 'Retry' sentinel.
retry_mask = df_image['base64'] == 'Retry'
for i in df_image.index[retry_mask]:
    print(i)

In [None]:
# Second pass: download again only the rows whose first attempt ended in the
# 'Retry' sentinel. Results are collected in the same order as the matching
# index labels.
data_embedded = []


for i in df_image[df_image['base64'] == 'Retry'].index:

    # `i` is an index label, so look the URL up by label (.loc) rather than by
    # position (.iloc); the two only coincide while the index is 0..n-1.
    current_image = df_image.Image.loc[i]

    v = img_to_base64(current_image)

    # append() instead of `list + [v]`, which copies the whole list every time.
    data_embedded.append(v)

    print(i)


In [None]:
# Write each retried result back to the row it was downloaded for.
# The target rows are looked up from the 'Retry' sentinel instead of being
# typed in by hand, so this cell stays correct whichever rows failed on a run.
# `data_embedded` holds the retried results in the same order as these labels.
retry_labels = df_image.index[df_image['base64'] == 'Retry']
if len(retry_labels) != len(data_embedded):
    raise ValueError(
        f"{len(retry_labels)} rows are marked 'Retry' but {len(data_embedded)} "
        "retried results are available; re-run the retry cell first."
    )
for label, encoded in zip(retry_labels, data_embedded):
    df_image.loc[label, 'base64'] = encoded

In [None]:
# Check that rows 33, 38 and 52 hold the 1st, 2nd and 3rd retried result.
for retry_position, row_position in enumerate((33, 38, 52)):
    print(df_image.iloc[row_position].base64 == data_embedded[retry_position])

### Save embeddings in the DB

In [None]:
# Eyeball a slice of the frame (positions 200-219) before writing to the database.
df_image.iloc[200:220]

In [None]:
import psycopg2

In [None]:
# Open a new connection and cursor for the write-back below (the earlier ones
# were closed after the data was read).
conn, cursor = SQLf.sql_db_functions.connect_sql()

In [None]:
# Persist each row's encoded image in product_img.base64.
# The value is held in `encoded_image` rather than in a variable called
# `base64`, which would overwrite the imported `base64` module in the notebook
# namespace and break every later `base64.b64decode(...)` call.
# NOTE(review): the WHERE clause matches product_img.id against the DataFrame
# index, but the SELECT that built df_image did not fetch `id` - confirm that
# the ids really line up with the row labels.
for index, row in df_image.iterrows():
    encoded_image = row['base64']

    # Update the base64 column in the product_img table
    cursor.execute("UPDATE product_img SET base64 = %s WHERE id = %s", (encoded_image, index))

In [None]:
# Read the table back on the same cursor to inspect what was written.
query = "SELECT * FROM product_img ;"
cursor.execute(query)

# Fetch every row and echo the full list as the cell output.
rows_img = cursor.fetchall()
rows_img

In [None]:
# Commit the transaction on `conn` so the UPDATE statements issued above are saved.
conn.commit()