## Modelo FRANCES 1.4

### Características de este modelo:
* Análisis de existencia en la base de datos
* Identificación de propiedades únicas acumuladas
* Algoritmo de reducción de dimensionalidad UMAP para análisis de similitud
* Entrega sustituciones hechas y lista final 100% natural

### Nuevo:
* Entrega de múltiples alternativas de listas de ingredientes basadas en distintas métricas de similitud

In [None]:
import pandas as pd
import numpy as np
import os 
import re
import umap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [None]:
# Display the current working directory; the data files below are opened with relative paths.
os.getcwd()

'/Volumes/TFFG/WORK/DR. FRANCES/Códigos/Modelos'

In [None]:
# Alternative project root, kept for reference (presumably for a different machine/drive):
# os.chdir("D:/")
# Switch to the project root so the relative "Bases de datos/..." paths used below resolve.
os.chdir("/Volumes/TFFG/WORK/DR. FRANCES/")

In [None]:
# Load the Ingredients Matrix from Excel (path relative to the working directory) and preview the first rows.
data = pd.read_excel("Bases de datos/Oficiales/Ingredients Matrix.xlsx")
data.head()

Unnamed: 0,Ingredients,Properties,Natural,skin soothing,solvent,exfoliant,emollient,moisturizer,emulsifier,sunscreen agent,...,chelating,film forming,skin-protecting,skin brightening,foam boosting,hair conditioning,antiseptic,estructural alto,estructural bajo,propiedad particular
0,"1, 2-hexanediol","['solvent', 'preservative']",0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,3-o-ethyl ascorbic acid,"['antioxidant', 'skin brightening']",0,0,0,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,1
2,behenic acid,['emollient'],1,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,acacia farnesiana flower extract,['fragrance'],1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,acer saccharum (sugar maple) extract,"['exfoliant', 'skin conditioning']",1,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


## Reducción de dimensionalidad UMAP

### Hacemos esto para que el modelo tenga otra forma de medir la similitud entre los ingredientes si es que queremos probar.

### Además, nos permite hacer un análisis de los ingredientes existentes en la base y descubrir clusters.

In [None]:
# Keep the column labels of the matrix and display how many columns it has.
indices_columnas = data.columns
len(indices_columnas)

37

In [None]:
# UMAP transformation: embed the matrix columns into 2 dimensions.
# random_state is fixed to seed the embedding.
umap_model = umap.UMAP(n_components=2, random_state=42, min_dist=0.1)
# Positional slice: everything from the fourth column onward is used as features
# (presumably this skips Ingredients, Properties and Natural — confirm against the Excel layout).
data_fit = data.iloc[:, 3:]
umap_features = umap_model.fit_transform(data_fit)
# Store the two embedding coordinates on a copy so `data` keeps its original columns.
data_umap = data.copy()
data_umap['UMAP_X'] = umap_features[:, 0]
data_umap['UMAP_Y'] = umap_features[:, 1]
data_umap.head(5)



  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Unnamed: 0,Ingredients,Properties,Natural,skin soothing,solvent,exfoliant,emollient,moisturizer,emulsifier,sunscreen agent,...,skin-protecting,skin brightening,foam boosting,hair conditioning,antiseptic,estructural alto,estructural bajo,propiedad particular,UMAP_X,UMAP_Y
0,"1, 2-hexanediol","['solvent', 'preservative']",0,0,4,0,0,0,0,0,...,0,0,0,0,0,1,1,0,7.751597,8.727932
1,3-o-ethyl ascorbic acid,"['antioxidant', 'skin brightening']",0,0,0,0,0,0,0,0,...,0,4,0,0,0,0,0,1,12.201508,13.943052
2,behenic acid,['emollient'],1,0,0,0,4,0,0,0,...,0,0,0,0,0,1,0,0,7.855488,11.632416
3,acacia farnesiana flower extract,['fragrance'],1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,-22.888926,-4.807378
4,acer saccharum (sugar maple) extract,"['exfoliant', 'skin conditioning']",1,0,0,3,0,0,0,0,...,0,0,0,0,0,1,0,1,0.221697,-6.619018


### Usamos BOKEH para ver la distribución de manera interactiva

In [None]:
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
# Route Bokeh output to the notebook
output_notebook()

# Make a data source from the UMAP-augmented frame and a scatter plot of its two UMAP coordinates
source = ColumnDataSource(data_umap)
plot = figure(x_axis_label = "UMAP 1", 
              y_axis_label = "UMAP 2",
              width = 500, height = 400)
# "UMAP_X" / "UMAP_Y" are column names looked up in `source`
plot.circle(x = "UMAP_X", 
    y = "UMAP_Y",
    source = source, 
    size = 10, color = '#FF7373', alpha = .8)

In [None]:
# Create a HoverTool object for hovering and seeing properties.
# Each tooltip is a (label, '@column') pair; the '@' names match columns of data_umap,
# which backs the plot's data source.
hover = HoverTool(tooltips = [('Ingredient','@Ingredients'),
                              ('Natural','@Natural'),
                              ('Properties','@Properties')])
# Attach the hover tool to the scatter plot
plot.add_tools(hover)

In [None]:
# Render the interactive UMAP scatter plot (with the hover tool added to `plot`)
show(plot)

# Comienzo del algoritmo de Análisis y Recomendación de Ingredientes 
#

# Funciones de Limpieza

In [None]:
#Importamos el diccionario

import json

def load_json(file_path):
    """Read the JSON file at `file_path` (opened as UTF-8 text) and return its parsed content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Path of the dictionary to import
## The dictionary maps the different names used for the same ingredient to its standard name.
file_path = 'Bases de datos/Oficiales/Diccionario de Reemplazo.json'
replace_dict = load_json(file_path)

In [None]:
# Preprocess, clean and standardize a raw ingredient list. The three steps are nested
# helpers so they can share `replace_dict` and `data`.

def process_ingredients(input_ingredients, replace_dict, data):
    """Convert a raw comma-separated ingredient string into standardized names.

    Steps: split and normalize the text, map alternative names to their
    standard name through `replace_dict`, and report (via `print`) which
    names are absent from the `Ingredients` column of `data`.

    Args:
        input_ingredients: Raw ingredient list as a single string.
        replace_dict: Mapping from alternative ingredient names to the
            standard name; names without an entry are kept unchanged.
        data: DataFrame with an `Ingredients` column.

    Returns:
        List of standardized ingredient names in input order. Names missing
        from `data` are reported but still returned. An empty list is
        returned (after printing a message) when `input_ingredients` is not
        a string.
    """
    # Non-string input cannot be parsed; return an empty list so the caller
    # can simply test the result for emptiness.
    if not isinstance(input_ingredients, str):
        print("Please enter a valid list of ingredients")
        return []

    # 1st helper: split the raw string and normalize every ingredient name
    def standardize_ingredients(ingredients):
        pieces = []
        buffer = []
        depth = 0  # nesting level of parentheses; commas inside them do not split

        for i, char in enumerate(ingredients):
            if char == '(':
                depth += 1
            elif char == ')':
                # Clamp at zero so a stray ')' cannot disable splitting for
                # the rest of the string.
                depth = max(depth - 1, 0)

            if char == ',' and depth == 0:
                # A comma written exactly as "1, 2" belongs to a name such as
                # "1, 2-hexanediol" and must not split the ingredient.
                inside_name = i > 0 and ingredients[i - 1:i + 3] == '1, 2'
                if not inside_name:
                    pieces.append(''.join(buffer))
                    buffer = []
                    continue

            buffer.append(char)

        # Flush whatever follows the last separator
        if buffer:
            pieces.append(''.join(buffer))

        standardized = []
        for piece in pieces:
            name = piece.strip().lower().replace('*', '').replace('.', '')
            name = re.sub(r'\s{2,}', ' ', name)        # collapse repeated whitespace
            name = re.sub(r'(?<=\S)\(', ' (', name)    # ensure a space before '('
            name = re.sub(r'\)(?=\S)', ') ', name)     # ensure a space after ')'
            # Strip again: removing '*' or '.' can leave whitespace at the ends
            name = name.strip()
            # Skip empty entries produced by stray or trailing commas
            if name:
                standardized.append(name)
        return standardized

    # 2nd helper: replace ingredient names according to the replacement dictionary
    def replace_ingredients(ingredients_list):
        return [replace_dict.get(ingredient, ingredient) for ingredient in ingredients_list]

    # 3rd helper: report which ingredients are absent from the database
    def check_if_present(ingredients_list):
        known_ingredients = set(data['Ingredients'])
        missing_ingredients = [
            ingredient for ingredient in ingredients_list
            if ingredient not in known_ingredients
        ]
        if missing_ingredients:
            print(f"\nThe following ingredients can't be found in the Ingredients Matrix: {', '.join(missing_ingredients)}")
        else:
            print("\nAll the ingredients are present in the Ingredients Matrix")
        return ingredients_list

    # Run the three steps in order
    standardized_ingredients = standardize_ingredients(input_ingredients)
    replaced_ingredients = replace_ingredients(standardized_ingredients)
    final_ingredients = check_if_present(replaced_ingredients)

    # Result of the standardization
    return final_ingredients

## Función de Análisis de la Lista de Ingredientes dada

In [None]:
# Analysis of the given ingredient list: natural ingredients, non-natural ingredients and properties present
def list_analisis(ingredients_list, data):
    """Print a summary of `ingredients_list` based on the matrix `data`.

    Prints which listed ingredients have `Natural == 1`, which have
    `Natural == 0`, and the columns in positions 3 to -3 (exclusive) that
    hold a value >= 1 for at least one listed ingredient. Ingredients that
    are not in `data['Ingredients']` are left out of the output.

    Args:
        ingredients_list: Ingredient names to analyze.
        data: DataFrame with `Ingredients` and `Natural` columns.

    Returns:
        None; the results are only printed.
    """
    # Names known as natural / non-natural in the matrix
    natural_names = set(data.loc[data['Natural'] == 1, 'Ingredients'])
    non_natural_names = set(data.loc[data['Natural'] == 0, 'Ingredients'])

    # Classify the listed ingredients; a name flagged both ways counts as non-natural
    natural_ingredients = []
    non_natural_ingredients = []
    for name in ingredients_list:
        if name in non_natural_names:
            non_natural_ingredients.append(name)
        elif name in natural_names:
            natural_ingredients.append(name)

    print("Analisys of the ingredients in the list:\n")
    print(f"Natural ingredients: {', '.join(natural_ingredients)}\n")
    print(f"Non-natural ingredients: {', '.join(non_natural_ingredients)}")

    # Collect, without repetition, the properties of the listed ingredients
    matched_rows = data[data['Ingredients'].isin(ingredients_list)]
    if matched_rows.empty:
        return

    property_block = matched_rows.iloc[:, 3:-3]
    is_present = (property_block >= 1).any(axis=0)
    true_properties = property_block.columns[is_present].tolist()
    print (f"\nThe unique properties of the given ingredients are: {true_properties}")


## Función de Reemplazo por Similitud
#### Entrega 10 recomendaciones de listas de ingredientes, ordenadas de mayor a menor similitud de coseno en el reemplazo.

In [None]:
import pandas as pd
from scipy.spatial.distance import cdist

# 1st function: classify the ingredients of the given list as natural or artificial
def get_natural_ingredients(ingredients, matrix):
    """Split `ingredients` into natural and artificial ones using `matrix`.

    Args:
        ingredients: Iterable of ingredient names.
        matrix: DataFrame with `Ingredients` and `Natural` columns.

    Returns:
        Tuple `(natural_ingredients, artificial_ingredients)`; each list keeps
        the input order. An ingredient is natural when its `Natural` value
        equals 1 and artificial otherwise. Ingredients that do not appear in
        `matrix` cannot be classified and are left out of both lists.
    """
    # Build the name -> flag lookup once instead of scanning the frame per
    # ingredient; setdefault keeps the first row when a name is duplicated.
    natural_flags = {}
    for name, flag in zip(matrix['Ingredients'], matrix['Natural']):
        natural_flags.setdefault(name, flag)

    natural_ingredients = []
    artificial_ingredients = []
    for ingredient in ingredients:
        if ingredient not in natural_flags:
            # Unknown ingredient: skip it rather than fail on an empty lookup
            continue
        if natural_flags[ingredient] == 1:
            natural_ingredients.append(ingredient)
        else:
            artificial_ingredients.append(ingredient)
    return natural_ingredients, artificial_ingredients

# 2nd function: get the property vector of each of the given artificial ingredients
def get_property_vectors(artificial_ingredients, matrix):
    """Return a dict mapping each artificial ingredient to its property vector.

    The vector is the first matching row's values for the columns in
    positions 2 to -3 (exclusive) followed by the values of the last three
    columns.
    NOTE(review): position 2 appears to be the `Natural` flag itself, so it
    becomes part of the vector — confirm this is intended.

    Args:
        artificial_ingredients: Ingredient names; each must exist in
            `matrix['Ingredients']`.
        matrix: Ingredients matrix DataFrame.

    Returns:
        Dict `{ingredient: list of values}`.
    """
    property_columns = matrix.columns[2:-3]
    role_columns = matrix.columns[-3:]

    vectors = {}
    for name in artificial_ingredients:
        row_mask = matrix['Ingredients'] == name
        property_values = matrix.loc[row_mask, property_columns].values[0]
        role_values = matrix.loc[row_mask, role_columns].values[0]
        vectors[name] = [*property_values, *role_values]
    return vectors


# 3rd function: using a distance metric (cosine), score the similarity between each
# artificial ingredient and the natural ingredients of the matrix.
## Keeps the `top_n` most similar natural ingredients (10 by default) per artificial ingredient.
def get_replacements(property_vectors, matrix, top_n=10):
    """Find the most similar natural ingredients for each artificial one.

    Args:
        property_vectors: Dict `{ingredient: vector}`; each vector must have
            as many entries as `matrix.columns[2:]`.
        matrix: DataFrame with `Ingredients` and `Natural` columns; the
            columns from position 2 onward are used as the comparison vectors.
        top_n: Maximum number of replacements kept per ingredient.

    Returns:
        Dict `{ingredient: [natural ingredient names]}` ordered from smallest
        to largest cosine distance. A list is shorter than `top_n` when the
        matrix has fewer natural rows.
    """
    # The natural candidates are the same for every ingredient: compute them once.
    natural_rows = matrix[matrix['Natural'] == 1]
    natural_vectors = natural_rows[matrix.columns[2:]].values
    natural_names = natural_rows['Ingredients'].tolist()

    replacements = {}
    for ingredient, vector in property_vectors.items():
        distances = cdist([vector], natural_vectors, metric='cosine')[0]
        # Stable sort: candidates at the same distance keep their matrix
        # order, so the ranking is reproducible.
        top_indices = distances.argsort(kind='stable')[:top_n]
        replacements[ingredient] = [natural_names[i] for i in top_indices]
    return replacements

# 4th function: build the recommended ingredient lists from the replacements of the 3rd function.
## For each rank (1st best replacement, 2nd best, ...) it copies the original formula and
## swaps every artificial ingredient for its replacement of that rank.
def create_recommendations(ingredients, replacements, n_recommendations=10):
    """Create alternative ingredient lists with artificial ingredients replaced.

    Args:
        ingredients: Original list of ingredient names (left unmodified).
        replacements: Dict `{artificial ingredient: [replacement names]}`,
            best replacement first.
        n_recommendations: Maximum number of lists to build.

    Returns:
        List of ingredient lists. List number `k` uses the `k`-th replacement
        of every ingredient found in `replacements`. The number of lists is
        capped by the shortest replacement list in use, so an ingredient with
        fewer candidates than `n_recommendations` no longer causes an
        IndexError.
    """
    # Only the replacement lists of ingredients actually present limit the count
    available = [len(replacements[item]) for item in ingredients if item in replacements]
    count = min([n_recommendations, *available])

    recommendations = []
    for rank in range(count):
        recommendation = [
            replacements[item][rank] if item in replacements else item
            for item in ingredients
        ]
        recommendations.append(recommendation)
    return recommendations


# 5th function: print the results
def print_results(ingredients, natural_ingredients, artificial_ingredients, replacements, recommendations):
    """Print the classification, the replacements and the numbered recommendations.

    Args:
        ingredients: Original ingredient list (not used by the report).
        natural_ingredients: Names classified as natural.
        artificial_ingredients: Names classified as artificial.
        replacements: Dict `{artificial ingredient: [replacement names]}`.
        recommendations: Recommended ingredient lists, printed numbered from 1.

    Returns:
        None; everything is written with `print`.
    """
    print("RECOMENDACIÓN PARA REPLICAR EL PRODUCTO ENTREGADO\n")
    print(f"Ingredientes naturales:  {natural_ingredients!s}")
    print(f"\nIngredientes artificiales:  {artificial_ingredients!s}")
    print("\nReemplazos:\n")
    for artificial_name, candidates in replacements.items():
        joined_candidates = ', '.join(candidates)
        print(f"\nIngrediente artificial: '{artificial_name}' -> {joined_candidates}")
    print("\nTop 10 recomendaciones:\n")
    for position, option in enumerate(recommendations, start=1):
        print(f"{position}. {option}\n")


# Celdas de Ejecución

## Ingresar y limpiar lista de ingredientes

In [None]:
# Ask the user for the ingredient list (kept as one string; splitting is done by process_ingredients)
test_list = input("Please enter your ingredients list: \n")#.split(",")

# Clean and standardize the list with process_ingredients
clean_list = process_ingredients(test_list, replace_dict, data)

# A falsy (empty) result means no ingredients were obtained from the input
if not clean_list:
  print("Your list has problems, please correct it and try again.")
else:
  print(f"\nYour clean and standardized list of ingredients is: {clean_list}")


All the ingredients are present in the Ingredients Matrix

Your clean and standardized list of ingredients is: ['water', 'synthetic wax', 'lactic acid', 'cetearyl alcohol', 'hydrated silica', 'cetyl alcohol', 'stearyl alcohol', 'helianthus annuus (sunflower) seed cera/extract/oil', 'glycerin', 'sodium hydroxide', 'fragrance', 'lauryl laurate', 'salicylic acid', 'xanthan gum', 'hydroxyethyl acrylate/sodium acryloyl dimethyl taurate copolymer', 'ethylhexyl glycerin', 'sodium benzoate', 'glycolic acid', 'passiflora edulis seed powder (passion fruit/maracuja)', 'potassium sorbate', 'citric acid', 'bambusa arundinacea stem extract', 'sodium phytate', 'ethyl brassilate', 'sorbitan isostearate', 'vaccinium myrtillus (bilberry) fruit extract', 'ananas sativus (pineapple) fruit extract', 'saccharum officinarum (sugar cane) extract', 'carica papaya (papaya) fruit extract', 'mangifera indica (mango) fruit extract', 'citrus aurantium dulcis (orange) fruit extract /oil', 'citrus limon (lemon) frui

## Analizar los ingredientes

In [None]:
# Print the natural / non-natural split and the properties found for the cleaned list
list_analisis(clean_list, data)


Analisys of the ingredients in the list:

Natural ingredients: water, lactic acid, cetearyl alcohol, hydrated silica, cetyl alcohol, stearyl alcohol, helianthus annuus (sunflower) seed cera/extract/oil, glycerin, lauryl laurate, passiflora edulis seed powder (passion fruit/maracuja), citric acid, bambusa arundinacea stem extract, sodium phytate, vaccinium myrtillus (bilberry) fruit extract, ananas sativus (pineapple) fruit extract, saccharum officinarum (sugar cane) extract, carica papaya (papaya) fruit extract, mangifera indica (mango) fruit extract, citrus aurantium dulcis (orange) fruit extract /oil, citrus limon (lemon) fruit extract, haematococcus pluvialis oil, leuconostoc ferment filtrate/radish root, lactobacillus ferment, acer saccharum (sugar maple) extract, astaxanthin, rosmarinus officinalis (rosemary) extract

Non-natural ingredients: synthetic wax, sodium hydroxide, fragrance, salicylic acid, xanthan gum, hydroxyethyl acrylate/sodium acryloyl dimethyl taurate copolymer, e

## Algoritmo de Recomendación

In [None]:
# Run the recommendation pipeline
# 1) Split the cleaned list into natural and artificial ingredients
natural_ingredients, artificial_ingredients = get_natural_ingredients(clean_list, data)

# 2) Build the property vector of every artificial ingredient
property_vectors = get_property_vectors(artificial_ingredients, data)

# 3) Rank natural replacements for each artificial ingredient
replacements = get_replacements(property_vectors, data)

# 4) Build the alternative ingredient lists from those replacements
recommendations = create_recommendations(clean_list, replacements)

# 5) Print the full report
print_results(clean_list, natural_ingredients, artificial_ingredients, replacements, recommendations)


RECOMENDACIÓN PARA REPLICAR EL PRODUCTO ENTREGADO

Ingredientes naturales:  ['water', 'lactic acid', 'cetearyl alcohol', 'hydrated silica', 'cetyl alcohol', 'stearyl alcohol', 'helianthus annuus (sunflower) seed cera/extract/oil', 'glycerin', 'lauryl laurate', 'passiflora edulis seed powder (passion fruit/maracuja)', 'citric acid', 'bambusa arundinacea stem extract', 'sodium phytate', 'vaccinium myrtillus (bilberry) fruit extract', 'ananas sativus (pineapple) fruit extract', 'saccharum officinarum (sugar cane) extract', 'carica papaya (papaya) fruit extract', 'mangifera indica (mango) fruit extract', 'citrus aurantium dulcis (orange) fruit extract /oil', 'citrus limon (lemon) fruit extract', 'haematococcus pluvialis oil', 'leuconostoc ferment filtrate/radish root', 'lactobacillus ferment', 'acer saccharum (sugar maple) extract', 'astaxanthin', 'rosmarinus officinalis (rosemary) extract']

Ingredientes artificiales:  ['synthetic wax', 'sodium hydroxide', 'fragrance', 'salicylic acid', '