In [4]:
import pandas as pd
import numpy as np

# Import required modules from scikit-learn
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors

# Import the SentenceTransformer model for text embeddings
from sentence_transformers import SentenceTransformer

# ------------------ Data Preprocessing & Model Building ------------------

# 1. Load the dataset
# The 'LM Internal Reference' column is used only for identification, not for training.
df = pd.read_excel('/Users/alguov/Desktop/Iron Hack/ML Project/Audit_media_cleaned.xlsx')

# Exclude rows where the reference starts with "48" or "49".
# .copy() detaches the filtered frame from the one that was loaded, so the
# column assignments below write to an independent DataFrame rather than to a
# slice (the pattern that triggers pandas' SettingWithCopyWarning).
df = df[~df['LM Internal Reference'].astype(str).str.startswith(('48', '49'))].copy()

# --- Imputation for Categorical Features ---
# Missing section/type become an explicit 'Unknown' category; a missing
# 'Letra de gama' becomes '' so the row is dropped by the filter below.
df['Sección'] = df['Sección'].fillna('Unknown')
df['Tipo'] = df['Tipo'].fillna('Unknown')
df['Letra de gama'] = df['Letra de gama'].fillna('')

# Exclude rows where 'Letra de gama' is empty (after stripping spaces)
df = df[df['Letra de gama'].str.strip() != '']

# Exclude rows with 'Letra de gama' values 'E', 'L', or 'S'.
# Copy again: the numeric imputation and text clean-up below assign columns
# on the result of this filter.
exclude_letters = ['E', 'L', 'S']
df = df[~df['Letra de gama'].isin(exclude_letters)].copy()

# --- Imputation for Numerical Features ---
# Missing values are replaced with the column mean computed on the rows that
# survived the filters above.
num_cols = [
    'Producto empaquetado: anchura (en cm)',
    'Producto empaquetado: altura (en cm)',
    'Producto empaquetado: profundidad (en cm)',
    'Peso neto (kg) para aduanas'
]
for col in num_cols:
    df[col] = df[col].fillna(df[col].mean())

# Reset the DataFrame index so that indices are consecutive; row i of the
# feature matrix built below then corresponds to index label i of df.
df = df.reset_index(drop=True)

# 2. Process Categorical Features: 'Sección' and 'Tipo'
encoder_cat = OneHotEncoder(sparse_output=False)
cat_features = encoder_cat.fit_transform(df[['Sección', 'Tipo']])

# 3. Process Numerical Features
scaler = StandardScaler()
num_features_scaled = scaler.fit_transform(df[num_cols])

# 4. Process Text Feature: 'Designación Administrativa' using Sentence Embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Missing descriptions are encoded as the empty string.
df['Designación Administrativa'] = df['Designación Administrativa'].fillna('').astype(str)
text_features = embedding_model.encode(df['Designación Administrativa'].tolist())

# 5. Process the 'Letra de gama' Feature
encoder_rank = OneHotEncoder(sparse_output=False)
rank_features = encoder_rank.fit_transform(df[['Letra de gama']])

# 6. Combine All Features into a Single Feature Vector per Product
# Note: The 'LM Internal Reference' column is not used for training.
features = np.hstack([cat_features, num_features_scaled, text_features, rank_features])

# 7. Build the k-Nearest Neighbors Model Using Cosine Similarity
k = 5  # Number of neighbors to recommend
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(features)

# Function to get recommendations based on a product reference
def get_recommendations(reference):
    """
    Return the products most similar to the one identified by `reference`.

    Parameters
    ----------
    reference
        Product reference; it is compared, as a string, against the
        'LM Internal Reference' column. If several rows match, the first
        one is used as the query product.

    Returns
    -------
    pandas.DataFrame or None
        Up to `k` rows with the columns 'LM Internal Reference', 'Sección'
        and 'Designación Administrativa', ordered from nearest to farthest
        neighbour and excluding the queried product itself. None when no
        row has the given reference.
    """
    # Work with row positions throughout: `features` is a plain array, so the
    # position of the match is what indexes both `features` and `df.iloc`.
    is_match = (df['LM Internal Reference'].astype(str) == str(reference)).to_numpy()
    positions = np.flatnonzero(is_match)
    if positions.size == 0:
        return None
    idx = int(positions[0])

    # The query vector is one of the fitted rows, so it comes back as its own
    # neighbour. Ask for one extra neighbour (capped at the number of fitted
    # rows) and drop the query row, so that up to k *other* products remain.
    n_query = min(k + 1, len(features))
    _, indices = knn.kneighbors([features[idx]], n_neighbors=n_query)
    neighbour_rows = [int(i) for i in indices[0] if i != idx][:k]

    # Return recommended products with their reference, section, and administrative designation
    columns = ['LM Internal Reference', 'Sección', 'Designación Administrativa']
    return df.iloc[neighbour_rows][columns]

# -------------------------- Flask Web App --------------------------

from flask import Flask, request, render_template_string

# Flask application object; the route below is registered on it via @app.route.
app = Flask(__name__)

# Simple HTML template for the web interface.
# It is a Jinja template rendered with two variables:
#   - error: when truthy, shown as a red paragraph under the form.
#   - recommendations: when truthy, iterated to fill one table row per item;
#     each item is indexed by the keys 'LM Internal Reference', 'Sección'
#     and 'Designación Administrativa'.
# The form posts back to the same URL with a single required field named 'ref'.
template = """
<!doctype html>
<html lang="en">
  <head>
    <title>Product Recommender</title>
    <style>
      table, th, td {
        border: 1px solid black;
        border-collapse: collapse;
        padding: 8px;
      }
    </style>
  </head>
  <body>
    <h1>Product Recommender</h1>
    <form method="POST">
      <label for="ref">Enter product reference:</label>
      <input type="text" id="ref" name="ref" required>
      <button type="submit">Get Recommendations</button>
    </form>
    {% if error %}
      <p style="color:red">{{ error }}</p>
    {% endif %}
    {% if recommendations %}
      <h2>Recommended Products:</h2>
      <table>
        <tr>
          <th>LM Internal Reference</th>
          <th>Sección</th>
          <th>Designación Administrativa</th>
        </tr>
        {% for row in recommendations %}
        <tr>
          <td>{{ row['LM Internal Reference'] }}</td>
          <td>{{ row['Sección'] }}</td>
          <td>{{ row['Designación Administrativa'] }}</td>
        </tr>
        {% endfor %}
      </table>
    {% endif %}
  </body>
</html>
"""

@app.route('/', methods=['GET', 'POST'])
def index():
    """
    Serve the recommender page.

    A GET request renders the empty form. A POST request reads the 'ref'
    form field, looks up recommendations for it and renders either the
    results table or an error message.

    Returns
    -------
    The rendered HTML page.
    """
    recommendations = None
    error = None
    if request.method == 'POST':
        # The field can be absent in a hand-crafted request, and pasted
        # references often carry stray whitespace; normalise both cases so an
        # otherwise valid reference is not reported as missing.
        ref = (request.form.get('ref') or '').strip()
        recs = get_recommendations(ref) if ref else None
        if recs is None or recs.empty:
            error = "Product reference not found."
        else:
            # The template iterates over a list of per-row dicts.
            recommendations = recs.to_dict(orient='records')
    return render_template_string(template, recommendations=recommendations, error=error)

if __name__ == '__main__':
    # Start the Flask server on port 5001 when this module is run directly.
    # Set use_reloader=False to avoid reloading conflicts (especially in environments like Jupyter)
    # NOTE(review): debug=True turns on Flask's debug mode; confirm this entry
    # point is only ever used for local development.
    app.run(debug=True, port=5001, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5001
Press CTRL+C to quit
127.0.0.1 - - [08/Mar/2025 12:24:28] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2025 12:24:28] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [08/Mar/2025 12:24:37] "POST / HTTP/1.1" 200 -
