In [9]:
import pandas as pd
import joblib
from sklearn.metrics.pairwise import euclidean_distances
import ast

class SeedRecommender:
    """Rank seeds by how close their growing conditions are to a query point.

    The query is scaled, assigned to a main k-means cluster and then to a
    sub-cluster of that main cluster. The (main, sub) pair is translated via
    ``subcluster_map`` into the id stored in the table's ``main_cluster``
    column, and the seeds carrying that id are ranked by Euclidean distance
    to the query in scaled feature space.
    """

    def __init__(self, data_path, scaler_path, main_kmeans_path, sub_kmeans_models_path):
        """Load the seed table and the fitted scaler / k-means artifacts.

        Args:
            data_path: Path to a pickled DataFrame. The code reads the columns
                ``growing_countries``, ``main_cluster``, ``seed_id`` and the
                six feature columns listed in ``X_columns``.
            scaler_path: Path to a joblib file holding an object with
                ``transform``.
            main_kmeans_path: Path to a joblib file holding an object with
                ``predict``.
            sub_kmeans_models_path: Path to a joblib file holding a container
                indexed by main-cluster id whose items have ``predict``.

        Raises:
            KeyError: If the table has no ``growing_countries`` column.
        """
        # NOTE(review): pd.read_pickle and joblib.load both unpickle, which can
        # execute arbitrary code — only point these at trusted files.
        self.df = pd.read_pickle(data_path)
        self.X_columns = ['temperature', 'soil_temperature', 'soil_moisture', 'precipitation', 'sunshine_duration', 'humid']

        self._normalize_growing_countries()

        self.scaler = joblib.load(scaler_path)
        self.main_kmeans = joblib.load(main_kmeans_path)
        self.sub_kmeans_models = joblib.load(sub_kmeans_models_path)

        # Translates "<main>_<sub>" into the id used in the 'main_cluster' column.
        self.subcluster_map = {
            '0_0': 1, '0_1': 2,
            '1_0': 3, '1_1': 4,
            '2_0': 5, '2_1': 6,
            '3_0': 7, '3_1': 8,
        }

    def _normalize_growing_countries(self):
        """Parse string-encoded ``growing_countries`` cells, one row at a time.

        Every string cell is parsed with ``ast.literal_eval``. A blank string
        becomes an empty list; a string that is not a valid literal is kept as
        a one-element list so it can still be matched as a plain country name.
        Non-string cells are left untouched. If any cell failed to parse, the
        format warning is printed once and loading continues.
        """
        unparsed = 0

        def parse(cell):
            nonlocal unparsed
            if not isinstance(cell, str):
                return cell
            if not cell.strip():
                return []
            try:
                return ast.literal_eval(cell)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                unparsed += 1
                return [cell]

        # Per-row parsing avoids relying on the first row's type, which fails
        # on an empty table and misses string cells after a non-string first row.
        self.df['growing_countries'] = self.df['growing_countries'].apply(parse)

        if unparsed:
            print("Error parsing growing_countries — check format.")

    @staticmethod
    def _grows_in(countries, wanted):
        """Return True if ``wanted`` (already lower-cased) is one of ``countries``.

        A bare string is treated as a single country name rather than being
        iterated character by character. Missing or non-iterable cells
        (NaN, None, numbers) and non-string entries never match.
        """
        if isinstance(countries, str):
            return countries.lower() == wanted
        try:
            names = iter(countries)
        except TypeError:
            return False
        return any(isinstance(name, str) and name.lower() == wanted for name in names)

    def recommend(self, new_data, country=None, top_n=None):
        """Return the seeds of the query's cluster, nearest first.

        Args:
            new_data: One sample of feature values; it is wrapped in a list
                and passed to ``scaler.transform``.
            country: Optional country name. When truthy, only seeds whose
                ``growing_countries`` contains it (case-insensitive) are kept.
            top_n: Optional limit applied with ``DataFrame.head`` after sorting.

        Returns:
            ``(mapped_cluster_id, recommendations)`` where ``recommendations``
            is a list of ``(seed_id, distance)`` pairs in ascending distance.
            ``mapped_cluster_id`` is -1 when the (main, sub) pair is not in
            ``subcluster_map``. The list is empty when no seed qualifies.
        """
        new_scaled = self.scaler.transform([new_data])

        main_cluster_id = self.main_kmeans.predict(new_scaled)[0]
        sub_cluster_id = self.sub_kmeans_models[main_cluster_id].predict(new_scaled)[0]

        cluster_key = f"{main_cluster_id}_{sub_cluster_id}"
        mapped_cluster_id = self.subcluster_map.get(cluster_key, -1)

        seeds_in_cluster = self.df[self.df['main_cluster'] == mapped_cluster_id]

        if country:
            wanted = country.lower()
            # An explicitly boolean mask also behaves correctly on an empty frame.
            keep = pd.Series(
                [self._grows_in(cell, wanted) for cell in seeds_in_cluster['growing_countries']],
                index=seeds_in_cluster.index,
                dtype=bool,
            )
            seeds_in_cluster = seeds_in_cluster[keep]

        if seeds_in_cluster.empty:
            return mapped_cluster_id, []

        sub_scaled = self.scaler.transform(seeds_in_cluster[self.X_columns])
        distances = euclidean_distances(sub_scaled, new_scaled).flatten()

        # assign() builds a new frame, so nothing is written onto a slice of
        # self.df; the stable sort keeps equally distant seeds in table order.
        ranked = seeds_in_cluster.assign(distance=distances).sort_values(by='distance', kind='stable')

        if top_n is not None:
            ranked = ranked.head(top_n)

        return mapped_cluster_id, list(zip(ranked['seed_id'], ranked['distance']))


In [10]:
# Query conditions: exactly six values, one per feature column of the recommender.
new_data = [20, 15, 0.3, 800, 5000, 60]
# An empty string is falsy, so recommend() applies no country filter.
user_country = ""

# Locations of the pickled seed table and the fitted model artifacts.
artifact_paths = {
    'data_path': 'data/model_data.pkl',
    'scaler_path': 'models/scaler.joblib',
    'main_kmeans_path': 'models/main_kmeans.joblib',
    'sub_kmeans_models_path': 'models/sub_kmeans_models.joblib',
}
recommender = SeedRecommender(**artifact_paths)

# Ask for the three nearest seeds in the query's cluster.
cluster_id, recommendations = recommender.recommend(
    new_data,
    country=user_country,
    top_n=3,
)

print(f"Main Cluster: {cluster_id}")
print(f"Recommended {len(recommendations)} seed(s):")
for seed_id, distance in recommendations:
    print(f"Seed ID: {seed_id}, Distance: {distance:.4f}")


Main Cluster: 7
Recommended 3 seed(s):
Seed ID: 10, Distance: 0.0000
Seed ID: 20621, Distance: 0.0785
Seed ID: 6615, Distance: 0.0785


