In [3]:
import pandas as pd
import joblib
from sklearn.metrics.pairwise import euclidean_distances







In [4]:
import ast

# Load the data
df = pd.read_csv('new_data.csv')
X_columns = ['temperature', 'soil_temperature', 'soil_moisture', 'precipitation', 'sunshine_duration', 'humid']


# Fix growing_countries if it's stored as a string
if isinstance(df['growing_countries'].iloc[0], str):
    try:
        df['growing_countries'] = df['growing_countries'].apply(ast.literal_eval)
    except:
        print("Error parsing growing_countries — check format.")


  df = pd.read_csv('new_data.csv')


In [5]:
scaler = joblib.load('models/scaler.joblib')
main_kmeans = joblib.load('models/main_kmeans.joblib')
sub_kmeans_models = joblib.load('models/sub_kmeans_models.joblib')

In [6]:
subcluster_map = {
    '0_0': 1, '0_1': 2,
    '1_0': 3, '1_1': 4,
    '2_0': 5, '2_1': 6,
    '3_0': 7, '3_1': 8,
}

def recommend_closest_seeds(new_data, country=None, top_n=None):
    new_scaled = scaler.transform([new_data])

    main_cluster_id = main_kmeans.predict(new_scaled)[0]

    sub_model = sub_kmeans_models[main_cluster_id]
    sub_cluster_id = sub_model.predict(new_scaled)[0]
    full_sub_label = f"{main_cluster_id}_{sub_cluster_id}"

    mapped_cluster_int = subcluster_map.get(full_sub_label, -1)
    seeds_in_subcluster = df[df['sub_cluster'] == full_sub_label].copy()

    if country:
        seeds_in_subcluster = seeds_in_subcluster[seeds_in_subcluster['growing_countries'].apply(
            lambda countries: any(country.lower() == c.lower() for c in countries)
        )]

    if seeds_in_subcluster.empty:
        return main_cluster_id, full_sub_label, mapped_cluster_int, [], []

    sub_scaled = scaler.transform(seeds_in_subcluster[X_columns])
    distances = euclidean_distances(sub_scaled, new_scaled).flatten()

    seeds_in_subcluster['distance'] = distances
    seeds_in_subcluster = seeds_in_subcluster.sort_values(by='distance')

    if top_n is not None:
        seeds_in_subcluster = seeds_in_subcluster.head(top_n)

    return main_cluster_id, full_sub_label, mapped_cluster_int, seeds_in_subcluster['seed_id'].tolist(), seeds_in_subcluster['distance'].tolist()
    # Add the mapped sub-cluster integer to the DataFrame
df['mapped_sub_cluster'] = df['sub_cluster'].map(subcluster_map)

    # Save the updated DataFrame to the CSV
df.to_csv('new_data.csv', index=False)


In [7]:
new_data = [27.5, 20, 0.3, 600, 30000, 70]
user_country = ""


main_cluster, sub_cluster_label, cluster_int, recommended_seeds, distances = recommend_closest_seeds(
    new_data, country=user_country, top_n=None
)

print(f"Main Cluster: {main_cluster}")
print(f"Sub-Cluster: {sub_cluster_label} (Mapped ID: {cluster_int})")
print(f"Recommended {len(recommended_seeds)} seed(s):")

for sid, dist in zip(recommended_seeds, distances):
    print(f"Seed ID: {sid}, Distance: {dist:.4f}")


Main Cluster: 0
Sub-Cluster: 0_1 (Mapped ID: 2)
Recommended 2333 seed(s):
Seed ID: 5001, Distance: 0.0000
Seed ID: 13806, Distance: 0.5960
Seed ID: 12892, Distance: 0.5960
Seed ID: 12602, Distance: 0.5960
Seed ID: 11017, Distance: 0.5993
Seed ID: 7670, Distance: 0.6163
Seed ID: 7661, Distance: 0.6163
Seed ID: 4678, Distance: 0.6172
Seed ID: 2995, Distance: 0.6172
Seed ID: 7642, Distance: 0.6208
Seed ID: 7664, Distance: 0.6208
Seed ID: 7641, Distance: 0.6208
Seed ID: 7671, Distance: 0.6736
Seed ID: 3128, Distance: 0.6769
Seed ID: 4962, Distance: 0.6769
Seed ID: 3116, Distance: 0.6769
Seed ID: 610, Distance: 0.6769
Seed ID: 4079, Distance: 0.6835
Seed ID: 7669, Distance: 0.6988
Seed ID: 3130, Distance: 0.7024
Seed ID: 7644, Distance: 0.7030
Seed ID: 4624, Distance: 0.7096
Seed ID: 34, Distance: 0.7135
Seed ID: 18370, Distance: 0.7135
Seed ID: 18612, Distance: 0.7135
Seed ID: 4179, Distance: 0.7313
Seed ID: 18700, Distance: 0.7461
Seed ID: 7667, Distance: 0.7461
Seed ID: 18001, Distance: 

