# Data creation 

In [1]:
from cleaning_image import Cleaning_image
from data_creation import Data_creation
from models import Models_pollen
import os
import pandas as pd
import numpy as np
import pickle as pick
from warnings import filterwarnings
# Silence every warning category for the rest of the session.
filterwarnings("ignore")

# --- Experiment configuration -------------------------------------------
# Number of botanical species assumed for the whole sample.
botanical_species = 32
# Seed handed to the helper objects (and to the estimators built later on).
seed = 42
# Number of visits; the visit folders "V1".."V<visits>" are derived from it.
visits = 4
# Color-space code forwarded to the image helpers; its length is later used
# as the number of channels per pixel.
color_space = "uv"

# Both helpers are configured with the same species count and seed.
_helper_kwargs = {"botanical_species": botanical_species, "seed": seed}
data_creation = Data_creation(**_helper_kwargs)
cleaning_image = Cleaning_image(**_helper_kwargs)

## Initial database

In [None]:
# Sampling-control database, ordered by producer id.
Control=pd.read_excel("Control/Control.xlsx")
Control.sort_values(by="Productor",inplace=True)

# Producer id -> producer name. Built once instead of de-duplicating the
# control table for every single image. Keeping the first row per name and
# then the first row per producer reproduces the "first match" lookup.
name_by_producer=(Control.drop_duplicates(subset="Nombre")
                  .drop_duplicates(subset="Productor")
                  .set_index("Productor")["Nombre"]
                  .to_dict())

# Photographer folders. os.listdir returns entries in arbitrary order, so the
# listing is sorted to make the "[:2]" selection below reproducible.
photographer_foldder=sorted(os.listdir("images"))

# Visit sub-folders: "V1", "V2", ..., "V<visits>".
visit_folder=["V"+str(i+1) for i in range(0,visits)]

images=list()
data=list()

# Initial database creation to control the models.
# The fixed offsets 13..23 address the ten digits that follow the
# "images/<f>/<v>/" prefix; that prefix is 13 characters long only when the
# photographer folder name has exactly two characters.
for f in photographer_foldder[:2]:
    for v in visit_folder:
        folder=f"images/{f}/{v}/"
        new_images=[folder+name for name in sorted(os.listdir(folder))]
        # Keep the accumulated listing, but parse only the files of the
        # current folder: re-walking the whole accumulated list on every
        # iteration was quadratic and produced duplicate rows.
        images=new_images+images
        for i in new_images:
            producer=int(i[15:17])
            try:
                name=name_by_producer[producer]
            except KeyError:
                raise KeyError(f"Producer {producer} (from {i}) is not listed in Control/Control.xlsx") from None
            data.append((int(i[13:15]), producer, int(i[17:19]), int(i[19:21]), int(i[21:23]),
                         name,
                         i))

# One row per image; drop_duplicates is kept as a safety net.
df_initial=(pd.DataFrame(data,columns=["Photographer","Producer", "Sample", "Visit", "Moment", "Name", "Link"])
            .drop_duplicates()
            .reset_index(drop=True))
df_initial

## Photo cleaning collection

In [4]:
# Settings forwarded unchanged to data_photo_collection.
croped_rate=0.1
width_image=250
high_image=150

List_data_image=list()

# Image cleaning: walk the visits from 1 up to the highest visit number found
# in df_initial and append each visit's result to List_data_image.
last_visit=df_initial["Visit"].max()
for visit_number in range(1,last_visit+1):
    # Links of the photos that belong to the current visit.
    visit_links=df_initial["Link"][(df_initial["Visit"]==visit_number)]
    photos_data=data_creation.data_photo_collection(
        visit_links,
        color_space=color_space,
        sav=False,
        sav_name=f"data_image_part{visit_number}",
        croped_rate=croped_rate,
        sav_path="Z:/unal/Tesis/Imagenes/Code/data/",
        width_image=width_image,
        high_image=high_image,
    )
    print(f"Visit {visit_number}")
    List_data_image=List_data_image+photos_data

Visit 1
Visit 2
Visit 3
Visit 4


## Clustering model for obtaining the color representation

In [None]:
# Clustering color model and center data representation.
# The save name is derived from the configuration (species count and color
# space) instead of repeating "32" and "uv" by hand, so it cannot drift from
# the settings actually used. With the current configuration it still
# evaluates to "group_model_32_uv".
clustering_model, center_representation=data_creation.color_represent_data(
    cleaned_photos_array=List_data_image,
    iterations=200, max_iter_kmeans=150, max_iter_minikmeans=150,
    sub_cluster_rate=1e-6, sub_data_rate=0.3,
    sav=False, sav_name=f"group_model_{botanical_species}_{color_space}",
    sav_path="Z:/unal/Tesis/Imagenes/Code/models/")

## Proportion database

In [None]:
# Channels per pixel: one per character of the color-space code.
photo_dimension=len(color_space)
feature_columns=data_creation.columnas()

# Data training: one feature row per cleaned photo. Rows are collected in a
# list and turned into a DataFrame once, instead of growing the frame with
# pd.concat on every iteration.
feature_rows=[]
for im in List_data_image:
    # Flatten the photo to (pixels, channels) and label every pixel.
    labels_photo=clustering_model.predict(im.reshape(-1,photo_dimension))
    data_prop_aux=data_creation.charac_vector(labes_colors=labels_photo, center_color_representation=center_representation,
                                              color_vector_out=np.array([ 96, 136]), cleaning_level=15)
    feature_rows.append(np.c_[data_prop_aux].T)

if feature_rows:
    df_prop=pd.DataFrame(np.vstack(feature_rows), columns=feature_columns)
else:
    df_prop=pd.DataFrame(columns=feature_columns)

# List_data_image is filled visit by visit (visit 1, 2, ... and, inside each
# visit, in df_initial row order), whereas df_initial itself is in folder
# order. Put the metadata in that same visit-major order before the
# positional join; otherwise labels and features get paired with the wrong
# photos as soon as there is more than one photographer folder.
# NOTE(review): assumes data_photo_collection returns one item per link, in
# link order - confirm against its implementation.
meta_columns=["Moment","Producer","Name","Sample", "Visit","Photographer"]
df_meta=(df_initial.loc[df_initial["Visit"]>=1, meta_columns]
         .sort_values(by="Visit", kind="stable")
         .reset_index(drop=True))

if len(df_meta)!=len(df_prop):
    raise ValueError(f"Metadata rows ({len(df_meta)}) and feature rows ({len(df_prop)}) differ; "
                     "they cannot be joined by position.")

df_model=pd.concat([df_meta,df_prop],axis=1)
df_model

# Model training

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from graphics_results import Graphic_results

# The modelling helper and the plotting helper share the species count and seed.
_shared_settings=dict(botanical_species=botanical_species, seed=seed)
model_pollen=Models_pollen(data=df_model, **_shared_settings)
graphic_results=Graphic_results(**_shared_settings)

# Split rate for the train/validation partition.
rate_split=2/3

In [None]:
## Data partition
# Use the configured rate_split instead of repeating the literal 2/3, so the
# setting has a single source of truth.
X_train,Y_train,X_val,Y_val,Positions,pos_val = model_pollen.Partition_balanced(cant_visitas=2,rate_split=rate_split)

## Label names
# Producer names ordered by producer id; reused later as axis labels.
Producer=df_initial[["Producer","Name"]].sort_values(by="Producer")["Name"].unique()


def plot_label_distribution(labels, names, title):
    """Show a bar chart with the number of samples per producer label.

    Parameters
    ----------
    labels : array-like
        Producer label of every sample in the split.
    names : array-like
        Names used as bar positions, one per distinct label.
    title : str
        Figure title.

    Notes
    -----
    Counts are paired with ``names`` by position, so this presumably relies
    on every producer being present in ``labels`` - verify for small splits.
    """
    counts=np.unique_counts(labels)[1]
    plt.bar(names,height=counts)
    plt.xlabel("Beeckeepers")
    plt.ylabel("Count")
    plt.xticks(rotation=85, fontsize=10)
    plt.title(title)
    plt.show()


# Same chart for both splits; only the labels and the title change.
plot_label_distribution(Y_train["Producer"], Producer, "Frequency distribution train labels data")
plot_label_distribution(Y_val["Producer"], Producer, "Frequency distribution val labels data")

In [None]:
# Hyper-parameter grid for the proposed models (one dict per estimator).
parameters_list=[{"C":np.arange(0.9,1,0.01), "kernel":["poly"], "degree":[ 6 , 7, 8], "coef0":np.arange(0.1,2,0.1),
                "decision_function_shape":["ovr"]}]

# List of proposed models.
# probability=True is required by the "roc_auc_ovr" metric below: that scorer
# calls predict_proba, which SVC only exposes when probability estimates are
# enabled. Without it the metric cannot be computed, and the resulting warning
# is hidden by the global warning filter. Enabling it makes every fit slower,
# because the probabilities are calibrated with an internal cross-validation.
model_estimators=[SVC(class_weight="balanced", random_state=seed, gamma="scale", tol=5e-4,
                      probability=True)
                  ]
# List of training metrics.
scoring=["f1_weighted", "roc_auc_ovr"]
model_names=["SVC"]

# Tuning the models; the "Visit" column is bookkeeping and is dropped from
# both the features and the labels before fitting.
results_models=model_pollen.model_tunning(estimator_models=model_estimators, params_dict=parameters_list, score_list=scoring,names_model=model_names,
                                   X_train=X_train.drop(columns="Visit"), Y_train=Y_train.drop(columns="Visit"),cv=5)

In [None]:
# Model evaluation

# Fourth element of the first tuning result.
# NOTE(review): presumably the fitted/tuned estimator - confirm against model_tunning.
SVC_model1=results_models[0][3]
# Evaluate every configured visit rather than a second hard-coded count.
visit=visits
columns_metrics=["Visit", "balanced_Accuracy", "Recall_weighted", "Precision_weighted", "f1_score_weighted"]

# The confusion-matrix images are written below this folder; create it up
# front so saving does not depend on it already existing.
os.makedirs("images_result", exist_ok=True)

# One metrics row per visit, collected in a list and converted once instead
# of growing a DataFrame with pd.concat inside the loop.
metric_rows=[]
for k in range(1,visit+1):
    # Validation samples of visit k, without the bookkeeping "Visit" column.
    X_validation=X_val[X_val["Visit"]==k].drop(columns="Visit")
    Y_validation=Y_val[Y_val["Visit"]==k].drop(columns="Visit")
    y_pred=SVC_model1.predict(X_validation)
    acc, recall, preci, f1score, conf_matrix = graphic_results.get_results(y_test=Y_validation,pred_y=y_pred,save_image=True, eje=Producer,
                                                                    path_image=f"images_result/Confusion_matrix_visit_{k}",
                                                                    format_image="jpg", tittle_image=f"Confusion matrix for the visit {k}")
    metric_rows.append([k, acc, recall, preci, f1score])

results_metrics=pd.DataFrame(metric_rows, columns=columns_metrics)
results_metrics
