# Code of Conduct for Jupyter Notebook Template Usage

## Introduction

Welcome to our Jupyter Notebook environment. To ensure a productive and respectful environment, we have established a few ground rules. Please adhere to this Code of Conduct when using our Jupyter Notebook templates.

## Guidelines

### 1. **Notebook Structure**
   - This notebook has a specific structure that should be respected and not tampered with.

### 2. **Responsible Resource Usage**
   - Use computational resources judiciously.
   - Avoid unnecessary computational tasks that can overload the system.

## Reporting Issues

If you encounter any issues or observe violations of this Code of Conduct, please report them to [jarcau.stefan.cristian@gmail.com](mailto:jarcau.stefan.cristian@gmail.com).

## Conclusion

By adhering to these guidelines, we can maintain a healthy, productive, and welcoming environment for all users. Thank you for your cooperation and happy coding!


# Getting the dataset from the database.

In [None]:
import io
import os
import requests
import pandas as pd

# The dataset location and the API endpoint are injected through the
# environment. Fail early with a clear message instead of a TypeError raised
# by concatenating None with a string.
url = os.getenv("DATASET_URL")
api = os.getenv("API")

if not url or not api:
    raise RuntimeError("The DATASET_URL and API environment variables must both be set.")

# Let requests build the query string so that characters such as '&', '#' or
# spaces inside the dataset path are percent-encoded instead of corrupting
# the request.
response = requests.get(api, params={"path": url})

if response.status_code != 200:
    # The response body is used as the error description; undecodable bytes
    # are replaced so the real failure is not hidden by a UnicodeDecodeError.
    raise Exception(response.content.decode("utf-8", errors="replace"))

# The payload is parsed as UTF-8 encoded CSV text.
df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))

df.head()

# Encode string (categorical) columns.

In [None]:
from sklearn.preprocessing import LabelEncoder

# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the encoding performed below is also visible through `df`.
encoded_df = df

# One encoder instance is reused; `fit_transform` refits it for each column.
le = LabelEncoder()

for column in encoded_df.columns:
    # A column is considered textual when its entry at index 0 is a string.
    first_value = encoded_df[column][0]
    if not isinstance(first_value, str):
        continue

    # Replace the strings with the integer codes produced by the encoder.
    encoded_df[column] = le.fit_transform(encoded_df[column])

encoded_df.head()

# Training and choosing the best model (you can modify the model parameters to best fit the dataset, or leave them as they are).

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from metrics import clustering_dispersion_indicator, mean_index_adequacy, variance_ratio_criterion
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering, MeanShift, AffinityPropagation
from tqdm import tqdm
from copy import deepcopy

# Define the models you want to compare (you can keep the defaults or change the parameters).
# Every model that exposes `n_clusters` is tried with each of these values.
num_clusters = list(range(4, 16))
models = {
    'KMeans': KMeans(n_clusters=5, random_state=42),
    'Agglomerative Clustering': AgglomerativeClustering(n_clusters=5),
    'DBSCAN': DBSCAN(eps=0.5, min_samples=5),
    'Spectral Clustering': SpectralClustering(n_clusters=5, random_state=42),
    'Mean Shift': MeanShift(),
    "Affinity Propagation": AffinityPropagation(damping=0.5, random_state=42),
}

# Metrics for which a larger value is better; every other metric is minimised.
# NOTE(review): "vrc" is treated as higher-is-better (the usual definition of
# the variance ratio criterion) - confirm against the project's `metrics` module.
higher_is_better = ("silhouette_score", "calinski_harabasz_score", "vrc")

# Winning model name per evaluated number of clusters.
cluster_results = {}
# Silhouette score of every scored model per number of clusters; used at the
# end to choose the number of clusters for the overall winner.
silhouette_by_cluster = {}
# Scores of the models that do not take `n_clusters`. They are fitted with the
# same parameters for every `num_cluster`, so one fit is reused instead of
# repeating it on each iteration.
fixed_model_scores = {}

for num_cluster in num_clusters:
    print(f"Caluculating scores for {num_cluster} of custers.")
    # Work on copies so the estimators stored in `models` stay unfitted.
    models_deep_copy = deepcopy(models)
    results = []
    for candidate_name, model in models_deep_copy.items():
        takes_n_clusters = model.get_params().get("n_clusters") is not None

        if not takes_n_clusters and candidate_name in fixed_model_scores:
            scores = fixed_model_scores[candidate_name]
        else:
            if takes_n_clusters:
                model.n_clusters = num_cluster  # Trying the algorithm for different number of clusters.

            model.fit(encoded_df)
            labels = model.labels_

            # The scikit-learn scores raise ValueError unless
            # 2 <= number of labels <= number of samples - 1, so such a
            # clustering (e.g. everything labelled as noise) is skipped.
            if 2 <= len(set(labels)) <= len(encoded_df) - 1:
                scores = {
                    "silhouette_score": round(silhouette_score(encoded_df, labels), 2),
                    "davies_bouldin_score": round(davies_bouldin_score(encoded_df, labels), 2),
                    "calinski_harabasz_score": round(calinski_harabasz_score(encoded_df, labels), 2),
                    "cdi": clustering_dispersion_indicator(encoded_df, labels),
                    "mia": mean_index_adequacy(encoded_df, labels),
                    "vrc": variance_ratio_criterion(encoded_df, labels)
                }
            else:
                print(f"Skipping {candidate_name}: it did not produce a scorable clustering.")
                scores = None

            if not takes_n_clusters:
                fixed_model_scores[candidate_name] = scores

        if scores is not None:
            results.append((candidate_name, scores))

    if not results:
        print(f"No model produced a scorable clustering for {num_cluster} clusters.")
        continue

    silhouette_by_cluster[num_cluster] = {name: scores["silhouette_score"] for name, scores in results}

    print(f"Getting the best results out of all the models.")
    # For every metric keep (model name, value) of the best scoring model.
    # Picking directly from `results` guarantees the winner is a real model,
    # whatever the sign or magnitude of the scores. Ties go to the model
    # listed first.
    best_results = {}
    for metric in results[0][1]:
        pick = max if metric in higher_is_better else min
        winner_name, winner_scores = pick(results, key=lambda result: result[1][metric])
        best_results[metric] = (winner_name, winner_scores[metric])

    print(f"Getting the best model for {num_cluster} of custers.")
    # One vote per metric; the model with the most votes wins this cluster count.
    best_models_cluster = {k: 0 for k in models_deep_copy}
    for winner_name, _ in best_results.values():
        best_models_cluster[winner_name] += 1

    cluster_results[num_cluster] = max(best_models_cluster, key=best_models_cluster.get)

if not cluster_results:
    raise RuntimeError("None of the models produced a scorable clustering; adjust the model parameters.")

print("Getting the best model out of all the clusters.")
# One vote per evaluated number of clusters.
best_models = {k: 0 for k in models}
for cluster_result in cluster_results.values():
    best_models[cluster_result] += 1

sorted_models = sorted(best_models.items(), key=lambda item: item[1], reverse=True)
model_name = sorted_models[0][0]

# `sorted_models[0][1]` is the number of votes, not a number of clusters.
# Among the cluster counts won by the selected model, keep the one where it
# reached its highest silhouette score.
winning_cluster_counts = [n for n, winner in cluster_results.items() if winner == model_name]
number_of_clusters = max(winning_cluster_counts, key=lambda n: silhouette_by_cluster[n][model_name])

model_name, number_of_clusters

# Calculating the scores for the best model.

In [None]:
import numpy as np
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from metrics import clustering_dispersion_indicator, mean_index_adequacy, variance_ratio_criterion

# Names that must already exist in the notebook namespace for this cell to run.
required_names = ("model_name", "models", "number_of_clusters")

if not all(name in globals() for name in required_names):
    print("Please run all the previouse cell before running this one!")
else:
    best_model = models[model_name]

    # Only estimators exposing a non-None `n_clusters` receive the selected value.
    if best_model.get_params().get("n_clusters") is not None:
        best_model.n_clusters = number_of_clusters

    # Fit the selected estimator on the encoded data and keep its assignments;
    # `labels` is read again by a later cell.
    best_model.fit(encoded_df)
    labels = best_model.labels_

    # The three scikit-learn scores are rounded to two decimals; the values
    # from the project's `metrics` module are stored as returned.
    metrics = {
        "Silhoutte Score": round(silhouette_score(encoded_df, labels), 2),
        "Calinski Harabasz Score": round(calinski_harabasz_score(encoded_df, labels), 2),
        "Davies Bouldin Score": round(davies_bouldin_score(encoded_df, labels), 2),
        "Clustering Dispersion Indicator": clustering_dispersion_indicator(encoded_df, labels),
        "Mean Index Adequacy": mean_index_adequacy(encoded_df, labels),
        "Variance Ratio Criterion": variance_ratio_criterion(encoded_df, labels),
    }

    print(metrics)

# Saving the parameters of the best model.

In [None]:
# Both names must already exist in the notebook namespace.
if "model_name" not in globals() or "models" not in globals():
    print("Please run all the previouse cell before running this one!")
else:
    # Keep only the parameters of the selected model whose value is not None.
    params = {}
    for param_name, param_value in models[model_name].get_params().items():
        if param_value is not None:
            params[param_name] = param_value
    print(params)

# Saving images of the best model.

In [None]:
import shap
import seaborn as sns
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def _current_figure_as_image():
    """Render the active matplotlib figure to an in-memory PNG and return it as a PIL image.

    The figure is closed once it has been written to the buffer.
    """
    buffer = BytesIO()
    plt.savefig(buffer, format='png')
    plt.close()
    buffer.seek(0)
    return Image.open(buffer)


if all(name in globals() for name in ("model_name", "models", "labels")):
    # Project the encoded data onto two principal components for plotting.
    pca = PCA(n_components=2)
    X_r = pca.fit_transform(encoded_df)

    plt.figure(figsize=(12, 8))
    sns.scatterplot(x=X_r[:, 0], y=X_r[:, 1], hue=labels, palette="viridis")
    plt.title("Cluster assignments")
    pair_plot_with_clusters = _current_figure_as_image()

    # Correlate the encoded (numeric) columns that were used for clustering.
    corr = encoded_df.corr()

    plt.figure(figsize=(10, 7))
    # Correlation coefficients are floats: the integer format code "d" raises
    # ValueError when the annotations are rendered, so use a float format.
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="Blues")
    plt.title('Correlation Matrix')
    corr_matrix = _current_figure_as_image()

    # Saving SHAP plot for explainability.
    best_model = models[model_name]
    if hasattr(best_model, "predict"):
        predict_clusters = best_model.predict
    else:
        # Some clustering estimators (e.g. AgglomerativeClustering, DBSCAN,
        # SpectralClustering) cannot assign new samples to clusters. Explain a
        # 1-nearest-neighbour surrogate trained on the fitted labels instead.
        from sklearn.neighbors import KNeighborsClassifier

        surrogate = KNeighborsClassifier(n_neighbors=1).fit(encoded_df, labels)
        predict_clusters = surrogate.predict

    explainer = shap.Explainer(predict_clusters, encoded_df)

    shap_values = explainer(encoded_df)

    shap.summary_plot(shap_values, encoded_df, plot_type="bar", show=False)
    shap_image = _current_figure_as_image()
else:
    print("Please run all the previouse cell before running this one!")


# Save the model inside our model repository (Please run this cell only when you are satisfied with the result)

In [None]:
import os
import datetime
import mlflow
import json
import pandas as pd
from mlflow import MlflowClient, MlflowException
from sqlalchemy import create_engine, Column, String, DateTime, Numeric, Integer
from sqlalchemy.orm import sessionmaker, declarative_base

def is_variable_defined(var_name):
    """Return True when a global (notebook-level) variable called `var_name` exists.

    Only `globals()` is consulted: inside this function `locals()` holds nothing
    but the `var_name` parameter, so checking it could only produce a false
    positive for the literal name "var_name".
    """
    return var_name in globals()

# Everything the earlier cells must have produced before the model can be saved.
required_names = (
    "model_name",
    "models",
    "params",
    "metrics",
    "corr_matrix",
    "pair_plot_with_clusters",
    "shap_image",
)

if all(is_variable_defined(name) for name in required_names):
    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
    register_name = f'{os.getenv("MODEL_NAME")}-{os.getenv("USER_ID")}'

    client = MlflowClient()

    # `condition` stays None when the lookup raises, which is treated as
    # "this model has not been registered yet".
    condition = None
    try:
        condition = client.get_registered_model(register_name)
    except MlflowException as e:
        print(e)

    if condition is None:
        # get_experiment_by_name returns None for an unknown experiment name.
        # Passing experiment_id=None then lets start_run fall back to the
        # active/default experiment instead of failing with AttributeError.
        experiment = mlflow.get_experiment_by_name("default")
        experiment_id = experiment.experiment_id if experiment is not None else None

        # Log the model together with its metrics, parameters and images.
        with mlflow.start_run(experiment_id=experiment_id) as run:
            mlflow.sklearn.log_model(models[model_name], "model")
            mlflow.log_metrics(metrics)
            mlflow.log_params(params)
            mlflow.log_image(corr_matrix, "correlation_matrix.png")
            mlflow.log_image(pair_plot_with_clusters, "pair_plot_with_clusters.png")
            mlflow.log_image(shap_image, "shap.png")

        run_id = run.info.run_id

        src_uri = f"runs:/{run_id}/model"

        # Register the logged model under the per-user name.
        client.create_registered_model(register_name)
        mv = client.create_model_version(register_name, src_uri, run_id)
        print(f"Name: {mv.name}")
        print(f"Version: {mv.version}")
        print(f"Source: {mv.source}")

        Base = declarative_base()

        class MyTable(Base):
            # ORM mapping of the `models` table that receives one row per registered model.
            __tablename__ = 'models'
            model_id = Column(String, primary_key=True)
            created_at = Column(DateTime)
            user_id = Column(String)
            dataset_user = Column(String)
            description = Column(String)
            score = Column(Numeric)
            model_name = Column(String)
            score_count = Column(Integer)
            notebook_type = Column(String)

        # Per-column summary of the dataset, stored as JSON in `description`.
        csv_data = {
            "column_dtypes": {},
            "column_ranges": {},
            "column_categories": {},
            "column_unique_values": {}
        }

        for column in df.columns:
            if pd.api.types.is_numeric_dtype(df[column]):
                csv_data["column_dtypes"][column] = "numeric"
                csv_data["column_ranges"][column] = (min(df[column]), max(df[column]))
                csv_data["column_categories"][column] = None
                csv_data["column_unique_values"][column] = None
            else:
                unique_values = df[column].unique()
                if len(unique_values) == len(df):
                    # Every value is distinct: treat the column as an identifier.
                    csv_data["column_dtypes"][column] = "unique_identifier"
                    csv_data["column_ranges"][column] = None
                    csv_data["column_categories"][column] = None
                    csv_data["column_unique_values"][column] = len(unique_values)
                else:
                    csv_data["column_dtypes"][column] = "categorical"
                    csv_data["column_ranges"][column] = None
                    # `unique()` returns an array, which json.dumps cannot
                    # serialise; store a plain list instead.
                    csv_data["column_categories"][column] = unique_values.tolist()
                    csv_data["column_unique_values"][column] = None

        engine = create_engine(f'postgresql+psycopg2://{os.getenv("POSTGRES_USER")}:{os.getenv("POSTGRES_PASSWORD")}@{os.getenv("POSTGRES_HOST")}:{os.getenv("POSTGRES_PORT")}/'
                                f'{os.getenv("POSTGRES_DB")}')
        Base.metadata.create_all(engine)

        Session = sessionmaker(bind=engine)

        # A single context-managed session; it is closed when the block exits.
        with Session() as session:
            new_row = MyTable(model_id=register_name, created_at=datetime.datetime.now(), user_id=os.getenv("USER_ID"),
                             description=json.dumps(csv_data), score=0.0, model_name=os.getenv("MODEL_NAME"), score_count=0, dataset_user=os.getenv("DATASET_USER"), notebook_type="clustering")

            session.add(new_row)

            session.commit()
    else:
        print("You can only add the model once!")
else:
    print("Please run all the previouse cell before running this one!")

# Deleting the notebook. (Be careful when running this cell!)

In [None]:
import os
import requests
from sqlalchemy.orm import sessionmaker, declarative_base
from sqlalchemy import create_engine, Column, String, DateTime, Integer

def is_variable_defined(var_name):
    """Return True when a global (notebook-level) variable called `var_name` exists.

    Only `globals()` is consulted: inside this function `locals()` holds nothing
    but the `var_name` parameter, so checking it could only produce a false
    positive for the literal name "var_name".
    """
    return var_name in globals()

# Everything the earlier cells must have produced before the notebook may be deleted.
required_names = (
    "model_name",
    "models",
    "params",
    "metrics",
    "corr_matrix",
    "pair_plot_with_clusters",
    "shap_image",
)

if all(is_variable_defined(name) for name in required_names):
    Base = declarative_base()

    class MyTable(Base):
        # ORM mapping of the `notebooks` table; rows are keyed by `notebook_id`.
        __tablename__ = 'notebooks'
        user_id = Column(String)
        created_at = Column(DateTime)
        last_accessed = Column(DateTime)
        notebook_id = Column(String, primary_key=True)
        description = Column(String)
        dataset_user = Column(String)
        dataset_name = Column(String)
        port = Column(Integer)
        notebook_type = Column(String)

    # Connect to the database
    engine = create_engine(f'postgresql+psycopg2://{os.getenv("POSTGRES_USER")}:{os.getenv("POSTGRES_PASSWORD")}@{os.getenv("POSTGRES_HOST")}:{os.getenv("POSTGRES_PORT")}/{os.getenv("POSTGRES_DB")}')
    Base.metadata.create_all(engine)

    Session = sessionmaker(bind=engine)

    notebook_id = os.getenv('NOTEBOOK_ID')

    # The context manager closes the session even when the delete or the
    # commit raises.
    with Session() as session:
        # Delete the row that belongs to this notebook.
        row_count = session.query(MyTable).filter(MyTable.notebook_id == notebook_id).delete(synchronize_session='evaluate')

        if row_count > 0:
            session.commit()
        else:
            print("Rows not found")

    service_url = f"http://{os.getenv('SERVICE_NAME')}:{os.getenv('SERVICE_PORT')}"

    # Check that the service answers before asking it to delete the pod.
    response = requests.get(service_url)
    if response.status_code == 200:

        response = requests.delete(f"{service_url}/delete_pod?uid={notebook_id}")

        if response.status_code == 200:
            print("Pod deleted successfully!")
        else:
            # Report the failure instead of finishing silently.
            print(f"Pod deletion failed with status code {response.status_code}.")
    else:
        print(f"The notebook service is not available (status code {response.status_code}).")
else:
    print("Please run all the previouse cell before running this one!")