# Code of Conduct for Jupyter Notebook Template Usage

## Introduction

Welcome to our Jupyter Notebook environment. To ensure a productive and respectful environment, we have established a few ground rules. Please adhere to this Code of Conduct when using our Jupyter Notebook templates.

## Guidelines

### 1. **Notebook Structure**
   - This notebook has a specific structure that should be respected and not tampered with.

### 2. **Responsible Resource Usage**
   - Use computational resources judiciously.
   - Avoid unnecessary computational tasks that can overload the system.

## Reporting Issues

If you encounter any issues or observe violations of this Code of Conduct, please report them to [jarcau.stefan.cristian@gmail.com](mailto:jarcau.stefan.cristian@gmail.com).

## Conclusion

By adhering to these guidelines, we can maintain a healthy, productive, and welcoming environment for all users. Thank you for your cooperation and happy coding!


# Getting the dataset from the database.

In [None]:
import io
import os
import requests
import pandas as pd

# Location of the dataset and of the API that serves it, both read from the environment.
url = os.getenv("DATASET_URL")
api = os.getenv("API")

# Fail early with a readable message instead of a TypeError from `None + str` below.
if not url or not api:
    raise RuntimeError("The DATASET_URL and API environment variables must both be set.")

# The timeout keeps the cell from hanging forever when the dataset service is unreachable.
response = requests.get(api + f"?path={url}", timeout=120)

# Any non-200 answer is surfaced with the body returned by the service.
if response.status_code != 200:
    raise Exception(response.content.decode("utf-8"))

# The payload is CSV text; parse it straight from memory.
df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))

df.head()

# Check which column is the target for training the model.

In [None]:
import os

# Show the configured target column name, trimmed of surrounding whitespace and stray newlines.
os.environ.get("TARGET_COLUMN").strip().replace("\n", "")

# Encode literal columns.

In [None]:
from sklearn.preprocessing import LabelEncoder

# NOTE: this binds a second name to the same DataFrame (no copy is made), so the
# encoding below is also visible through `df`.
encoded_df = df

le = LabelEncoder()

for column in encoded_df.columns:
    # Inspect the first non-missing value by position. Looking up label `0`
    # raises KeyError when the index has no 0, and a text column whose first
    # entry is NaN would be skipped.
    non_null = encoded_df[column].dropna()
    if not non_null.empty and isinstance(non_null.iloc[0], str):
        # Cast to str so missing values become their own category instead of
        # making the encoder fail on mixed str/float values.
        encoded_df[column] = le.fit_transform(encoded_df[column].astype(str))

encoded_df.head()

# Split the encoded_df into train and test subsets.

In [None]:
import os
from sklearn.model_selection import train_test_split

# Read the target column name once instead of re-parsing the environment four times.
target_column = os.getenv("TARGET_COLUMN", "").strip().replace("\n", "")

if target_column not in encoded_df.columns:
    raise KeyError(f"Target column {target_column!r} was not found in the dataset columns.")

# Fixed seed so re-running the notebook gives the same split, matching the
# random_state=42 used by the models and the cross-validation strategy.
train, test = train_test_split(encoded_df, test_size=0.1, random_state=42)

# Every column except the target is a feature.
feature_columns = [column for column in encoded_df.columns if column != target_column]

X_train, y_train = train[feature_columns], train[target_column]
X_test, y_test = test[feature_columns], test[target_column]

# Training and choosing the best model (You can modify the parameters of the models to fit the dataset best, or leave them as they are).

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tqdm import tqdm
import numpy as np

# Name of the column to predict, trimmed of whitespace and stray newlines.
target_column = os.getenv("TARGET_COLUMN").strip().replace("\n", "")

N_SPLITS = 5  # You can modify the number of folds, to validate the models better

# KNN cannot use more neighbours than there are samples in a training fold, so the
# "one neighbour per distinct target value" heuristic is capped at the smallest
# training fold produced by the K-fold split (and never drops below 1).
smallest_training_fold = len(X_train) * (N_SPLITS - 1) // N_SPLITS
n_neighbors = int(np.clip(len(np.unique(encoded_df[target_column])), 1, np.maximum(1, smallest_training_fold)))

# Define the models you want to compare.
# SVR and KNeighborsRegressor do not accept a random_state argument; passing one
# raises TypeError before any training starts.
models = {
    'SVR': SVR(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=200, random_state=42),
    'KNN': KNeighborsRegressor(n_neighbors=n_neighbors),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    # 'rmse' is a regression metric; 'logloss' only makes sense for classifiers.
    "XGB Regressor": XGBRegressor(tree_method="hist", eval_metric='rmse', random_state=42),
    "LGBM Regressor": LGBMRegressor(n_estimators=200, random_state=42),
}

cv_strategy = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)


def cross_validate_model(model, X, y, cv, scoring='neg_root_mean_squared_error'):
    """Return the mean cross-validated score of `model`; higher is better.

    The default scoring is the negated RMSE, a regression metric. 'accuracy'
    is a classification metric and cannot score continuous predictions.
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    return scores.mean()


cv_results = {name: cross_validate_model(model, X_train, y_train, cv_strategy) for name, model in tqdm(models.items())}

# Ignore models whose cross-validation produced no usable score.
valid_results = {name: score for name, score in cv_results.items() if not np.isnan(score)}
if not valid_results:
    raise RuntimeError("Cross-validation failed for every model; check the dataset and the target column.")

# Negated-error scores are never positive, so the best model is the arg-max rather
# than "first score above 0". No variable named `max` is created, which keeps the
# builtin usable in later cells.
model_name = sorted(valid_results, key=valid_results.get)[-1]

models[model_name].fit(X_train, y_train)

# Calculating the scores for the best model.

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

from sklearn.metrics import r2_score

# Only evaluate once the training cell has produced `models` and `model_name`.
if "model_name" in globals() and "models" in globals():
    predictions = models[model_name].predict(X_test)

    # accuracy_score only accepts class labels and raises on continuous
    # predictions, so goodness of fit is reported as R^2 instead.
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    # Plain floats keep the printed dictionary readable.
    metrics = {
        "Mean Squared Error": round(float(mse), 2),
        "Mean Absolute Error": round(float(mae), 2),
        "Root Mean Squared Error": round(float(rmse), 2),
        "R2 Score": round(float(r2), 2),
    }

    print(metrics)
else:
    print("Please run all the previouse cell before running this one!")

# Saving the parameters of the best model.

In [None]:
# Only collect parameters once the training cell has produced `models` and `model_name`.
if "model_name" in globals() and "models" in globals():
    best_model = models[model_name]

    # XGBoost estimators expose their parameters through get_xgb_params().
    # Detect that by capability rather than by comparing against a dictionary
    # key, so renaming the entry in `models` cannot silently disable the branch.
    if hasattr(best_model, "get_xgb_params"):
        raw_params = best_model.get_xgb_params()
    else:
        raw_params = best_model.get_params()

    # Unset (None) parameters carry no information, so they are dropped.
    params = {k: v for k, v in raw_params.items() if v is not None}

    print(params)
else:
    print("Please run all the previouse cell before running this one!")

# Saving images of the best model.

In [None]:
import shap
import numpy as np
import seaborn as sns
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def _current_figure_as_image():
    """Render the current matplotlib figure to an in-memory PNG and return it as a PIL image.

    The figure is closed afterwards so that figures do not pile up between plots.
    """
    buffer = BytesIO()
    plt.savefig(buffer, format='png')
    plt.close()
    buffer.seek(0)
    return Image.open(buffer)


if "model_name" in globals() and "models" in globals() and "y_test" in globals():
    predictions = models[model_name].predict(X_test)

    # A confusion matrix is only defined for class labels and sklearn's
    # confusion_matrix raises on continuous regression output, so the fit is shown
    # as a predicted-vs-true scatter plot instead. The image keeps the name
    # `conf_matrix` because the later cells display and log that variable.
    true_values = np.asarray(y_test)
    predicted_values = np.asarray(predictions)
    lower = np.minimum(true_values.min(), predicted_values.min())
    upper = np.maximum(true_values.max(), predicted_values.max())

    plt.figure(figsize=(10, 7))
    sns.scatterplot(x=true_values, y=predicted_values)
    # Points on the dashed diagonal are perfect predictions.
    plt.plot([lower, upper], [lower, upper], linestyle="--", color="gray")
    plt.xlabel('True Value')
    plt.ylabel('Predicted')
    plt.title('Predicted vs. True Values')

    conf_matrix = _current_figure_as_image()

    # Correlation is only defined for numeric columns.
    corr = df.select_dtypes(include="number").corr()

    plt.figure(figsize=(10, 7))
    # Correlations are floats; the integer format code "d" cannot format them.
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="Blues")
    plt.title('Correlation Matrix')

    corr_matrix = _current_figure_as_image()

    # Saving SHAP plot for explainability.
    explainer = shap.Explainer(models[model_name].predict, X_train)

    shap_values = explainer(X_train)

    # show=False leaves the figure open so it can be captured below.
    shap.summary_plot(shap_values, X_train, plot_type="bar", show=False)

    shap_image = _current_figure_as_image()

    print("Finished Saving Images!")
else:
    print("Please run all the previouse cell before running this one!")


# Displaying Saved Images

In [None]:
from IPython.display import display

# Render the three images built by the "Saving images" cell, in the order they
# were created. Each name must already exist; otherwise a NameError is raised
# at the first missing one.
display(conf_matrix)
display(corr_matrix)
display(shap_image)

# Save the model inside our model repository (Please run this cell only when you are satisfied with the result)

In [None]:
import os
import datetime
import mlflow
import json
import pandas as pd
from mlflow import MlflowClient, MlflowException
from sqlalchemy import create_engine, Column, String, DateTime, Numeric, Integer
from sqlalchemy.orm import sessionmaker, declarative_base

def is_variable_defined(var_name):
    """Return True if a global variable called `var_name` exists.

    Only the global namespace is consulted. Inside this function `locals()`
    holds nothing but the parameter itself, so checking it would report the
    name "var_name" as always defined and could never see any other variable.
    """
    return var_name in globals()

def _export_clean_env(name):
    """Re-export environment variable `name` with whitespace and newlines removed.

    Returns True when the variable was set and has been cleaned. Returns False,
    leaving the environment untouched, when it is unset; wrapping the missing
    value in str() would export the literal text "None" as a credential.
    """
    raw_value = os.getenv(name)
    if raw_value is None:
        return False
    os.environ[name] = raw_value.strip().replace("\n", "")
    return True


_missing_settings = [
    name
    for name in (
        "MLFLOW_TRACKING_USERNAME",
        "MLFLOW_TRACKING_PASSWORD",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
    )
    if not _export_clean_env(name)
]

# The endpoint URL is used as-is, so nothing has to be re-exported when it is
# set; assigning a missing (None) value to os.environ would raise TypeError.
if os.getenv("MLFLOW_S3_ENDPOINT_URL") is None:
    _missing_settings.append("MLFLOW_S3_ENDPOINT_URL")

if _missing_settings:
    print(f"Warning: these environment variables are not set: {', '.join(_missing_settings)}")

os.environ['MLFLOW_HTTP_REQUEST_TIMEOUT'] = "1000"

def _to_builtin(value):
    """Convert a NumPy scalar to the matching plain Python value; return anything else unchanged."""
    return value.item() if hasattr(value, "item") else value


# Everything the earlier cells must have produced before a model can be saved.
_REQUIRED_RESULTS = ("model_name", "models", "params", "metrics", "corr_matrix", "conf_matrix", "shap_image")

if all(is_variable_defined(name) for name in _REQUIRED_RESULTS):
    from sqlalchemy.engine import URL

    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))

    # Keep the name from the environment separate from `model_name`, which is the
    # key of the best estimator in `models`. Re-using that variable would make
    # `models[model_name]` look up the wrong key and clobber it for other cells.
    registry_model_name = os.getenv("MODEL_NAME").strip().replace("\n", "")
    user_id = os.getenv("USER_ID").strip().replace("\n", "")
    register_name = f'{registry_model_name}-{user_id}'

    client = MlflowClient()

    # An existing registration means this model has already been saved.
    condition = None
    try:
        condition = client.get_registered_model(register_name)
    except MlflowException as e:
        # Presumably raised when the model is not registered yet (the expected
        # path for a first save); the message is printed and the save continues.
        print(e)

    if condition is None:
        # get_experiment_by_name returns None for an unknown experiment.
        experiment = mlflow.get_experiment_by_name("default")
        if experiment is None:
            raise RuntimeError('The MLflow experiment "default" does not exist on the tracking server.')

        with mlflow.start_run(experiment_id=experiment.experiment_id) as run:
            mlflow.sklearn.log_model(models[model_name], "model")
            mlflow.log_metrics(metrics)
            mlflow.log_params(params)
            mlflow.log_image(corr_matrix, "correlation_matrix.png")
            mlflow.log_image(conf_matrix, "confusion_matrix.png")
            mlflow.log_image(shap_image, "shap.png")

        run_id = run.info.run_id

        src_uri = f"runs:/{run_id}/model"

        # "column:dtype; " pairs describing the training data.
        description = "".join(f"{k}:{v}; " for k, v in encoded_df.dtypes.items())

        # Single quotes inside the f-string keep this line valid before Python 3.12.
        client.create_registered_model(register_name, description=f"This model {os.getenv('MODEL_NAME')} was trained on a dataset with this description having this columns: {description}.")
        mv = client.create_model_version(register_name, src_uri, run_id)
        print(f"Name: {mv.name}")
        print(f"Version: {mv.version}")
        print(f"Source: {mv.source}")

        Base = declarative_base()

        class MyTable(Base):
            # ORM mapping for one row of the `models` table.
            __tablename__ = 'models'
            model_id = Column(String, primary_key=True)
            created_at = Column(DateTime)
            user_id = Column(String)
            dataset_user = Column(String)
            description = Column(String)
            score = Column(Numeric)
            model_name = Column(String)
            score_count = Column(Integer)
            notebook_type = Column(String)

        # Per-column summary of the dataset, stored as JSON in the description column.
        csv_data = {
            "column_dtypes": {},
            "column_ranges": {},
            "column_categories": {},
            "column_unique_values": {}
        }

        for column in df.columns:
            series = df[column]
            if pd.api.types.is_numeric_dtype(series):
                csv_data["column_dtypes"][column] = "numeric"
                # Series.min()/max() return NumPy scalars, which json.dumps rejects
                # for integer columns, so they are converted to plain Python numbers.
                csv_data["column_ranges"][column] = (_to_builtin(series.min()), _to_builtin(series.max()))
                csv_data["column_categories"][column] = None
                csv_data["column_unique_values"][column] = None
            else:
                unique_values = series.unique()
                if len(unique_values) == len(df):
                    # Every row has a different value: treated as an identifier.
                    csv_data["column_dtypes"][column] = "unique_identifier"
                    csv_data["column_ranges"][column] = None
                    csv_data["column_categories"][column] = None
                    csv_data["column_unique_values"][column] = len(unique_values)
                else:
                    csv_data["column_dtypes"][column] = "categorical"
                    csv_data["column_ranges"][column] = None
                    # A NumPy array is not JSON serializable; a list is.
                    csv_data["column_categories"][column] = unique_values.tolist()
                    csv_data["column_unique_values"][column] = None

        postgres_username = os.getenv("POSTGRES_USER").strip().replace("\n", "")
        postgres_password = os.getenv("POSTGRES_PASSWORD").strip().replace("\n", "")
        postgres_host = os.getenv("POSTGRES_HOST").strip().replace("\n", "")
        postgres_port = os.getenv("POSTGRES_PORT").strip().replace("\n", "")
        postgres_db = os.getenv("POSTGRES_DB").strip().replace("\n", "")

        # URL.create escapes special characters (such as "@" or "/") in the
        # credentials, which a hand-built connection string does not.
        engine = create_engine(URL.create(
            "postgresql+psycopg2",
            username=postgres_username,
            password=postgres_password,
            host=postgres_host,
            port=int(postgres_port),
            database=postgres_db,
        ))
        Base.metadata.create_all(engine)

        Session = sessionmaker(bind=engine)

        # The context manager closes the session even if the insert fails.
        with Session() as session:
            new_row = MyTable(model_id=register_name, created_at=datetime.datetime.now(), user_id=user_id,
                             description=json.dumps(csv_data), score=0.0, model_name=registry_model_name, score_count=0, dataset_user=os.getenv("DATASET_USER"), notebook_type="regression")

            session.add(new_row)

            session.commit()
    else:
        print("You can only add the model once!")
else:
    print("Please run all the previouse cell before running this one!")

# Deleting the notebook. (Be careful when running this cell!)

In [None]:
import os
import requests
from sqlalchemy.orm import sessionmaker, declarative_base
from sqlalchemy import create_engine, Column, String, DateTime, Integer

def is_variable_defined(var_name):
    """Return True if a global variable called `var_name` exists.

    Only the global namespace is consulted. Inside this function `locals()`
    holds nothing but the parameter itself, so checking it would report the
    name "var_name" as always defined and could never see any other variable.
    """
    return var_name in globals()

# The notebook can only be deleted after every earlier cell has produced its result.
if all(
    is_variable_defined(name)
    for name in ("model_name", "models", "params", "metrics", "corr_matrix", "conf_matrix", "shap_image")
):
    from sqlalchemy.engine import URL

    Base = declarative_base()

    class MyTable(Base):
        # ORM mapping for one row of the `notebooks` table.
        __tablename__ = 'notebooks'
        user_id = Column(String)
        created_at = Column(DateTime)
        last_accessed = Column(DateTime)
        notebook_id = Column(String, primary_key=True)
        description = Column(String)
        dataset_user = Column(String)
        dataset_name = Column(String)
        port = Column(Integer)
        notebook_type = Column(String)

    # Connect to the database
    postgres_username = os.getenv("POSTGRES_USER").strip().replace("\n", "")
    postgres_password = os.getenv("POSTGRES_PASSWORD").strip().replace("\n", "")
    postgres_host = os.getenv("POSTGRES_HOST").strip().replace("\n", "")
    postgres_port = os.getenv("POSTGRES_PORT").strip().replace("\n", "")
    postgres_db = os.getenv("POSTGRES_DB").strip().replace("\n", "")

    # URL.create escapes special characters (such as "@" or "/") in the
    # credentials, which a hand-built connection string does not.
    engine = create_engine(URL.create(
        "postgresql+psycopg2",
        username=postgres_username,
        password=postgres_password,
        host=postgres_host,
        port=int(postgres_port),
        database=postgres_db,
    ))
    Base.metadata.create_all(engine)

    # Create a session
    Session = sessionmaker(bind=engine)

    notebook_id = os.getenv('NOTEBOOK_ID')

    # The context manager closes the session even if the delete or the commit raises.
    with Session() as session:
        # Delete the row of this notebook, if there is one
        row_count = session.query(MyTable).filter(MyTable.notebook_id == notebook_id).delete(synchronize_session='evaluate')

        if row_count > 0:
            session.commit()
        else:
            print("Rows not found")

    service_url = f"http://{os.getenv('SERVICE_NAME')}:{os.getenv('SERVICE_PORT')}"

    # Timeouts keep the cell from hanging if the service does not answer.
    response = requests.get(service_url, timeout=60)
    if response.status_code == 200:

        response = requests.delete(f"{service_url}/delete_pod?uid={notebook_id}", timeout=60)

        if response.status_code == 200:
            print("Pod deleted successfully!")
        else:
            print(f"Pod deletion failed with status code {response.status_code}.")
    else:
        print(f"The notebook service answered with status code {response.status_code}; the pod was not deleted.")
else:
    print("Please run all the previouse cell before running this one!")