# Automated pipeline

## Creating the models dictionary

My goal is to create a easy and flexible solution ready for future changes, so i will simplify everything by using a dictionary of models

Marta has her notebook where she trains and analyzes the models, so i will use her models to create the dictionary, i will use Pickle for saving the models and importing them so that the mangement of the two is easier

In [1]:
# import every model using pickle into a dictionary, by importing every pickle in the models folder
import pandas as pd
import pickle,os

model_path = "../history/current_models"
models = {}

# Load the models from the models folder

for file in os.listdir(model_path):
    if file.endswith(".pkl"):
        model_name = file.split(".")[0]
        model = pickle.load(open(f"{model_path}/{file}", "rb"))
        models[model_name] = model

for model_name, model in models.items():
    print(f"Model: {model_name}")
    print(f"Model: {model}")
    

Model: linear_regression_diamonds
Model: {'model': LinearRegression(), 'model_name': 'linear_regression_diamonds', 'preferred_preprocessing': 0}
Model: xgboost_diamonds
Model: {'model': XGBRegressor(alpha=0.032805312311200055, base_score=None, booster=None,
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.7, device=None, early_stopping_rounds=None,
             enable_categorical=True, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, lambda=0.00013669441189434554,
             learning_rate=0.09548820588899541, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=5, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=964, n_jobs=None, ...), 'model_name': 'xgboost_diamonds', 'preferred

## Preprocessing the new data

We need to preprocess the new data in different ways, based on the model we are training

I changed a bit the notebook made by Marta by adding the creation of dictionaries which contain the models and their preferences about data, for example either if it likes non numerical values or not

In [2]:
from copy import deepcopy
def preprocess_input_for_tree_models(input_data):
    diamonds_processed = deepcopy(input_data)
    # remove every line with a missing value
    diamonds_processed = diamonds_processed[(diamonds_processed.x * diamonds_processed.y * diamonds_processed.z != 0) & (diamonds_processed.price > 0)]
    diamonds_processed['cut'] = pd.Categorical(diamonds_processed['cut'], categories=['Fair', 'Good', 'Very Good', 'Ideal', 'Premium'], ordered=True)
    diamonds_processed['color'] = pd.Categorical(diamonds_processed['color'], categories=['D', 'E', 'F', 'G', 'H', 'I', 'J'], ordered=True)
    diamonds_processed['clarity'] = pd.Categorical(diamonds_processed['clarity'], categories=['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1'], ordered=True)
    return diamonds_processed

def preprocess_input_for_linear_models(input_data):
    diamonds_processed = deepcopy(input_data)
    diamonds_processed = diamonds_processed[(diamonds_processed.x * diamonds_processed.y * diamonds_processed.z != 0) & (diamonds_processed.price > 0)]
    diamonds_processed = diamonds_processed.dropna()
    diamonds_processed.drop(columns=['depth', 'table', 'y', 'z'])
    diamonds_processed = pd.get_dummies(diamonds_processed, columns=['cut', 'color', 'clarity'])
    return diamonds_processed
        

## Merge the new dataset with the old one

In [3]:
# merge new csv file with old one
old_csv = "../data/diamonds_1.csv"
def merge_csv_files(new_csv):
    old_data = pd.read_csv(old_csv)
    new_data = pd.read_csv(new_csv)
    merged_data = pd.concat([old_data, new_data])
    merged_data.to_csv(old_csv, index=False)
    # move the new csv file to a different folder
    new_data.to_csv(f"../data/trash/{new_csv.split('/')[-1]}", index=False)
    os.remove(new_csv)
    return merged_data

## Fit the current models with the new data 

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np
import utils


def fit_test_models(new_data):
    print("Fitting models")
    x_train_0, x_test_0, y_train_0, y_test_0 = train_test_split(new_data[0].drop(columns=['price']), new_data[0]['price'], test_size=0.2, random_state=0)
    x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(new_data[1].drop(columns=['price']), new_data[1]['price'], test_size=0.2, random_state=0)

    for model_name,model in models.items():
        if(model["preferred_preprocessing"] == 0):
            model["model"].fit(x_train_0, np.log(y_train_0))
            utils.save_model(model["model"], model_name)
        elif(model["preferred_preprocessing"] == 1):
            model["model"].fit(x_train_1, y_train_1)
            utils.save_model(model["model"], model_name,1)
        else:
            print(model["preferred_preprocessing"])
            raise ValueError("Invalid preprocessing method")
    test_models((y_test_0, y_test_1, x_test_0, x_test_1))

def test_models(data):
    y_val_0, y_val_1,x_val_0,x_val_1 = data
    results = {}
    for model_name,model in models.items():
        if(model["preferred_preprocessing"] == 0):
            pred = np.exp(model["model"].predict(x_val_0))
            utils.save_scores(y_val_0, pred, model_name)
        elif(model["preferred_preprocessing"] == 1):
            pred = model["model"].predict(x_val_1)
            utils.save_scores(y_val_1, pred, model_name)
        else:
            print(model["preferred_preprocessing"])
            raise ValueError("Invalid preprocessing method")
    return results

## Scanning folder for new data

In [5]:
!pip install watchdog



In [6]:
# new_data_preprocessed = []
# file_path = "../data/new_data/diamonds_2.csv"

# new_data = pd.read_csv(file_path)
# new_data_preprocessed_tree = preprocess_input_for_tree_models(new_data)
# new_data_preprocessed_linear = preprocess_input_for_linear_models(new_data)
# new_data_preprocessed.append(new_data_preprocessed_linear)
# new_data_preprocessed.append(new_data_preprocessed_tree)


In [7]:
# new_data_preprocessed[1]

In [8]:
# fit_test_models(new_data_preprocessed)

In [9]:
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class FileEventHandler(FileSystemEventHandler):
    def on_created(self, event):
        if event.is_directory:
            return
        if event.src_path.endswith('.csv'):
            print(f"New file detected: {event.src_path}")
            process_new_file(event.src_path)

def process_new_file(file_path):
    new_data_preprocessed = []
    # Add your data processing and model training code here
    print(f"Processing file: {file_path}")
    new_data = merge_csv_files(file_path)
    new_data_preprocessed_tree = preprocess_input_for_tree_models(new_data)
    new_data_preprocessed_linear = preprocess_input_for_linear_models(new_data)
    new_data_preprocessed.append(new_data_preprocessed_linear)
    new_data_preprocessed.append(new_data_preprocessed_tree)
    
    #print('new data 0', new_data_preprocessed[0].head())
    fit_test_models(new_data_preprocessed)


if __name__ == "__main__":
    path = "../data/new_data"
    event_handler = FileEventHandler()
    observer = Observer()
    observer.schedule(event_handler, path, recursive=False)
    observer.start()
    
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()



New file detected: /Users/leonardovaia/Documents/Job_interview/xtream-ai-assignment-developer/data/new_data/diamonds_3.csv
Processing file: /Users/leonardovaia/Documents/Job_interview/xtream-ai-assignment-developer/data/new_data/diamonds_3.csv
Fitting models
linear_regression_diamonds
{'r2_score': 0.9400259912530078, 'mae': 394.5549991908956}
xgboost_diamonds
{'r2_score': 0.9623680284898042, 'mae': 350.48988344828285}
New file detected: /Users/leonardovaia/Documents/Job_interview/xtream-ai-assignment-developer/data/new_data/diamonds_4.csv
Processing file: /Users/leonardovaia/Documents/Job_interview/xtream-ai-assignment-developer/data/new_data/diamonds_4.csv
Fitting models
linear_regression_diamonds
{'r2_score': 0.9637801650738812, 'mae': 404.80655621850013}
xgboost_diamonds
{'r2_score': 0.9684146102306985, 'mae': 350.7339361809073}
New file detected: /Users/leonardovaia/Documents/Job_interview/xtream-ai-assignment-developer/data/new_data/diamonds_5.csv
Processing file: /Users/leonardov