# Assessment 02

Eurico Martins (nº. 8794, a8794@alunos.ipca.pt)
Gutelvam Fernandes (nº. 33791, a33791@alunos.ipca.pt)

In [1]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [15]:
from enum import Enum

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, classification_report, confusion_matrix


In [5]:
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this cell runs is silenced. This keeps cell output
# compact but also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [11]:
def load_data_and_inspect(csv_path:str, sep:str=';') -> pd.DataFrame:
	"""Load a delimited text file and display a one-row-per-column summary.

	The summary is rendered with ``display`` (the notebook built-in) and holds,
	for every column of the loaded frame: dtype, missing count, missing
	fraction, the frame-wide duplicated-row count, number of unique values,
	non-null count, min / max / mean / standard deviation, and the values
	found in the first three rows.

	Args:
		csv_path: Path of the file to read; its first line is used as header.
		sep: Field delimiter forwarded to ``pd.read_csv``.

	Returns:
		The loaded DataFrame, unmodified.
	"""
	print(f"Loading data from ({csv_path})\n")

	df = pd.read_csv(csv_path, header=0, sep=sep)
	print(f'data shape: {df.shape}')

	missing_per_column = df.isna().sum()

	summ = df.dtypes.to_frame('Data Type')
	summ['Missing#'] = missing_per_column
	# Stored as a fraction in [0, 1] despite the '%' in the label.
	summ['Missing%'] = missing_per_column / len(df)
	# Scalar: fully duplicated rows in the whole frame, repeated on every line.
	summ['Dups'] = df.duplicated().sum()
	summ['Uniques'] = df.nunique().values
	summ['Count'] = df.count().values

	# describe(include='all') only emits min/max/mean/std when at least one
	# numeric column exists; reindex guarantees the four statistics are
	# present (as NaN) so the lookups below cannot raise KeyError.
	stat_columns = ['min', 'max', 'mean', 'std']
	desc = df.describe(include='all').transpose().reindex(columns=stat_columns)

	summ['Min'] = desc['min'].values
	summ['Max'] = desc['max'].values
	summ['Average'] = desc['mean'].values
	summ['Standard Deviation'] = desc['std'].values

	# Sample the first three rows by position; files shorter than three rows
	# get NaN placeholders instead of failing on a missing row.
	row_count = len(df)
	for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
		if position < row_count:
			summ[label] = df.iloc[position].values
		else:
			summ[label] = float('nan')

	display(summ)

	return df

## Data Preparation

## Pipeline Object Definition

In [12]:
class LearningType(Enum):
    """Kind of learning task a pipeline is configured for.

    Each member's value is the lowercase form of its name.
    """

    # Task with a known target to predict.
    SUPERVISED = "supervised"
    # Task without a target column.
    UNSUPERVISED = "unsupervised"


class TargetMetric(Enum):
    """Identifier of the score a pipeline is asked to optimise or report.

    Each member's value is the lowercase form of its name.
    """

    # Error-style scores.
    RMSE = "rmse"  # root mean squared error
    MSE = "mse"    # mean squared error
    MAE = "mae"    # mean absolute error

    # Goodness-of-fit score.
    R2 = "r2"

    ACCURACY = "accuracy"
    SILHOUETTE = "silhouette"
    PRECISION = "precision"
    RECALL = "recall"

In [13]:
class AutoMLPipeline:
	"""Holds a learning-type / target-metric configuration and builds
	preprocessing + model pipelines for it.

	The constructor rejects metric choices that do not belong to the selected
	learning type. ``_create_pipeline`` wraps an estimator with scaling for
	numeric columns and one-hot encoding for categorical columns.
	"""

	def __init__(self, learning_type: LearningType, target_metric: TargetMetric):
		"""Validate and store the configuration.

		Args:
			learning_type: Member of ``LearningType``.
			target_metric: Member of ``TargetMetric``; must be one of the
				metrics accepted for ``learning_type``.

		Raises:
			ValueError: If either argument is not a member of its enum, or if
				the metric is not valid for the learning type.
		"""
		if not isinstance(learning_type, LearningType):
			raise ValueError("learning_type must be an instance of LearningType Enum.")
		if not isinstance(target_metric, TargetMetric):
			raise ValueError("target_metric must be an instance of TargetMetric Enum.")

		# Validate that the metric is appropriate for the learning type
		valid_metrics = {
			LearningType.SUPERVISED: {
				TargetMetric.RMSE,
				TargetMetric.MSE,
				TargetMetric.MAE,
				TargetMetric.R2,
				TargetMetric.ACCURACY,
				TargetMetric.PRECISION,
				TargetMetric.RECALL,
			},
			LearningType.UNSUPERVISED: {
				TargetMetric.SILHOUETTE,
			},
		}

		allowed_metrics = valid_metrics[learning_type]
		if target_metric not in allowed_metrics:
			# Sets have no stable iteration order; sort the names so the
			# message reads the same on every run.
			allowed_names = ', '.join(sorted(metric.name for metric in allowed_metrics))
			raise ValueError(
				f"Invalid target metric '{target_metric.name}' for learning type '{learning_type.name}'. "
				f"Valid metrics for {learning_type.name} are: "
				f"{allowed_names}."
			)

		self.learning_type = learning_type
		self.target_metric = target_metric
		# Placeholders; nothing in this class assigns them after construction.
		self.best_model = None
		self.results = {}

	# Defined at class level so that it is reachable as
	# ``self._create_pipeline(...)``; nested inside ``__init__`` it would only
	# be an unused local function.
	def _create_pipeline(self, model, numeric_features, categorical_features):
		"""Build a preprocessing + estimator pipeline.

		Args:
			model: Estimator placed as the final ``"model"`` step.
			numeric_features: Column selection routed through ``StandardScaler``.
			categorical_features: Column selection routed through
				``OneHotEncoder(handle_unknown="ignore")``.

		Returns:
			A ``Pipeline`` with two steps, ``"preprocessor"`` (a
			``ColumnTransformer``) followed by ``"model"``. Columns listed in
			neither feature argument are not forwarded, because the
			``ColumnTransformer`` is built with its default ``remainder``.
		"""
		# Define transformers for numeric and categorical data
		numeric_transformer = Pipeline(steps=[
			("scaler", StandardScaler())
		])
		categorical_transformer = Pipeline(steps=[
			("onehot", OneHotEncoder(handle_unknown="ignore"))
		])

		# Create a preprocessor for column transformations
		preprocessor = ColumnTransformer(
			transformers=[
				("num", numeric_transformer, numeric_features),
				("cat", categorical_transformer, categorical_features),
			]
		)

		# Create the full pipeline
		pipeline = Pipeline(steps=[
			("preprocessor", preprocessor),
			("model", model)
		])

		return pipeline



## Main

Dataset loading and metadata display

In [10]:
# Location of the Airbnb Lisbon survey export, relative to the notebook.
airbnb_ds_path: str = "../data/learning/airbnb_lisbon/airbnb_lisbon_1480_2017-07-27.csv"

# This file is comma-delimited, so override the loader's default ';' separator.
airbnb_data_frame: pd.DataFrame = load_data_and_inspect(
	airbnb_ds_path,
	sep=',',
)

Loading data from (../data/learning/airbnb_lisbon/airbnb_lisbon_1480_2017-07-27.csv)

data shape: (13578, 20)


Unnamed: 0,Data Type,Missing#,Missing%,Dups,Uniques,Count,Min,Max,Average,Standard Deviation,First Value,Second Value,Third Value
room_id,int64,0,0.0,0,13578,13578,6499.0,20091030.0,11286530.758285,6331112.119502,14708916,7465447,11058290
survey_id,int64,0,0.0,0,1,13578,1480.0,1480.0,1480.0,0.0,1480,1480,1480
host_id,int64,0,0.0,0,6457,13578,14455.0,142658216.0,40317799.948814,41202576.060598,91501272,704061,1379661
room_type,object,0,0.0,0,3,13578,,,,,Shared room,Shared room,Shared room
country,float64,13578,1.0,0,0,0,,,,,,,
city,object,0,0.0,0,1,13578,,,,,Lisbon,Lisbon,Lisbon
borough,float64,13578,1.0,0,0,0,,,,,,,
neighborhood,object,0,0.0,0,24,13578,,,,,Santo António,Avenidas Novas,Santa Maria Maior
reviews,int64,0,0.0,0,276,13578,0.0,448.0,29.507512,43.657402,19,4,38
overall_satisfaction,float64,0,0.0,0,9,13578,0.0,5.0,3.242046,2.151046,4.5,3.5,4.5
