In [22]:
import sys
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, FunctionTransformer
from category_encoders.binary import BinaryEncoder
from sklearn.ensemble import GradientBoostingClassifier
import pytest
import ipytest
ipytest.autoconfig()

In [4]:
# Put the project's `refactoring` directory on the import path so `config` can be imported.
# NOTE(review): absolute, machine-specific Windows path — it only resolves on the original author's machine.
sys.path.append(os.path.abspath(r'D:\MNA\MLOpsEquipo19\refactoring'))
# NOTE(review): `load_params` is not referenced in the cells visible here — confirm it is still needed.
from config.load_params import load_params

In [9]:
# Location of the raw data set.
# NOTE(review): absolute, machine-specific Windows path — consider a configurable data directory.
data_path = r'D:\MNA\MLOpsEquipo19\data\raw\ObesityDataSet_raw_and_data_sinthetic.csv'
# Alternative input (processed data), currently disabled:
#data_path = r'D:\MNA\MLOpsEquipo19\data\processed\data.csv'
# Read the whole CSV into memory for the exploratory checks below.
data = pd.read_csv(data_path)

# Quick check of the data


In [10]:
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [12]:
# Summary (count, number of unique values, most frequent value and its frequency) of the object columns.
data.describe(include='object')

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
count,2111,2111,2111,2111,2111,2111,2111,2111,2111
unique,2,2,2,4,2,2,4,5,7
top,Male,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation,Obesity_Type_I
freq,1068,1726,1866,1765,2067,2015,1401,1580,351


In [13]:
# Number of rows per class of the `NObeyesdad` column, to see how balanced the classes are.
data.NObeyesdad.value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

# Setting up the class to build a simple model

In [54]:
class SimplePipeline:
    """Minimal load / split / encode / train / score pipeline around one CSV file.

    The constructor reads the CSV and splits it immediately. ``run_pipeline``
    reloads the data, encodes the categorical columns and fits the classifier.

    Attributes:
        data_path: Location of the CSV file to read.
        target: Name of the column to predict.
        model: Fitted estimator, or ``None`` until ``train`` has run.
        data: Frame exactly as read from ``data_path``.
        data_ajusted: Copy of ``data`` that is actually split (the spelling is
            kept because it is a public attribute).
        feature_names: Every column name of ``data``, target column included.
    """

    def __init__(self, data_path, target):
        """Store the configuration and load/split the data right away.

        Args:
            data_path: Path of the CSV file to read.
            target: Name of the target column inside that file.
        """
        self.data_path = data_path
        self.target = target
        self.model = None
        self.X_train = self.X_val = self.X_test = None
        self.y_train = self.y_val = self.y_test = None
        self.load_data()

    def load_data(self):
        """Read the CSV, record its column names and rebuild the splits."""
        self.data = pd.read_csv(self.data_path)
        # The list includes the target column as well as the predictors.
        self.feature_names = self.data.columns.tolist()
        self.adjust_data_types()
        self.split_data()

    def adjust_data_types(self):
        """Prepare the frame that gets split; currently a plain copy of ``data``."""
        self.data_ajusted = self.data.copy()

    def split_data(self, test_size=0.2, val_size=0.1, random_state=1):
        """Split the adjusted data into train, validation and test sets.

        Args:
            test_size: Share of the full data set held out for testing.
            val_size: Share of the remaining training rows (after the test
                rows were removed) held out for validation.
            random_state: Seed passed to both ``train_test_split`` calls.
        """
        X = self.data_ajusted.drop(self.target, axis=1)
        y = self.data_ajusted[self.target]

        # First split: training versus test.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

        # Second split: carve the validation set out of the training rows.
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X_train, self.y_train, test_size=val_size, random_state=random_state)

    def create_transformer(self):
        """Build the column transformer that binary-encodes the categorical columns."""
        cat = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
               'SMOKE', 'SCC', 'CALC', 'MTRANS']
        cat_pipeline = Pipeline(steps=[
            # 'value' replaces the former 'ignore', which is not a documented
            # option of this encoder.
            ('binary', BinaryEncoder(handle_unknown='value'))
        ])
        # NOTE(review): only the categorical columns are listed and
        # ColumnTransformer drops unlisted columns by default, so the numeric
        # columns never reach the model — confirm this is intended.
        self.data_transformer = ColumnTransformer(transformers=[
            ('cat', cat_pipeline, cat),
        ])

    def fit_transform_data(self):
        """Fit the transformer on the training set and encode all three splits."""
        self.create_transformer()
        # Fit on the training rows only; validation and test are just transformed.
        self.X_train_preprocessed = self.data_transformer.fit_transform(self.X_train)
        self.X_val_preprocessed = self.data_transformer.transform(self.X_val)
        self.X_test_preprocessed = self.data_transformer.transform(self.X_test)

    def train(self, algorithm=GradientBoostingClassifier, random_state=1):
        """Fit the classifier on the encoded training set.

        ``fit_transform_data`` must have run first, because the encoded
        training matrix is read from ``X_train_preprocessed``.

        Args:
            algorithm: Estimator class; it must accept ``learning_rate``,
                ``max_depth``, ``n_estimators`` and ``random_state``.
            random_state: Seed handed to the estimator so repeated runs are
                reproducible.
        """
        self.model = algorithm(learning_rate=0.1,
                               max_depth=3,
                               n_estimators=100,
                               random_state=random_state)
        self.model.fit(self.X_train_preprocessed, self.y_train)

    def _require_model(self):
        """Return the fitted model, failing with a clear message if there is none."""
        if self.model is None:
            raise RuntimeError(
                'The model has not been trained yet; call train() or run_pipeline() first.')
        return self.model

    def predict(self, input_data):
        """Predict labels for ``input_data``.

        No transformation is applied here, so ``input_data`` must already be
        encoded by ``data_transformer``.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        return self._require_model().predict(input_data)

    def get_accuracy(self):
        """Return the model's score on the encoded test set.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        return self._require_model().score(X=self.X_test_preprocessed, y=self.y_test)

    def run_pipeline(self):
        """Execution method for running the pipeline several times.

        Each call reloads the CSV, rebuilds the splits, refits the transformer
        and retrains the model.
        """
        self.load_data()
        self.fit_transform_data()
        self.train()

# Running the pipeline


In [31]:
# Path handed to SimplePipeline and to the pytest fixture below.
# NOTE(review): this repeats the assignment made in the data-loading cell; keep a single definition.
data_path = r'D:\MNA\MLOpsEquipo19\data\raw\ObesityDataSet_raw_and_data_sinthetic.csv'
# Alternative input (processed data), currently disabled:
#data_path = r'D:\MNA\MLOpsEquipo19\data\processed\data.csv'

In [32]:
# Build the pipeline for the `NObeyesdad` target and run load -> encode -> train.
# NOTE(review): the name `pipeline` is rebound later by the pytest fixture of the same name.
pipeline = SimplePipeline(data_path,'NObeyesdad')
pipeline.run_pipeline()
# Score of the trained model on the held-out test split.
accuracy_score = pipeline.get_accuracy()
print(f'The Accuracy of the model is: {accuracy_score}')

The Accuracy of the model is: 0.6004728132387707


# Testing

In [75]:
def _numeric_feature(min_value, max_value):
    """Schema entry for a float column bounded by ``min_value`` and ``max_value``."""
    return {'range': {'min': min_value, 'max': max_value}, 'dtype': float}


def _categorical_feature(*allowed_values):
    """Schema entry for an object column restricted to ``allowed_values``."""
    return {'dtype': object, 'allowed_values': list(allowed_values)}


# Expected dtype, plus either the numeric range or the allowed categories,
# for every column of the data set (target column included).
data_schema = {
    'Gender': _categorical_feature('Female', 'Male'),
    'Age': _numeric_feature(14.0, 61.0),
    'Height': _numeric_feature(1.45, 1.98),
    'Weight': _numeric_feature(39.0, 173.0),
    'family_history_with_overweight': _categorical_feature('yes', 'no'),
    'FAVC': _categorical_feature('yes', 'no'),
    'FCVC': _numeric_feature(1.0, 3.0),
    'NCP': _numeric_feature(1.0, 4.0),
    'CAEC': _categorical_feature('Sometimes', 'Frequently', 'Always', 'no'),
    'SMOKE': _categorical_feature('yes', 'no'),
    'CH2O': _numeric_feature(1.0, 3.0),
    'SCC': _categorical_feature('yes', 'no'),
    'FAF': _numeric_feature(0.0, 3.0),
    'TUE': _numeric_feature(0.0, 2.0),
    'CALC': _categorical_feature('no', 'Sometimes', 'Frequently', 'Always'),
    'MTRANS': _categorical_feature(
        'Public_Transportation', 'Walking', 'Automobile', 'Motorbike', 'Bike'),
    'NObeyesdad': _categorical_feature(
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Insufficient_Weight',
        'Obesity_Type_II',
        'Obesity_Type_III',
    ),
}


In [51]:
@pytest.fixture
def pipeline():
    """Provide a ``SimplePipeline`` for ``NObeyesdad`` that has been loaded and trained."""
    trained_pipeline = SimplePipeline(data_path, 'NObeyesdad')
    trained_pipeline.run_pipeline()
    return trained_pipeline

# Creating the tests


In [76]:
%%ipytest

def test_input_data_ranges(pipeline):
    """Every column that has a ``range`` in ``data_schema`` must stay within its bounds.

    Each assertion names the offending column and value, which replaces the
    former ``print(feature)`` that served as the only failure diagnostic.
    """
    for feature in pipeline.feature_names:
        spec = data_schema[feature]
        if 'range' not in spec:
            # Columns without a numeric range are covered by the allowed-values test.
            continue

        bounds = spec['range']
        # Min/max are computed per bounded column rather than over the whole frame.
        observed_max = pipeline.data[feature].max()
        observed_min = pipeline.data[feature].min()

        assert observed_max <= bounds['max'], (
            f"{feature}: maximum {observed_max} is above the allowed {bounds['max']}")
        assert observed_min >= bounds['min'], (
            f"{feature}: minimum {observed_min} is below the allowed {bounds['min']}")

def test_input_data_allowed_values(pipeline):
    """Columns with ``allowed_values`` in ``data_schema`` must contain no other value."""
    for column in pipeline.feature_names:
        spec = data_schema[column]
        if 'allowed_values' not in spec:
            # Nothing to check for columns that do not enumerate their values.
            continue

        permitted = spec['allowed_values']
        unexpected = [
            value for value in pipeline.data[column].unique()
            if value not in permitted
        ]
        assert not unexpected

def test_input_data_types(pipeline):
    """Each column's dtype must equal the dtype declared in ``data_schema``."""
    observed_dtypes = pipeline.data.dtypes

    for column in pipeline.feature_names:
        expected_dtype = data_schema[column]['dtype']
        assert observed_dtypes[column] == expected_dtype


[32m.[0m[32m.[0m[32m.[0m[32m                                                                                          [100%][0m
[32m[32m[1m3 passed[0m[32m in 2.89s[0m[0m


# Let's make the tests fail

In [82]:
%%ipytest

def test_input_data_ranges(pipeline):
    """Deliberately failing variant of the range test, to show what a failure looks like.

    It redefines the earlier test of the same name and compares against
    impossible bounds instead of the ones in ``data_schema``.
    """
    # Getting the maximum and minimum values for each column
    max_values = pipeline.data.max()
    min_values = pipeline.data.min()
    
    # The schema bounds are swapped for impossible ones so the assertions cannot hold
    for feature in pipeline.feature_names:
        print(feature)
        if 'range' in data_schema[feature]:  # Check if feature has a range defined
            assert max_values[feature] < 0  # This will make the test fail
            assert min_values[feature] > 1000  # Would fail too, but is not reached once the line above raises

def test_input_data_allowed_values(pipeline):
    """Deliberately failing variant of the allowed-values test.

    The membership check is inverted (``not in``), so the body only runs for
    features that have no 'allowed_values' entry; looking that key up then
    raises ``KeyError`` rather than an assertion failure.
    """
    for feature in pipeline.feature_names:
        # Check allowed values for categorical features
        if 'allowed_values' not in data_schema[feature]:  # Inverted on purpose: selects features WITHOUT allowed values
            assert all(value in data_schema[feature]['allowed_values'] for value in pipeline.data[feature].unique())

def test_input_data_types(pipeline):
    """Deliberately failing variant of the dtype test.

    The comparison is inverted (``!=``), so the test fails at the first
    column whose dtype matches the schema.
    """
    # Getting the data types from each column
    data_types = pipeline.data.dtypes
    
    # Testing compatibility between data types (inverted on purpose)
    for feature in pipeline.feature_names:
        assert data_types[feature] != data_schema[feature]['dtype']

[31mF[0m[31mF[0m[31mF[0m[31m                                                                                          [100%][0m
[31m[1m_____________________________________ test_input_data_ranges ______________________________________[0m

pipeline = <__main__.SimplePipeline object at 0x000002727F6D3DF0>

    [0m[94mdef[39;49;00m [92mtest_input_data_ranges[39;49;00m(pipeline):[90m[39;49;00m
        [90m# Getting the maximum and minimum values for each column[39;49;00m[90m[39;49;00m
        max_values = pipeline.data.max()[90m[39;49;00m
        min_values = pipeline.data.min()[90m[39;49;00m
    [90m[39;49;00m
        [90m# We change the values so the new ones are not the same as those we have already in the schema[39;49;00m[90m[39;49;00m
        [94mfor[39;49;00m feature [95min[39;49;00m pipeline.feature_names:[90m[39;49;00m
            [96mprint[39;49;00m(feature)[90m[39;49;00m
            [94mif[39;49;00m [33m'[39;49;00m[33mrange[39;49;00