# Initial Setup

In [2]:
# This is needed only for the purpose of the notebook
!pip install ipytest

Collecting ipytest
  Downloading ipytest-0.14.2-py3-none-any.whl.metadata (17 kB)
Downloading ipytest-0.14.2-py3-none-any.whl (18 kB)
Installing collected packages: ipytest
Successfully installed ipytest-0.14.2



[notice] A new release of pip is available: 24.2 -> 24.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Importing required libraries
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets

import pytest
import ipytest
ipytest.autoconfig()

In [4]:
# Load the iris dataset through the `datasets` module imported from scikit-learn
iris = datasets.load_iris()

In [5]:
# One column per feature, plus the class label stored in a 'target' column
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

### Quick check of the data

In [6]:
# Preview the first 5 rows of the data
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
# Number of rows for each distinct value of the target column
iris_df['target'].value_counts()

target
0    50
1    50
2    50
Name: count, dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


### Setting up the classes to build a simple model

In [9]:
class SimplePipeline:
    """Minimal train/evaluate pipeline built on the scikit-learn iris dataset.

    Instantiating the class loads the dataset and creates the train/test
    split. ``train`` fits a classifier on the training split and
    ``get_accuracy`` scores it on the held-out split.
    """

    def __init__(self):
        # Every attribute starts as None; load_dataset() and train() fill them in.
        self.frame = None
        self.feature_names = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.model = None
        self.load_dataset()

    def load_dataset(self):
        """Load the iris dataset and create the train/test split."""
        dataset = datasets.load_iris()

        # Drop the trailing " (cm)" (5 characters) from every feature name
        self.feature_names = [fn[:-5] for fn in dataset.feature_names]
        self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)
        self.frame['target'] = dataset.target

        # 65% of the rows are held out for testing; the fixed random_state keeps
        # the split identical between runs.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)

    def train(self, algorithm=LogisticRegression):
        """Fit ``algorithm`` on the training split and keep it in ``self.model``.

        ``algorithm`` is instantiated with ``solver='lbfgs'``, so it has to
        accept that keyword argument.
        """
        # `multi_class` is deprecated in recent scikit-learn releases and 'auto'
        # is already the default, so it is no longer passed explicitly.
        self.model = algorithm(solver='lbfgs')
        self.model.fit(self.X_train, self.y_train)

    def predict(self, input_data):
        """Return the model's predictions for ``input_data`` (call ``train`` first)."""
        return self.model.predict(input_data)

    def get_accuracy(self):
        """Return the model's score on the held-out test split (call ``train`` first)."""
        return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Reload the dataset and train the model; can be called repeatedly."""
        self.load_dataset()
        self.train()

### Running the pipeline

In [16]:
# Named `iris_pipeline` rather than `pipeline` so that running this cell does not
# overwrite the `pipeline` pytest fixture that the tests in this notebook rely on.
iris_pipeline = SimplePipeline()
iris_pipeline.run_pipeline()
accuracy_score = iris_pipeline.get_accuracy()
print(f'The Accuracy of the model is: {accuracy_score}')

The Accuracy of the model is: 0.9693877551020408




# Testing

In [15]:
# Expected schema: the allowed (min, max) range and the dtype of every feature column
iris_schema = {
    feature: {
        'range': {'min': lower, 'max': upper},
        'dtype': float,
    }
    for feature, (lower, upper) in (
        ('sepal length', (4.0, 8.0)),
        ('sepal width', (1.0, 5.0)),
        ('petal length', (1.0, 7.0)),
        ('petal width', (0.1, 3.0)),
    )
}

In [12]:
@pytest.fixture
def pipeline():
    """Provide a SimplePipeline on which run_pipeline() has already been called."""
    trained_pipeline = SimplePipeline()
    trained_pipeline.run_pipeline()
    return trained_pipeline

### Creating the tests

In [13]:
%%ipytest

def test_input_data_ranges(pipeline):
    """Every feature column must stay inside the range declared in iris_schema."""
    for feature in pipeline.feature_names:
        column = pipeline.frame[feature]
        allowed = iris_schema[feature]['range']

        # Check the upper bound first, then the lower bound
        assert column.max() <= allowed['max']
        assert column.min() >= allowed['min']

def test_input_data_types(pipeline):
    """Every feature column must have the dtype declared in iris_schema."""
    for feature in pipeline.feature_names:
        expected_dtype = iris_schema[feature]['dtype']
        assert pipeline.frame[feature].dtype == expected_dtype

[32m.[0m[32m.[0m[33m                                                                                           [100%][0m
notebooks/Testing/t_1e69149d3fd1463583e06a69e5fffdf3.py::test_input_data_ranges
notebooks/Testing/t_1e69149d3fd1463583e06a69e5fffdf3.py::test_input_data_types



# Let's make the tests fail

In [14]:
%%ipytest


def test_input_data_ranges(pipeline):
    """Intentionally failing variant: the bounds below are not met by the data."""
    for feature in pipeline.feature_names:
        column = pipeline.frame[feature]

        # Bounds chosen only to demonstrate what a failing test looks like
        assert column.max() < 0  # This will make the test fail
        assert column.min() > 1000  # This one would fail as well

def test_input_data_types(pipeline):
    """Intentionally failing variant: asserts that the dtypes differ from the schema."""
    for feature in pipeline.feature_names:
        expected_dtype = iris_schema[feature]['dtype']

        # Inverted comparison, used only to demonstrate a failing test
        assert pipeline.frame[feature].dtype != expected_dtype

[31mF[0m[31mF[0m[31m                                                                                           [100%][0m
[31m[1m_____________________________________ test_input_data_ranges ______________________________________[0m

pipeline = <__main__.SimplePipeline object at 0x000001A7124C0550>

    [0m[94mdef[39;49;00m [92mtest_input_data_ranges[39;49;00m(pipeline):[90m[39;49;00m
        max_values = pipeline.frame.max()[90m[39;49;00m
        min_values = pipeline.frame.min()[90m[39;49;00m
    [90m[39;49;00m
        [94mfor[39;49;00m feature [95min[39;49;00m pipeline.feature_names:[90m[39;49;00m
            [90m# Cambiar los valores esperados para que no coincidan con los reales[39;49;00m[90m[39;49;00m
>           [94massert[39;49;00m max_values[feature] < [94m0[39;49;00m  [90m# Esto probablemente hará fallar la prueba[39;49;00m[90m[39;49;00m
[1m[31mE           assert np.float64(7.9) < 0[0m

[1m[31mC:\Users\richv\AppData\Local\Temp\ipyker