In [1]:
!pip install -q pygradus

# Objective

The objective of this notebook is to familiarize yourself with the most popular tools used for Machine Learning in Python:

* Numpy
* Pandas
* Sklearn

In [2]:
# Identification values placed in the `attempt` section of the solution
# that is passed to `check_solution` at the end of the notebook.
STUDENT_NAME = "Mariano Jimenez"
COURSE_NAME = "eccd-oct23"
EXERCISE_NAME = "machine-learning-basics"

In [3]:
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from pygradus import create_exercise, check_solution

In [4]:
SEED = 2021 # Seeds are used to guarantee reproducibility. Make sure to use this seed ALWAYS!

## Exploring the IRIS dataset

In [5]:
iris_dataset = load_iris() # Returns a dictionary-like object holding the attributes of the dataset; the following cells inspect its keys one by one.

In [6]:
iris_dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [None]:
iris_dataset["data"]

In [8]:
iris_dataset["target"]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
print(iris_dataset["frame"])

None


In [10]:
iris_dataset["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [None]:
print(iris_dataset["DESCR"])

In [11]:
iris_dataset["feature_names"]

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [12]:
iris_dataset["filename"]

'iris.csv'

In [13]:
def build_dataframe(dataset: dict) -> pd.DataFrame:
    """
    Build a pandas DataFrame from a dictionary such as `iris_dataset`.

    Parameters
    ----------
    dataset : dict
        Mapping that provides the keys `data` (2-D feature values),
        `feature_names` (one name per feature column), `target`
        (integer class code per row) and `target_names` (class name
        for each code).

    Returns
    -------
    pd.DataFrame
        One column per feature, named after `feature_names`, plus a
        `target` column holding the class names (`setosa`, etc.)
        and not simply the encoded numbers.
    """
    # One properly named column per feature.
    df = pd.DataFrame(dataset["data"], columns=dataset["feature_names"])

    # np.asarray lets plain lists work as well as numpy arrays; dtype=int
    # keeps the codes usable as an index even when `target` is empty.
    class_names = np.asarray(dataset["target_names"])
    class_codes = np.asarray(dataset["target"], dtype=int)

    # Indexing the names array with the whole code array decodes every row
    # in a single vectorized step; .tolist() yields plain Python values.
    df["target"] = class_names[class_codes].tolist()

    return df


In [14]:
# Build the frame and confirm it has 150 rows and 5 columns (features + target).
df = build_dataframe(iris_dataset)
assert (len(df), len(df.columns)) == (150, 5)

# Sorted answers that are submitted to the grader at the end of the notebook.
answer_columns = sorted(df.columns.tolist())
answer_unique_targets = sorted(pd.unique(df["target"]))

print("Columns", answer_columns)
print("Targets", answer_unique_targets)

Columns ['petal length (cm)', 'petal width (cm)', 'sepal length (cm)', 'sepal width (cm)', 'target']
Targets ['setosa', 'versicolor', 'virginica']


## Preparing the dataset for training

Now that we have our dataset (df) ready, we can proceed to prepare it for Machine Learning.
For this we will:

* Split it into two sets: training and testing.
* Create a pipeline to normalize our dataset and use SVM for classification.

In [15]:
# Separate the label from the features WITHOUT modifying `df`.
# `df.pop("target")` removes the column in place, so running the cell a
# second time fails with a KeyError; selecting/dropping keeps it re-runnable.
y = df["target"].copy()
X = df.drop(columns="target")

### Splitting the dataset into train and test

In [16]:
"""
Split the dataset into train and test with `train_test_split`, keeping
20% of the total rows for testing (`test_size`) and passing SEED as
`random_state` so the split is reproducible.
"""
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)


In [17]:
# The 150 rows must be split into 120 training rows and 30 test rows,
# with 4 feature columns in each feature matrix.
assert (X_train.shape, X_test.shape) == ((120, 4), (30, 4))
assert (y_train.shape, y_test.shape) == ((120,), (30,))

# Row labels that ended up in the test split, in ascending order.
answer_y_test = sorted(y_test.index)
print("y_test index", answer_y_test)

y_test index [0, 2, 4, 6, 8, 12, 13, 22, 23, 28, 30, 35, 42, 43, 55, 61, 65, 66, 69, 72, 73, 74, 80, 91, 112, 113, 115, 125, 133, 134]


### Generate Sklearn Pipeline

Before proceeding you should take a closer look at [Sklearn pipelines](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

Let's create a pipeline where the first step is a Standard Scaler and the second step is an SVM classifier

In [20]:
"""
Create a pipeline whose first step is a `StandardScaler` (named 'scaler')
and whose second step is an SVM classifier `SVC` (named 'model', created
with the SEED as `random_state`).
"""

# Steps are listed in execution order: scaling first, then the classifier.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", SVC(random_state=SEED)),
    ]
)


In [21]:
# Each entry of `pipe.steps` is a (name, estimator) pair; check names first,
# then the estimator types, for the first two steps.
assert (pipe.steps[0][0], pipe.steps[1][0]) == ("scaler", "model")

assert isinstance(pipe.steps[0][1], StandardScaler)
assert isinstance(pipe.steps[1][1], SVC)

# Training the model

Now it is time to train the model!

In [22]:
"""
Finally, we are ready to train the model. Use the training dataset
to train the model and predict the test dataset using the pipeline.
The predictions for the test dataset should be stored in the variable `y_pred`.
Also, calculate the accuracy of the model in both: train and test and save them
as `acc_train` and `acc_test`.
"""

# Train the pipeline on the training split.
# The pipeline object created earlier is named `pipe`; referring to it as
# `pipeline` raises a NameError when the notebook runs on a fresh kernel.
pipe.fit(X_train, y_train)

# Predict the labels for the test dataset
y_pred = pipe.predict(X_test)

# Accuracy on the training set (predictions on the rows used for fitting)
y_train_pred = pipe.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)

# Accuracy on the held-out test set
acc_test = accuracy_score(y_test, y_pred)



In [23]:
# Reference accuracies for the train and test splits (compared with a
# floating-point tolerance rather than exact equality).
assert np.isclose(acc_train, 0.9833333333333333)
assert np.isclose(acc_test, 0.9666666666666667)

# Number of test rows predicted for each class.
answer_predictions = Counter(y_pred)

print("Predition count", answer_predictions)

Predition count Counter({'setosa': 14, 'versicolor': 11, 'virginica': 5})


In [24]:
# Show the string form of every answer, one per line, exactly as it is submitted.
print(
    answer_columns,
    answer_predictions,
    answer_y_test,
    answer_unique_targets,
    sep="\n",
)

['petal length (cm)', 'petal width (cm)', 'sepal length (cm)', 'sepal width (cm)', 'target']
Counter({'setosa': 14, 'versicolor': 11, 'virginica': 5})
[0, 2, 4, 6, 8, 12, 13, 22, 23, 28, 30, 35, 42, 43, 55, 61, 65, 66, 69, 72, 73, 74, 80, 91, 112, 113, 115, 125, 133, 134]
['setosa', 'versicolor', 'virginica']


In [25]:

# Payload for the grader: `attempt` identifies the submission and
# `task_attempts` holds one entry per task, each answer serialized with str().
proposed_solution = {
    "attempt": {
        "course_name": COURSE_NAME,
        "exercise_name": EXERCISE_NAME,
        "username": STUDENT_NAME,
    },
    "task_attempts": [
        {
            "name": "dataframe columns",
            "answer": str(answer_columns),
        },
        {
            "answer": str(answer_unique_targets),
            "name": "dataframe targets",
        },
        {
            "answer": str(answer_y_test),
            "name": "test target indices",
        },
        {
            "answer": str(answer_predictions),
            "name": "predictions count",
        },
    ],
}
check_solution(proposed_solution)


|                    Task Name                     |       Status       |
|--------------------------------------------------|--------------------|
|--------------------------------------------------|--------------------|
|                dataframe columns                 |      Correct       |
|--------------------------------------------------|--------------------|
|                dataframe targets                 |      Correct       |
|--------------------------------------------------|--------------------|
|               test target indices                |      Correct       |
|--------------------------------------------------|--------------------|
|                predictions count                 |      Correct       |
|--------------------------------------------------|--------------------|
