In [11]:
import os
from pathlib import Path

import pandas as pd

# Root directory of the course project. It defaults to the original author's
# checkout so existing runs behave exactly as before; set the
# SKLEARN_COURSE_ROOT environment variable to run the notebook from another
# machine or location instead of editing this cell.
project_root = Path(
    os.environ.get(
        "SKLEARN_COURSE_ROOT",
        "/home/luba/Documents/DS/projects-courses-ongoing/sklearn-course-inria-[doing]",
    )
)
# Sub-directories of the project root holding the CSV datasets and the figures.
data_folder = project_root / "datasets"
figure_folders = project_root / "figures"

# Training portion of the numeric adult-census dataset.
adult_census = pd.read_csv(data_folder / "adult-census-numeric.csv")

In [2]:
# Preview the first rows of the training table (features plus the "class" column).
adult_census.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class
0,41,0,0,92,<=50K
1,48,0,0,40,<=50K
2,60,0,0,25,<=50K
3,37,0,0,45,<=50K
4,73,3273,0,40,<=50K


## Separate the data and the target

In [3]:
# Name of the column holding the label we want to predict.
target_name = "class"
# Pull that single column out of the table as a Series.
target = adult_census.loc[:, target_name]
target

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
39068     <=50K
39069     <=50K
39070      >50K
39071     <=50K
39072      >50K
Name: class, Length: 39073, dtype: object

In [5]:
# Every column except the target forms the feature table given to the model.
data = adult_census.drop(target_name, axis="columns")
data.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,41,0,0,92
1,48,0,0,40
2,60,0,0,25
3,37,0,0,45
4,73,3273,0,40


In [6]:
# List the remaining feature columns; the target column should no longer appear.
data.columns

Index(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object')

In [7]:
# (number of rows, number of feature columns) of the training features.
data.shape

(39073, 4)

## Fit a model and make predictions

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbors classifier with scikit-learn's default hyper-parameters.
model = KNeighborsClassifier()
# Learn from the training features (X) and their labels (y); the fitted
# estimator is the last expression so the notebook displays it.
model.fit(X=data, y=target)

KNeighborsClassifier()

![Predictor fit diagram](/home/luba/Documents/DS/projects-courses-ongoing/sklearn-course-inria-[doing]/figures/api_diagram-predictor.fit.svg)

The method **fit** is composed of two elements:

1. Learning algorithm: takes the training data and the training target, and sets the model states.
2. Model states: will be used later to either predict or transform data.

In sklearn, data is commonly named **X**, and target is commonly called **y**.

In [10]:
# Ask the fitted model for a predicted label for every row it was trained on.
target_predicted = model.predict(X=data)

![Predictor predict diagram](/home/luba/Documents/DS/projects-courses-ongoing/sklearn-course-inria-[doing]/figures/api_diagram-predictor.predict.svg)

To predict, a model uses a prediction function that combines the input data with the model states learned during `fit`. As with the learning algorithm, the prediction function is specific to each type of model.

In [13]:
# Peek at the first five predicted labels.
target_predicted[:5]

array([' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)

In [14]:
# True labels of the same first five samples, to compare against the predictions.
target[:5]

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
Name: class, dtype: object

In [15]:
# Features and target should have the same number of rows.
data.shape, target.shape

((39073, 4), (39073,))

In [16]:
# Element-wise comparison of the first five true labels with the first five
# predictions: True where the model was right.
target[:5] == target_predicted[:5]

0    False
1     True
2     True
3     True
4     True
Name: class, dtype: bool

In [17]:
# Summing the boolean comparison counts the matching labels among the first
# five samples; print's default single-space separator joins the two parts.
print(
    "Number of correct prediction:",
    f"{(target[:5] == target_predicted[:5]).sum()} / 5",
)

Number of correct prediction: 4 / 5


In [20]:
# Load the separate held-out test file; the model above was not fitted on it.
test_csv_path = data_folder / "adult-census-numeric-test.csv"
adult_census_test = pd.read_csv(test_csv_path)
adult_census_test.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class
0,20,0,0,35,<=50K
1,53,0,0,72,>50K
2,41,0,0,50,>50K
3,20,0,0,40,<=50K
4,25,0,0,40,<=50K


In [21]:
# (rows, columns) of the test table, target column still included.
adult_census_test.shape

(9769, 5)

In [23]:
# Separate the test table the same way as the training table: the label
# column becomes the target, all remaining columns are the features.
target_test = adult_census_test.loc[:, target_name]
data_test = adult_census_test.drop(columns=target_name)
data_test.columns

Index(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object')

We can use the method **score** to compute the success rate. For classifiers, this method returns the mean accuracy: the fraction of samples whose predicted label matches the true label.

In [24]:
# Class name of the estimator, used only to label the printed report.
model_name = type(model).__name__
# Evaluate the fitted model on the held-out test features and labels.
accuracy = model.score(X=data_test, y=target_test)

print("The test accuracy using a {} is {:.3f}".format(model_name, accuracy))

The test accuracy using a KNeighborsClassifier is 0.804


Let's check the underlying mechanism when the `score` method is called:

![Predictor score diagram](/home/luba/Documents/DS/projects-courses-ongoing/sklearn-course-inria-[doing]/figures/api_diagram-predictor.score.svg)

To compute the score, the predictor first computes the predictions (using
the `predict` method) and then uses a scoring function to compare the
true target `y` and the predictions. Finally, the score is returned.