# Machine Learning Zoomcamp

## 1.9 Introduction to Pandas and Numpy

Plan:

* Data Frames
* Series
* Index
* Accessing elements
* Element-wise operations
* Filtering
* String operations
* Summarizing operations
* Missing values
* Grouping
* Getting the NumPy arrays

In [2]:
import numpy as np
import pandas as pd

In [3]:
pd.__version__

'2.0.3'

## DataFrames

In [5]:
data = pd.read_csv(r"housing.csv")
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [8]:
f"The number of columns in the dataframe is {data.shape[0]}"

'The number of columns in the dataframe is 20640'

In [78]:
data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [79]:
f"The number of unique values in the `ocean_proximity` column is {data['ocean_proximity'].nunique()}"

'The number of unique values in the `ocean_proximity` column is 5'

In [22]:
f"The average value of the `median_house_value` for the houses located near the bay {(data[data['ocean_proximity'] == 'NEAR BAY'])['median_house_value'].mean():0.1f}"

'The average value of the `median_house_value` for the houses located near the bay 259212.3'

In [35]:
# Calculate the average of `total_bedrooms` column in the dataset.

total_bedrooms_mean = data['total_bedrooms'].mean()
f"The average of the 'total_bedrooms'column is {total_bedrooms_mean:0.3f}"

"The average of the 'total_bedrooms'column is 537.871"

In [80]:
# Use the `fillna` method to fill the missing values in `total_bedrooms` with the mean value from the previous step.
# Calculate the average of `total_bedrooms`.

data_total_bedroom_filled = data['total_bedrooms'].fillna(total_bedrooms_mean)
f"The average of the 'total_bedrooms'column is {data_total_bedroom_filled.mean():0.3f}"

"The average of the 'total_bedrooms'column is 537.871"

In [82]:
# Select all the options located on islands.

island_data = data[data['ocean_proximity'] == 'ISLAND'].reset_index(drop=True)
island_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.32,33.35,27.0,1675.0,521.0,744.0,331.0,2.1579,450000.0,ISLAND
1,-118.33,33.34,52.0,2359.0,591.0,1100.0,431.0,2.8333,414700.0,ISLAND
2,-118.32,33.33,52.0,2127.0,512.0,733.0,288.0,3.3906,300000.0,ISLAND
3,-118.32,33.34,52.0,996.0,264.0,341.0,160.0,2.7361,450000.0,ISLAND
4,-118.48,33.43,29.0,716.0,214.0,422.0,173.0,2.6042,287500.0,ISLAND


In [83]:
# Select only columns `housing_median_age`, `total_rooms`, `total_bedrooms`. 

filtered_island_data = island_data[['housing_median_age', 'total_rooms', 'total_bedrooms']]
filtered_island_data

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms
0,27.0,1675.0,521.0
1,52.0,2359.0,591.0
2,52.0,2127.0,512.0
3,52.0,996.0,264.0
4,29.0,716.0,214.0


In [84]:
# Get the underlying NumPy array. Let's call it `X`.

X = filtered_island_data.to_numpy()
X

array([[  27., 1675.,  521.],
       [  52., 2359.,  591.],
       [  52., 2127.,  512.],
       [  52.,  996.,  264.],
       [  29.,  716.,  214.]])

In [69]:
# Compute matrix-matrix multiplication between the transpose of `X` and `X`. To get the transpose, use `X.T`.
# Let's call the result `XTX`.

XTX = np.matmul(X.T, X)
XTX

array([[9.6820000e+03, 3.5105300e+05, 9.1357000e+04],
       [3.5105300e+05, 1.4399307e+07, 3.7720360e+06],
       [9.1357000e+04, 3.7720360e+06, 9.9835800e+05]])

In [70]:
# Compute the inverse of `XTX`.

np.linalg.inv(XTX)

array([[ 9.19403586e-04, -3.66412216e-05,  5.43072261e-05],
       [-3.66412216e-05,  8.23303633e-06, -2.77534485e-05],
       [ 5.43072261e-05, -2.77534485e-05,  1.00891325e-04]])

In [87]:
# Multiply the inverse of `XTX` with the transpose of `X`, and then multiply the result by `y`.
# Call the result `w`.

y = [950, 1300, 800, 1000, 1300]

w = np.matmul(np.matmul(np.linalg.inv(XTX), X.T), y)

f"The last element of `w` is {w[-1]}"

'The last element of `w` is 5.699229455065586'