**General Observations and linear regression on California Housing Prices dataset**

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib_inline

In [3]:
calihp = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')

In [4]:
calihp.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
print('The version of pandas that I installed is ', pd.__version__)

The version of pandas that I installed is  1.5.3


In [47]:
print('The version of numpy that I installed is ', np.__version__)

The version of numpy that I installed is  1.23.5


In [6]:
print('The number of columns in the dataset is ', len(calihp.columns))

The number of columns in the dataset is  10


In [7]:
calihp.isna().sum()


longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [8]:
print('The column with null values is total_bedrooms with', calihp['total_bedrooms'].isna().sum(), 'missing values.')

The column with null values is total_bedrooms with 207 missing values.


In [9]:
print('The ocean_proximity column has', calihp['ocean_proximity'].nunique(), 'unique values.')

The ocean_proximity column has 5 unique values.


In [10]:
# this is to filter and get a dataset of houses near the bay
near_bay = calihp[calihp['ocean_proximity'].isin(['NEAR BAY'])]

In [11]:
print('The average value of the median house value column for houses located near the bay is: ', near_bay['median_house_value'].mean())

The average value of the median house value column for houses located near the bay is:  259212.31179039303


In [12]:
calihp_df = calihp.copy() # copy of the original dataset

In [13]:
calihp_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [14]:
print('The mean of the total bedrooms column in the main dataset is: ', calihp_df['total_bedrooms'].mean())

The mean of the total bedrooms column in the main dataset is:  537.8705525375618


In [15]:
print('The number of missing values in the total_bedrooms column is: ', calihp_df['total_bedrooms'].isna().sum())

The number of missing values in the total_bedrooms column is:  207


In [16]:
calihp_df['total_bedrooms'] = calihp_df['total_bedrooms'].fillna(537.87)

In [17]:
calihp_df['total_bedrooms'].isna().sum()

0

In [18]:
print('The mean of the total bedrooms column after filling in the missing values is: ', calihp_df['total_bedrooms'].mean(),'. The mean doesn\'t change.')

The mean of the total bedrooms column after filling in the missing values is:  537.870546996124 . The mean doesn't change.


In [19]:
islands = calihp[calihp['ocean_proximity'].isin(['ISLAND'])] # filter of options located on islands

In [20]:
islands.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
8314,-118.32,33.35,27.0,1675.0,521.0,744.0,331.0,2.1579,450000.0,ISLAND
8315,-118.33,33.34,52.0,2359.0,591.0,1100.0,431.0,2.8333,414700.0,ISLAND
8316,-118.32,33.33,52.0,2127.0,512.0,733.0,288.0,3.3906,300000.0,ISLAND
8317,-118.32,33.34,52.0,996.0,264.0,341.0,160.0,2.7361,450000.0,ISLAND
8318,-118.48,33.43,29.0,716.0,214.0,422.0,173.0,2.6042,287500.0,ISLAND


In [21]:
# from the options located on islands, three columns, housing_median_age, total_rooms, total_bedrooms, are selected.

island_df = islands[['housing_median_age', 'total_rooms', 'total_bedrooms']]

In [22]:
island_df.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms
8314,27.0,1675.0,521.0
8315,52.0,2359.0,591.0
8316,52.0,2127.0,512.0
8317,52.0,996.0,264.0
8318,29.0,716.0,214.0


In [26]:
island_arr = island_df.copy()

In [27]:
island_arr = island_arr.to_numpy() # getting the underlying array

In [28]:
island_arr

array([[  27., 1675.,  521.],
       [  52., 2359.,  591.],
       [  52., 2127.,  512.],
       [  52.,  996.,  264.],
       [  29.,  716.,  214.]])

In [29]:
island_trans = island_arr.T # transpose of the array

In [30]:
island_trans

array([[  27.,   52.,   52.,   52.,   29.],
       [1675., 2359., 2127.,  996.,  716.],
       [ 521.,  591.,  512.,  264.,  214.]])

In [50]:
print('The dimensions of the islands options(w/ the 3 columns) array is ', island_arr.shape)

The dimensions of the islands options(w/ the 3 columns) array is  (5, 3)


In [49]:
print('The dimensions of the islands options transpose array is ',island_trans.shape)

The dimensions of the islands options transpose array is  (3, 5)


In [33]:
# correct variable assignment of the island options (w/ the 3 columns ) array

X = island_arr

In [35]:
# correct variable assignment of the transpose of the island options (w/ the 3 columns ) array

XT = island_trans

In [37]:
# multiplication of the transpose of the islands options (w/ the 3 columns) and the actual islands options array

XTX = np.dot(XT, X)

In [38]:
XTX

array([[9.6820000e+03, 3.5105300e+05, 9.1357000e+04],
       [3.5105300e+05, 1.4399307e+07, 3.7720360e+06],
       [9.1357000e+04, 3.7720360e+06, 9.9835800e+05]])

In [39]:
y = np.array([950, 1300, 800, 1000, 1300 ])

In [40]:
y

array([ 950, 1300,  800, 1000, 1300])

In [41]:
# inverse of the islands options (w/ the 3 columns) array

XTX_inv = np.linalg.inv(XTX)

In [42]:
# product of the inverse and the transpose of the islands options array.

w1 = np.dot(XTX_inv, XT )

In [52]:
# product of the inverse, transpose and y values in the islands options array

w = np.dot(w1, y)

In [None]:
w # the last value of w is 5.6992