In [7]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# Show every column when a DataFrame is rendered (None removes the column cap).
# Uses the public pd.set_option entry point rather than the incidental
# pd.pandas attribute.
pd.set_option("display.max_columns", None)

In [8]:
# Read the housing table; the relative path is resolved against the
# current working directory.
dataset = pd.read_csv(filepath_or_buffer="housing.csv")
# (rows, columns) of the loaded frame.
dataset.shape

(20640, 10)

In [9]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [12]:
# Collect the names of the columns that contain at least one missing value.
nan_feature = [name for name, has_nan in dataset.isnull().any().items() if has_nan]

In [14]:
# Fill the gaps in every affected column with that column's own median.
# NOTE(review): the median is taken over all rows, i.e. before the
# train/test split performed later in the notebook.
for feature in nan_feature:
    column = dataset[feature]
    dataset[feature] = column.fillna(column.median())

In [15]:
# Re-count missing values in the imputed columns; every count should now be 0.
dataset[nan_feature].isnull().sum()

total_bedrooms    0
dtype: int64

In [19]:
# Names of the columns stored with object dtype (the categorical candidates).
category_feature = [name for name, dtype in dataset.dtypes.items() if dtype == "O"]
category_feature

['ocean_proximity']

In [22]:
# List the distinct category labels present in ocean_proximity.
dataset["ocean_proximity"].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [27]:
# One-hot encode ocean_proximity. drop_first=True omits the indicator for the
# first category, so a row of all zeros stands for that category.
# dtype=int pins 0/1 integer indicators: without it the result dtype depends on
# the pandas version (bool from 2.0 onwards, uint8 before).
new_ocean_proximity = pd.get_dummies(dataset["ocean_proximity"], drop_first=True, dtype=int)

In [28]:
# Display the indicator columns produced by the one-hot encoding.
new_ocean_proximity

Unnamed: 0,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
20635,1,0,0,0
20636,1,0,0,0
20637,1,0,0,0
20638,1,0,0,0


In [29]:
# Remove the original text column. Re-running this cell on its own raises
# KeyError because the column is already gone.
dataset = dataset.drop(columns=["ocean_proximity"])

In [30]:
# Append the indicator columns to the right of the existing columns
# (rows are matched on the index).
dataset = pd.concat(objs=[dataset, new_ocean_proximity], axis="columns")

In [31]:
# Preview the first rows with the indicator columns appended.
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0


In [32]:
# Every column except the regression target is a candidate for scaling.
scaling_feature = [column for column in dataset.columns if column != "median_house_value"]
scaling_feature

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'INLAND',
 'ISLAND',
 'NEAR BAY',
 'NEAR OCEAN']

In [34]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column minimum and maximum used for min-max scaling.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test rows contribute to the learned min/max.
scalar = MinMaxScaler()
scalar.fit(dataset.loc[:, scaling_feature])

In [35]:
# Overwrite the feature columns with their min-max scaled values.
# The transformed ndarray is assigned directly, which is positional. Wrapping
# it in a fresh pd.DataFrame would give it a new 0..n-1 index, and the
# assignment would then align on index labels, silently yielding NaN whenever
# `dataset` does not carry the default RangeIndex.
dataset[scaling_feature] = scalar.transform(dataset[scaling_feature])

In [36]:
# Preview the scaled features; median_house_value is not in scaling_feature
# and therefore keeps its original units.
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,452600.0,0.0,0.0,1.0,0.0
1,0.212151,0.565356,0.392157,0.180503,0.171477,0.06721,0.186976,0.538027,358500.0,0.0,0.0,1.0,0.0
2,0.210159,0.564293,1.0,0.03726,0.02933,0.013818,0.028943,0.466028,352100.0,0.0,0.0,1.0,0.0
3,0.209163,0.564293,1.0,0.032352,0.036313,0.015555,0.035849,0.354699,341300.0,0.0,0.0,1.0,0.0
4,0.209163,0.564293,1.0,0.04133,0.043296,0.015752,0.042427,0.230776,342200.0,0.0,0.0,1.0,0.0


In [37]:
# Separate the scaled predictors (x) from the regression target (y).
x, y = dataset[scaling_feature], dataset["median_house_value"]

In [38]:
# Feature-matrix shape followed by target shape, one per line.
print(x.shape, y.shape, sep="\n")

(20640, 12)
(20640,)


In [39]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [40]:
# Shapes of the four split parts, one per line, in the order
# x_train, x_test, y_train, y_test.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)), sep="\n")

(14448, 12)
(6192, 12)
(14448,)
(6192,)


In [41]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
regressor_model = LinearRegression()
regressor_model.fit(X=x_train, y=y_train)

In [42]:
# Predict the target for the held-out rows.
y_predicted = regressor_model.predict(X=x_test)

In [43]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions against the true test targets.
score = r2_score(y_true=y_test, y_pred=y_predicted)

In [44]:
# R^2 of the linear model on the held-out test split.
score

0.6367261064275329