In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [51]:
# Load the housing dataset into a DataFrame; "housing.csv" is a relative path,
# so the file must sit in the notebook's working directory.
data = pd.read_csv("housing.csv")

In [None]:
# Show the loaded frame (the notebook renders the last expression of a cell).
data

In [None]:
# Print column names, non-null counts and dtypes to spot columns with missing values.
data.info()


In [54]:
# info() reports 20640 non-null values in every column except total_bedrooms,
# so only a few rows are incomplete and we simply drop them.
# pandas represents missing values as NaN (Not a Number); dropna() removes
# every row that contains at least one NaN.
# The result is rebound to `data` instead of using inplace=True: in-place
# mutation has no performance benefit and makes cell re-runs harder to reason about.
data = data.dropna()



In [None]:
# Re-check the summary after dropping rows: every column should now report the same non-null count.
data.info()

In [56]:
from sklearn.model_selection import train_test_split

# Target variable: the value the model will learn to predict.
y = data['median_house_value']
# Feature frame: every column of `data` except the target.
x = data.drop(columns=['median_house_value'])

In [None]:
# Show the target Series (median_house_value) selected above.
y


In [58]:
# Hold out 20% of the rows as a test set that stays untouched until the final
# evaluation; the remaining 80% is used for exploration and training.
# random_state pins the shuffle so the split, and every number computed from it,
# is identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [59]:
# Re-attach the target to the training features (index-aligned left join) so the
# exploration cells can look at features and target in one frame.
train_data = x_train.join(y_train, how="left")

In [None]:
# Show the combined training frame (features joined with median_house_value).
train_data

In [None]:
# Plot one histogram per numeric column of the training frame on a 15x8 inch figure.
train_data.hist(figsize=(15,8))

In [62]:
train_data['total_rooms'] = np.log(train_data['total_rooms'])+1
train_data['total_bedrooms'] = np.log(train_data['total_bedrooms'])+1
train_data['population'] = np.log(train_data['population'])+1
train_data['households'] = np.log(train_data['households'])+1

In [None]:
train_data.hist(figsize=(15,8))

In [None]:
train_data.ocean_proximity.value_counts() #converting to numerical values

In [65]:
train_data=train_data.join(pd.get_dummies(train_data.ocean_proximity)).drop(['ocean_proximity'],axis = 1)

In [None]:
plt.figure(figsize=(15,9))
sns.scatterplot(x="latitude",y="longitude",data=train_data,hue="median_house_value",palette ="coolwarm" )

In [67]:
train_data['bedroom_ratio'] = train_data['total_bedrooms']/train_data['total_rooms']
train_data['household_rooms'] = train_data['total_rooms']/train_data['households']

In [None]:
plt.figure(figsize=(15,8))
sns.heatmap(train_data.corr(),annot=True,cmap='YlGnBu')

In [None]:
from sklearn.linear_model import LinearRegression

x_train,y_train = train_data.drop(['median_house_value'],axis=1),train_data['median_house_value']

reg = LinearRegression()

reg.fit(x_train,y_train)

In [73]:
test_data = x_test.join(y_test)

test_data['total_rooms'] = np.log(test_data['total_rooms'])+1
test_data['total_bedrooms'] = np.log(test_data['total_bedrooms'])+1
test_data['population'] = np.log(test_data['population'])+1
test_data['households'] = np.log(test_data['households'])+1

test_data=test_data.join(pd.get_dummies(test_data.ocean_proximity)).drop(['ocean_proximity'],axis = 1)

test_data['bedroom_ratio'] = test_data['total_bedrooms']/test_data['total_rooms']
test_data['household_rooms'] = test_data['total_rooms']/test_data['households']

x_test,y_test = test_data.drop(['median_house_value'],axis=1),test_data['median_house_value']

In [None]:
# Show the test frame after the same feature engineering that was applied to the training data.
test_data

In [None]:
# Evaluate the fitted regression on the held-out test set; score() reports R^2.
reg.score(X=x_test, y=y_test)