In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**import neccessory libraries**

In [2]:
import pandas as pd
import numpy as np


**Load the Dataset**

In [3]:
file_path = "/content/drive/MyDrive/House_Price_Prediction/data/raw/housing.csv"
df = pd.read_csv(file_path)


In [4]:
df.head()  # Show first 5 rows


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


**Handle Missing Values**

In [5]:
df.isnull().sum()


Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,207
population,0
households,0
median_income,0
median_house_value,0
ocean_proximity,0


**Filling missing values**

In [6]:
df["total_bedrooms"].fillna(df["total_bedrooms"].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["total_bedrooms"].fillna(df["total_bedrooms"].median(), inplace=True)


**Convert Categorical Data to Numeric**

In [7]:
df["ocean_proximity"].unique()


array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [8]:
# One-Hot Encoding
df = pd.get_dummies(df, columns=["ocean_proximity"], drop_first=True)


**Feature Scaling (Standardization)**

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_features = ["median_income", "total_rooms", "housing_median_age"]
df[numerical_features] = scaler.fit_transform(df[numerical_features])


**Save the Processed Data**

In [10]:
import os

# Define the processed data folder path
processed_folder = "/content/drive/MyDrive/House_Price_Prediction/data/processed"

# Create the folder if it doesn't exist
os.makedirs(processed_folder, exist_ok=True)

# Save the processed data
processed_path = os.path.join(processed_folder, "housing_processed.csv")
df.to_csv(processed_path, index=False)

print("Processed data saved successfully!")


Processed data saved successfully!


In [11]:
import pandas as pd

# Load the processed data from your saved location
df_processed = pd.read_csv("/content/drive/MyDrive/House_Price_Prediction/data/processed/housing_processed.csv")

# Display the first few rows
df_processed.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,0.982143,-0.804819,129.0,322.0,126.0,2.344766,452600.0,False,False,True,False
1,-122.22,37.86,-0.607019,2.04589,1106.0,2401.0,1138.0,2.332238,358500.0,False,False,True,False
2,-122.24,37.85,1.856182,-0.535746,190.0,496.0,177.0,1.782699,352100.0,False,False,True,False
3,-122.25,37.85,1.856182,-0.624215,235.0,558.0,219.0,0.932968,341300.0,False,False,True,False
4,-122.25,37.85,1.856182,-0.462404,280.0,565.0,259.0,-0.012881,342200.0,False,False,True,False
