**Loading the Dataset**


In [None]:
import pandas as pd

# Path of the CSV file that holds the housing records.
file_path = '/content/sample_data/Housing.csv'

# Parse the CSV into a DataFrame; the following cells all operate on `data`.
data = pd.read_csv(filepath_or_buffer=file_path)

**Inspecting the Dataset**

In [None]:
# Summarise the frame: index range, column names, non-null counts, dtypes and memory use.
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line to the output.
data.info()

# Display the first few rows of the dataset
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None
      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no    

**Handling Missing Values**

In [None]:
# Count the null entries in every column and show the per-column totals.
missing_per_column = data.isna().sum()
print(missing_per_column)

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


**Data Preprocessing**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Column groups used throughout preprocessing ('price' is the target and is left out).
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
categorical_features = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]

# Numerical columns: impute any gaps with the per-column median.
numeric_medians = data[numerical_features].median()
data[numerical_features] = data[numerical_features].fillna(numeric_medians)

# Categorical columns: impute any gaps with the first most-frequent value of each column.
for feature in categorical_features:
    most_frequent = data[feature].mode()[0]
    data[feature] = data[feature].fillna(most_frequent)

# Rescale the numerical columns with min-max scaling (default output range is [0, 1]).
# NOTE(review): the scaler is fitted on every row of `data`, before the train/test
# split that happens later in the notebook, so test-set minima/maxima influence the
# training features. Consider fitting on the training split only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[numerical_features])
data[numerical_features] = scaled_values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes, using a fresh encoder per column.
# NOTE(review): LabelEncoder numbers classes in sorted order, so a multi-level column
# such as 'furnishingstatus' receives an alphabetical ordering. Confirm that this is
# acceptable for the downstream model.
for feature in categorical_features:
    le = LabelEncoder().fit(data[feature])
    data[feature] = le.transform(data[feature])

**Feature Engineering**

In [None]:
# Example of creating an interaction feature (e.g., area * number of bedrooms)
data['area_bedrooms'] = data['area'] * data['bedrooms']

# Another example: ratio of bathrooms to bedrooms
data['bath_to_bed_ratio'] = data['bathrooms'] / (data['bedrooms'] + 1)


**Model Selection**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split

# Target is 'price'; every remaining column is used as a predictor.
y = data['price']
X = data.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# 5-fold cross-validation on the training portion, scored with the estimator's
# default scorer.
cv_scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=5,
)
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean CV score: {cv_scores.mean()}')

Cross-validation scores: [0.66433155 0.69344249 0.63137217 0.62561539 0.62971379]
Mean CV score: 0.6488950785554263


**Model Evaluation**

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the fitted model on the held-out test rows.
y_pred = model.predict(X_test)

# Compare predictions with the true test targets using three regression metrics.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the metrics, one per line.
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Absolute Error: 976604.2927493358
Mean Squared Error: 1788063948772.778
R-squared: 0.6462480761721474
