# House Price Prediction - Regression Models

In [16]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

Load the Cleaned Dataset

In [17]:
df = pd.read_csv("house_prediction_cleaned.csv")

print("Dataset loaded successfully!")
print("Shape:", df.shape)
print(df.head(), "\n")

Dataset loaded successfully!
Shape: (505, 504)
   0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01096  55.00   2.250  0  0.3890  6.4530  31.90  7.3073   1  300.0  15.30 394.72   8.23  22.00  \
0                                              False                                                                                                                                                  
1                                              False                                                                                                                                                  
2                                              False                                                                                                                                                  
3                                              False                                                                                                         

 Define Features (X) and Target (y)

In [18]:
import pandas as pd

# Load the dataset (space separated, no headers yet)
df = pd.read_csv("house_prediction_cleaned.csv", 
                 delim_whitespace=True, header=None)


  df = pd.read_csv("house_prediction_cleaned.csv",
  df = pd.read_csv("house_prediction_cleaned.csv",


In [19]:
print(df.shape)      # should be (506, 14)
print(df.head())     # preview first rows


(506, 14058)
                                               0      1      2      3      \
0                                            0.00632   18.0   2.31    0.0   
1  False,False,False,False,False,False,False,Fals...    NaN    NaN    NaN   
2  False,False,False,False,False,False,False,Fals...    NaN    NaN    NaN   
3  False,False,False,False,False,False,False,Fals...    NaN    NaN    NaN   
4  False,False,False,False,False,False,False,Fals...    NaN    NaN    NaN   

   4      5      6      7      8      9      ...  14048  14049  14050   14051  \
0  0.538  6.575   65.2   4.09    1.0  296.0  ...  0.671  6.968   91.9  1.4165   
1    NaN    NaN    NaN    NaN    NaN    NaN  ...    NaN    NaN    NaN     NaN   
2    NaN    NaN    NaN    NaN    NaN    NaN  ...    NaN    NaN    NaN     NaN   
3    NaN    NaN    NaN    NaN    NaN    NaN  ...    NaN    NaN    NaN     NaN   
4    NaN    NaN    NaN    NaN    NaN    NaN  ...    NaN    NaN    NaN     NaN   

   14052  14053  14054  14055  14056 

In [20]:
import pandas as pd

# Boston Housing standard column names
column_names = [
    "CRIM",    # per capita crime rate
    "ZN",      # proportion of residential land zoned for lots
    "INDUS",   # proportion of non-retail business acres
    "CHAS",    # Charles River dummy variable
    "NOX",     # nitric oxides concentration
    "RM",      # average number of rooms per dwelling
    "AGE",     # proportion of owner-occupied units built prior to 1940
    "DIS",     # weighted distances to employment centers
    "RAD",     # index of accessibility to radial highways
    "TAX",     # property tax rate
    "PTRATIO", # pupil-teacher ratio
    "B",       # proportion of Black population
    "LSTAT",   # % lower status of the population
    "MEDV"     # median value of homes (TARGET)
]

# Try reading with common delimiters
df = pd.read_csv("house_prediction_cleaned.csv", header=None, delim_whitespace=True, names=column_names)

print("✅ Dataset loaded")
print(df.head())
print("Shape:", df.shape)



  df = pd.read_csv("house_prediction_cleaned.csv", header=None, delim_whitespace=True, names=column_names)
  df = pd.read_csv("house_prediction_cleaned.csv", header=None, delim_whitespace=True, names=column_names)


✅ Dataset loaded
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [21]:
df = df.dropna()

In [22]:
X = df.drop(columns=["MEDV"])
y = df["MEDV"]

In [24]:
X = X.dropna()
y = y.loc[X.index]

In [27]:
# Force all columns to numeric, convert errors to NaN
df = df.apply(pd.to_numeric, errors="coerce")

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ValueError: With n_samples=1, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

3. Split Dataset into Training and Testing Sets

Train a Linear Regression Model

In [None]:
print(df.dtypes)


CRIM        object
ZN         float64
INDUS      float64
CHAS       float64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD        float64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object


In [29]:
import pandas as pd

# Reload
df = pd.read_csv("house_prediction_cleaned.csv")

# Convert all to numeric (bad strings → NaN)
df = df.apply(pd.to_numeric, errors="coerce")

# Drop rows with NaN
df = df.dropna()

print("Dataset shape after cleaning:", df.shape)


Dataset shape after cleaning: (505, 504)


In [30]:
# Try reading with correct delimiter (sometimes it's ; or space)
df = pd.read_csv("house_prediction_cleaned.csv", delimiter=",", header=None)

print(df.shape)
print(df.head())

(506, 504)
                                                 0    \
0   0.00632  18.00   2.310  0  0.5380  6.5750  65...   
1                                              False   
2                                              False   
3                                              False   
4                                              False   

                                                 1    \
0   0.00632  18.00   2.310  0  0.5380  6.5750  65...   
1                                              False   
2                                              False   
3                                              False   
4                                              False   

                                                 2    \
0   0.00632  18.00   2.310  0  0.5380  6.5750  65...   
1                                              False   
2                                              False   
3                                              False   
4                                  

In [32]:
df = pd.read_csv("house_prediction_cleaned.csv", header=None)
print(df.shape)
print(df.head(3))


(506, 504)
                                                 0    \
0   0.00632  18.00   2.310  0  0.5380  6.5750  65...   
1                                              False   
2                                              False   

                                                 1    \
0   0.00632  18.00   2.310  0  0.5380  6.5750  65...   
1                                              False   
2                                              False   

                                                 2    \
0   0.00632  18.00   2.310  0  0.5380  6.5750  65...   
1                                              False   
2                                              False   

                                                 3    \
0   0.00632  18.00   2.310  0  0.5380  6.5750  65...   
1                                              False   
2                                              False   

                                                 4    \
0   0.00632  18.00   2.310  0  0.

In [33]:
df = pd.read_csv("house_prediction_cleaned.csv", delim_whitespace=True, header=None)
print(df.shape)
print(df.head(3))

  df = pd.read_csv("house_prediction_cleaned.csv", delim_whitespace=True, header=None)
  df = pd.read_csv("house_prediction_cleaned.csv", delim_whitespace=True, header=None)


(506, 14058)
                                               0      1      2      3      \
0                                            0.00632   18.0   2.31    0.0   
1  False,False,False,False,False,False,False,Fals...    NaN    NaN    NaN   
2  False,False,False,False,False,False,False,Fals...    NaN    NaN    NaN   

   4      5      6      7      8      9      ...  14048  14049  14050   14051  \
0  0.538  6.575   65.2   4.09    1.0  296.0  ...  0.671  6.968   91.9  1.4165   
1    NaN    NaN    NaN    NaN    NaN    NaN  ...    NaN    NaN    NaN     NaN   
2    NaN    NaN    NaN    NaN    NaN    NaN  ...    NaN    NaN    NaN     NaN   

   14052  14053  14054  14055  14056  14057  
0   24.0  666.0   20.2  396.9  17.21   10.4  
1    NaN    NaN    NaN    NaN    NaN    NaN  
2    NaN    NaN    NaN    NaN    NaN    NaN  

[3 rows x 14058 columns]


In [34]:
import pandas as pd

# Try reading as raw text
with open("house_prediction_cleaned.csv", "r") as f:
    lines = f.readlines()

# Each row might be separated by spaces OR underscores.
# Replace underscores with spaces and split properly
cleaned = [line.replace("_", " ").split() for line in lines]

# Convert to DataFrame
df = pd.DataFrame(cleaned)

print(df.shape)
print(df.head())


(506, 14112)
                                               0      1      2     3      \
0                                            0.00632  18.00  2.310     0   
1  False,False,False,False,False,False,False,Fals...   None   None  None   
2  False,False,False,False,False,False,False,Fals...   None   None  None   
3  False,False,False,False,False,False,False,Fals...   None   None  None   
4  False,False,False,False,False,False,False,Fals...   None   None  None   

    4       5      6       7     8      9      ...   14102   14103  14104  \
0  0.5380  6.5750  65.20  4.0900     1  296.0  ...  0.6710  6.9680  91.90   
1    None    None   None    None  None   None  ...    None    None   None   
2    None    None   None    None  None   None  ...    None    None   None   
3    None    None   None    None  None   None  ...    None    None   None   
4    None    None   None    None  None   None  ...    None    None   None   

    14105 14106  14107  14108   14109  14110  14111  
0  1.4165    

In [36]:
with open("house_prediction_cleaned.csv", "r") as f:
    lines = f.readlines()

print("Sample lines:", lines[:5])


Sample lines: [' 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01096  55.00   2.250  0  0.3890  6.4530  31.90  7.3073   1  300.0  15.30 394.72   8.23  22.00, 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01301  35.00   1.520  0  0.4420  7.2410  49.30  7.0379   1  284.0  15.50 394.74   5.49  32.70, 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01311  90.00   1.220  0  0.4030  7.2490  21.90  8.6966   5  226.0  17.90 395.93   4.81  35.40, 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01360  75.00   4.000  0  0.4100  5.8880  47.60  7.3197   3  469.0  21.10 396.90  14.80  18.90, 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01381  80.00   0.460  0  0.4220  7.8750  32.00  5.6484   4  255.0  14.40 394.23   2.97  50.00, 0.00632  18.0

In [37]:
cleaned = [line.replace("_", " ") for line in lines]


In [38]:
rows = [row.split() for row in cleaned]
df = pd.DataFrame(rows)
print(df.shape)
print(df.head())

(506, 14112)
                                               0      1      2     3      \
0                                            0.00632  18.00  2.310     0   
1  False,False,False,False,False,False,False,Fals...   None   None  None   
2  False,False,False,False,False,False,False,Fals...   None   None  None   
3  False,False,False,False,False,False,False,Fals...   None   None  None   
4  False,False,False,False,False,False,False,Fals...   None   None  None   

    4       5      6       7     8      9      ...   14102   14103  14104  \
0  0.5380  6.5750  65.20  4.0900     1  296.0  ...  0.6710  6.9680  91.90   
1    None    None   None    None  None   None  ...    None    None   None   
2    None    None   None    None  None   None  ...    None    None   None   
3    None    None   None    None  None   None  ...    None    None   None   
4    None    None   None    None  None   None  ...    None    None   None   

    14105 14106  14107  14108   14109  14110  14111  
0  1.4165    

In [40]:
# Step 1: Read the raw file without splitting
with open("house_prediction_cleaned.csv", "r") as f:
    for i in range(5):   # just show first 5 lines
        print(f.readline())


 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01096  55.00   2.250  0  0.3890  6.4530  31.90  7.3073   1  300.0  15.30 394.72   8.23  22.00, 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01301  35.00   1.520  0  0.4420  7.2410  49.30  7.0379   1  284.0  15.50 394.74   5.49  32.70, 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01311  90.00   1.220  0  0.4030  7.2490  21.90  8.6966   5  226.0  17.90 395.93   4.81  35.40, 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01360  75.00   4.000  0  0.4100  5.8880  47.60  7.3197   3  469.0  21.10 396.90  14.80  18.90, 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01381  80.00   0.460  0  0.4220  7.8750  32.00  5.6484   4  255.0  14.40 394.23   2.97  50.00, 0.00632  18.00   2.310  0  0.

In [41]:
df = pd.read_csv("house_prediction_cleaned.csv", delimiter="_", header=None)

In [42]:
df = pd.read_csv("house_prediction_cleaned.csv", delim_whitespace=True, header=None)

  df = pd.read_csv("house_prediction_cleaned.csv", delim_whitespace=True, header=None)
  df = pd.read_csv("house_prediction_cleaned.csv", delim_whitespace=True, header=None)


In [44]:
import pandas as pd

rows = []
with open("house_prediction_cleaned.csv", "r") as f:
    for line in f:
        # Clean and split the line
        values = line.strip().replace("  ", " ").replace("_", " ").split()
        rows.append(values)

# Convert to DataFrame
df = pd.DataFrame(rows, dtype=float)

# Assign correct Boston Housing column names
df.columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX",
    "RM", "AGE", "DIS", "RAD", "TAX",
    "PTRATIO", "B", "LSTAT", "MEDV"
]


ValueError: could not convert string to float: 'False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False'

In [45]:
df = df.apply(pd.to_numeric, errors="coerce")   # convert all to numbers
df = df.dropna(axis=1, how="all")               # drop useless columns
df = df.dropna(axis=0)                          # drop bad rows
df = df.iloc[:, :14]                            # keep first 14 cols


In [46]:
# 5. Rename columns to match Boston Housing dataset
df.columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX",
    "RM", "AGE", "DIS", "RAD", "TAX",
    "PTRATIO", "B", "LSTAT", "MEDV"
]

print("Dataset shape after cleaning:", df.shape)
print(df.head())

Dataset shape after cleaning: (1, 14)
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE   DIS  RAD    TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.09  1.0  296.0     15.3   

       B  LSTAT     MEDV  
0  396.9   4.98  0.01096  


In [3]:
X = df.drop(columns=['MEDV'])  # Features
y = df['MEDV']                 # Target

NameError: name 'df' is not defined

In [2]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


NameError: name 'train_test_split' is not defined

In [4]:
import pandas as pd

# Load the cleaned dataset
file_path = r"C:\Users\user\Desktop\level1_task2_data_cleaning_and_Preprocessing\house_prediction_cleaned.csv"
df = pd.read_csv(file_path)

# Display first few rows
print(df.head())

# Check dataset shape
print("\nDataset shape:", df.shape)


   0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01096  55.00   2.250  0  0.3890  6.4530  31.90  7.3073   1  300.0  15.30 394.72   8.23  22.00  \
0                                              False                                                                                                                                                  
1                                              False                                                                                                                                                  
2                                              False                                                                                                                                                  
3                                              False                                                                                                                                                  
4    

In [5]:
X = df.drop(columns=['MEDV'])  # Features
y = df['MEDV']                 # Target


KeyError: "['MEDV'] not found in axis"

In [6]:
print(df.columns.tolist())


[' 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01096  55.00   2.250  0  0.3890  6.4530  31.90  7.3073   1  300.0  15.30 394.72   8.23  22.00', ' 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01301  35.00   1.520  0  0.4420  7.2410  49.30  7.0379   1  284.0  15.50 394.74   5.49  32.70', ' 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01311  90.00   1.220  0  0.4030  7.2490  21.90  8.6966   5  226.0  17.90 395.93   4.81  35.40', ' 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01360  75.00   4.000  0  0.4100  5.8880  47.60  7.3197   3  469.0  21.10 396.90  14.80  18.90', ' 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01381  80.00   0.460  0  0.4220  7.8750  32.00  5.6484   4  255.0  14.40 394.23   2.97  50.00', ' 0.00632  18.