# Final analysis — House Prices (EDA & first checks)

**Goal:** Load the House Prices training dataset, inspect its shape and missing values, and save a small sample for the repo.


In [9]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Render generated plots inline in the notebook's output cells instead of in separate pop-up windows.
%matplotlib inline           

In [17]:
# Locate the project root. A kernel launched from inside a "notebooks" folder
# sits one level below the root; any other starting directory is taken to be
# the root itself.
cwd = os.getcwd()
PROJECT_ROOT = (
    os.path.abspath(os.path.join(cwd, ".."))
    if os.path.basename(cwd).lower() == "notebooks"
    else cwd
)

# Dataset locations, anchored to the project root.
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")

# Run the kernel from the project root so relative paths resolve from there.
os.chdir(PROJECT_ROOT)

# Report what was resolved so a wrong starting directory is easy to spot.
print("Old kernel working dir:", cwd)
print("Project root set to:", PROJECT_ROOT)
print("New kernel working dir (os.getcwd()):", os.getcwd())
print("Looking for dataset at:", TRAIN_CSV)

Old kernel working dir: C:\Users\pc\OneDrive\Documents\Codes\Python codes\Machine Learning\Projects\House Price Prediction\house-prices-ml
Project root set to: C:\Users\pc\OneDrive\Documents\Codes\Python codes\Machine Learning\Projects\House Price Prediction\house-prices-ml
New kernel working dir (os.getcwd()): C:\Users\pc\OneDrive\Documents\Codes\Python codes\Machine Learning\Projects\House Price Prediction\house-prices-ml
Looking for dataset at: C:\Users\pc\OneDrive\Documents\Codes\Python codes\Machine Learning\Projects\House Price Prediction\house-prices-ml\data\train.csv


In [37]:
# Read the training CSV into memory and report its (rows, columns) shape.
df = pd.read_csv(TRAIN_CSV)
print("Loaded dataset with shape:", df.shape)

# Preview the first five rows as the cell's rich output.
df.head(5)

Loaded dataset with shape: (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [38]:
print("Dataframe info:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in display() only appends a stray "None" to the cell output.
df.info()

Dataframe info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64

None

In [43]:
print("Missing values (descending):")

# df.isna() yields a boolean frame shaped like df (True where a value is
# missing); summing it column-wise gives the number of missing entries per
# column, which is then ordered from most to fewest.
missing = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

# Show only columns that actually have gaps, capped at the top 30.
display(missing.loc[missing.gt(0)].head(30))

Missing values (descending):


PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageQual        81
GarageFinish      81
GarageType        81
GarageYrBlt       81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtCond          37
BsmtQual          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64

In [41]:
# Write a 200-row sample next to the full dataset, but only once: an existing
# sample file is left untouched so re-running the cell does not rewrite it.
SAMPLE_PATH = os.path.join(DATA_DIR, "train_sample.csv")
if not os.path.exists(SAMPLE_PATH):
    # Fixed random_state keeps the sampled rows the same on every fresh run.
    df.sample(n=200, random_state=42).to_csv(SAMPLE_PATH, index=False)
    print("Saved sample csv to", SAMPLE_PATH)
else:
    print("Sample already exists at", SAMPLE_PATH)

Sample already exists at C:\Users\pc\OneDrive\Documents\Codes\Python codes\Machine Learning\Projects\House Price Prediction\house-prices-ml\data\train_sample.csv


In [42]:
# Summarise the target column only when it is present in the loaded frame.
if "SalePrice" in df:
    print("\nSalePrice summary:")
    display(df.loc[:, "SalePrice"].describe())


SalePrice summary:


count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64