In [10]:
# Import core libraries
import pandas as pd
import numpy as np

# Load training dataset
train = pd.read_csv('../data/train.csv')

# Calculate missing values
missing = train.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)

# Create a summary DataFrame with missing info
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing Ratio (%)': (missing / len(train)) * 100,
    'Data Type': train[missing.index].dtypes
})

# Display the result
missing_df

Unnamed: 0,Missing Count,Missing Ratio (%),Data Type
PoolQC,1453,99.520548,object
MiscFeature,1406,96.30137,object
Alley,1369,93.767123,object
Fence,1179,80.753425,object
MasVnrType,872,59.726027,object
FireplaceQu,690,47.260274,object
LotFrontage,259,17.739726,float64
GarageType,81,5.547945,object
GarageYrBlt,81,5.547945,float64
GarageFinish,81,5.547945,object


In [11]:
# Missing Value Handling
# Drop columns with more than 15% missing values
high_missing = missing_df[missing_df['Missing Ratio (%)'] > 15].index
train = train.drop(columns=high_missing)

# Fill numerical features with median
num_cols = train.select_dtypes(include=[np.number]).columns
train[num_cols] = train[num_cols].fillna(train[num_cols].median())

# Fill categorical features with mode
cat_cols = train.select_dtypes(include=['object']).columns
train[cat_cols] = train[cat_cols].fillna(train[cat_cols].mode().iloc[0])


In [13]:
# Combine train and test to ensure consistent encoding
all_data = pd.concat([train, test], keys=['train', 'test'])

# Encode categorical variables using Label Encoding
from sklearn.preprocessing import LabelEncoder

# Only apply to object (string) type columns
for col in all_data.select_dtypes(include=['object']).columns:
    lbl = LabelEncoder()
    all_data[col] = lbl.fit_transform(all_data[col].astype(str))

# separate train and test again
train = all_data.xs('train')
test = all_data.xs('test')

train.head()       
train.info()     
train.describe()  


<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   int64  
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   int64  
 5   LotShape       1460 non-null   int64  
 6   LandContour    1460 non-null   int64  
 7   Utilities      1460 non-null   int64  
 8   LotConfig      1460 non-null   int64  
 9   LandSlope      1460 non-null   int64  
 10  Neighborhood   1460 non-null   int64  
 11  Condition1     1460 non-null   int64  
 12  Condition2     1460 non-null   int64  
 13  BldgType       1460 non-null   int64  
 14  HouseStyle     1460 non-null   int64  
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemodAdd 

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,SaleType,SaleCondition,SalePrice,LotFrontage,Alley,MasVnrType,FireplaceQu,PoolQC,Fence,MiscFeature
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,730.5,56.89726,9.028767,10516.828082,2.99589,5.942466,6.777397,2.000685,8.019178,3.062329,...,17.513014,9.770548,180921.19589,,,,,,,
std,421.610009,42.300571,0.632017,9981.264932,0.063996,1.409156,0.707666,0.026171,1.622634,0.276232,...,1.5521,1.100854,79442.502883,,,,,,,
min,1.0,20.0,6.0,1300.0,2.0,4.0,4.0,2.0,5.0,3.0,...,10.0,6.0,34900.0,,,,,,,
25%,365.75,20.0,9.0,7553.5,3.0,4.0,7.0,2.0,7.0,3.0,...,18.0,10.0,129975.0,,,,,,,
50%,730.5,50.0,9.0,9478.5,3.0,7.0,7.0,2.0,9.0,3.0,...,18.0,10.0,163000.0,,,,,,,
75%,1095.25,70.0,9.0,11601.5,3.0,7.0,7.0,2.0,9.0,3.0,...,18.0,10.0,214000.0,,,,,,,
max,1460.0,190.0,10.0,215245.0,3.0,7.0,7.0,3.0,9.0,5.0,...,18.0,11.0,755000.0,,,,,,,
