# Numeric Feature Cleaning

### Objective:
Clean and convert the 'Super Area' column into a consistent numeric format
by handling different units such as sqft, sqm, and sqyrd.

In [67]:
import numpy as np
import pandas as pd
df = pd.read_csv("house_price_target_cleaned.csv")

In [68]:
df["Super Area"].sample(10)

126440          NaN
135801          NaN
98723     1100 sqft
102009          NaN
71068     2100 sqft
164342    1114 sqft
158583     585 sqft
19316     1675 sqft
177068    280 sqyrd
179961          NaN
Name: Super Area, dtype: object

In [69]:
df["Super Area"].dropna().sample(20)

40859      587 sqft
132059     494 sqft
87951     1887 sqft
106709    1017 sqft
10925     1100 sqft
87936     1322 sqft
126466    1850 sqft
92537     1269 sqft
70258     1110 sqft
91415     1350 sqft
27627     1650 sqft
23754     1300 sqft
108540     680 sqft
9572      1000 sqft
97982     1551 sqft
36173     1332 sqft
113359    1495 sqft
104602    1950 sqft
19831     1500 sqft
23076     1173 sqft
Name: Super Area, dtype: object

In [70]:
def convert_super_area(value):
    if pd.isna(value):
        return np.nan
        
    value = value.lower().strip()
    value = value.replace(",", "") 
    
    if "sqft" in value:
        return float(value.replace("sqft","").strip())
    elif "sqm" in value:
        return float(value.replace("sqm","").strip()) * 10.764
    elif "sqyrd" in value:
        return float(value.replace("sqyrd","").strip()) * 9
    else:
        return np.nan
df["super_area_sqft"] = df["Super Area"].apply(convert_super_area)
    

In [71]:
df["super_area_sqft"].describe()

count    79826.00000
mean      1378.52835
std        756.39640
min          1.00000
25%       1000.00000
50%       1285.00000
75%       1700.00000
max      40000.00000
Name: super_area_sqft, dtype: float64

In [72]:
df.loc[df["super_area_sqft"]< 100 ,"super_area_sqft"] = np.nan

In [73]:
df["super_area_sqft"].describe()

count    79782.000000
mean      1379.255133
std        755.971152
min        100.000000
25%       1000.000000
50%       1285.000000
75%       1700.000000
max      40000.000000
Name: super_area_sqft, dtype: float64

In [74]:
def convert_carpet_area(value):
    if pd.isna(value):
        return np.nan
        
    value = value.lower().strip()
    value = value.replace(",", "") 
    
    if "sqft" in value:
        return float(value.replace("sqft","").strip())
    elif "sqm" in value:
        return float(value.replace("sqm","").strip()) * 10.764
    elif "sqyrd" in value:
        return float(value.replace("sqyrd","").strip()) * 9
    else:
        return np.nan
df["carpet_area_sqft"] = df["Carpet Area"].apply(convert_carpet_area)
    

In [75]:
df["carpet_area_sqft"].describe()

count    106848.000000
mean       1262.105507
std        3067.245020
min           1.000000
25%         800.000000
50%        1064.000000
75%        1500.000000
max      709222.000000
Name: carpet_area_sqft, dtype: float64

In [76]:
df.loc[df["carpet_area_sqft"] < 100, "carpet_area_sqft"] = np.nan
df.loc[df["carpet_area_sqft"] > 10000, "carpet_area_sqft"] = np.nan


In [77]:
df["carpet_area_sqft"].describe()

count    105055.000000
mean       1249.903703
std         705.590422
min         100.000000
25%         810.000000
50%        1086.000000
75%        1500.000000
max       10000.000000
Name: carpet_area_sqft, dtype: float64

In [78]:
df["Bathroom"].value_counts().head(10)

Bathroom
2       93007
3       55781
1       18654
4       15600
5        3343
6         209
> 10       35
7          35
10         14
8          14
Name: count, dtype: int64

In [79]:
df.loc[df["Bathroom"]== ">10", "Bathroom"]= np.nan

In [80]:
df["Bathroom"]= pd.to_numeric(df["Bathroom"], errors ="coerce")

In [81]:
df.loc[df["Bathroom"] >10, " Bathroom"] = np.nan

In [82]:
df["Bathroom"].describe()

count    186668.000000
mean          2.426640
std           0.863244
min           1.000000
25%           2.000000
50%           2.000000
75%           3.000000
max          10.000000
Name: Bathroom, dtype: float64

In [83]:
df["Balcony"].value_counts().head(10)

Balcony
2       51809
1       49219
3       27111
4        9420
5         841
6         132
> 10       22
7          14
10         13
8          13
Name: count, dtype: int64

In [84]:
df.loc[df["Balcony"]==">8", "Balcony"] = np.nan
df["Balcony"] = pd.to_numeric(df["Balcony"], errors="coerce")
df.loc[df["Balcony"] >8, "Balcony"] = np.nan

In [85]:
df["Balcony"].describe()

count    138559.000000
mean          1.999502
std           0.947948
min           1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max           8.000000
Name: Balcony, dtype: float64

In [86]:
df["Car Parking"].value_counts().head(20)

Car Parking
1 Covered      38754
1 Covered,     16991
2 Covered      10691
1 Open          7873
2 Covered,      3978
2 Open          2589
10 Open          846
34 Covered       573
3 Covered        408
402 Covered      318
8 Covered        206
3 Covered,       140
4 Covered        116
3 Open            37
5 Covered         35
4 Covered,        29
4 Open            27
15 Covered        24
5 Open            24
6 Covered         24
Name: count, dtype: int64

In [87]:
df["parking_count"] = pd.to_numeric(df["Car Parking"].str.split().str[0],errors="coerce")


In [88]:
df["parking_count"].describe()

count    84174.000000
mean         4.036935
std         31.822614
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max        999.000000
Name: parking_count, dtype: float64

In [89]:
df.loc[df["parking_count"] >=6, "parking_count"] = np.nan

In [90]:
df["parking_count"].describe()

count    81704.000000
mean         1.235337
std          0.466192
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          5.000000
Name: parking_count, dtype: float64

In [91]:
df["parking_type"] = df["Car Parking"].str.split().str[-1]

In [92]:
df["parking_type"].describe()

count       84174
unique          3
top       Covered
freq        51454
Name: parking_type, dtype: object

In [93]:
df.to_csv("house_prices_numeric_cleaned.csv", index=False)