In [176]:
import numpy as np
import pandas as pd

df = pd.read_csv("../data/dataset.csv")

In [177]:
import re


def convert_to_sqft(string: str):
    """
    Convert string to sq feet
    If the string contain a unit is not in the switcher, return NaN
    :param string: string to convert to the corresponding sq feet
    :return: sq feet as integer
    """
    # Split string into number and unit, e.g. '30Acres' -> ['', '30', 'Acres']
    # The first element is empty because the first char is not a digit
    # Regex pattern: (\d+(?:\.\d+)?) -> match a number with or without decimal point
    pattern = r'(\d+(?:\.\d+)?)'

    number = re.split(pattern, string)[1]
    unit = re.split(pattern, string)[2]

    # If no unit, return number
    if unit == '':
        return number

    # Convert unit to sq feet
    switcher = {
        'Sq. Meter': round(float(number) * 10.7639),
        'Sq. Yards': round(float(number) * 9),
        'Sq. Feet': number,
        'Acres': round(float(number) * 43560),
        'Perch': round(float(number) * 272.25),
        'Cents': round(float(number) * 435.6),
        'Guntha': round(float(number) * 1089),
        'Grounds': round(float(number) * 2400),
    }

    if unit != ' - ':
        print('[NUMBER]: ', number)
        print('[UNIT]: ', unit)
        print('[RETURN]', switcher.get(unit, 'Invalid unit'))

    return switcher.get(unit, np.nan)


# Example
print("[Sq Meter => Sq feet]: ", convert_to_sqft('1900Sq. Meter'))
print("[Sq Meter => Sq feet]: ", convert_to_sqft('361.33Sq. Yards'))
print("[Return number]: ", convert_to_sqft('1900'))
print("[Return number]: ", convert_to_sqft('1900 - 8300'))


[NUMBER]:  1900
[UNIT]:  Sq. Meter
[RETURN] 20451
[Sq Meter => Sq feet]:  20451
[NUMBER]:  361.33
[UNIT]:  Sq. Yards
[RETURN] 3252
[Sq Meter => Sq feet]:  3252
[Return number]:  1900
[Return number]:  nan


In [178]:
# Convert total_sqft to sqft
# Convert all data to corresponding float, or if not possible, NaN
df.total_sqft = df.total_sqft.map(lambda x: convert_to_sqft(x))

# Calculate percentage of NaN
print(df.isnull().sum() / len(df) * 100)

# Display DataFrame
df

[NUMBER]:  34.46
[UNIT]:  Sq. Meter
[RETURN] 371
[NUMBER]:  4125
[UNIT]:  Perch
[RETURN] 1123031
[NUMBER]:  1000
[UNIT]:  Sq. Meter
[RETURN] 10764
[NUMBER]:  1100
[UNIT]:  Sq. Yards
[RETURN] 9900
[NUMBER]:  5.31
[UNIT]:  Acres
[RETURN] 231304
[NUMBER]:  30
[UNIT]:  Acres
[RETURN] 1306800
[NUMBER]:  716
[UNIT]:  Sq. Meter
[RETURN] 7707
[NUMBER]:  1500
[UNIT]:  Sq. Meter
[RETURN] 16146
[NUMBER]:  142.61
[UNIT]:  Sq. Meter
[RETURN] 1535
[NUMBER]:  1574
[UNIT]:  Sq. Yards
[RETURN] 14166
[NUMBER]:  361.33
[UNIT]:  Sq. Yards
[RETURN] 3252
[NUMBER]:  117
[UNIT]:  Sq. Yards
[RETURN] 1053
[NUMBER]:  3040
[UNIT]:  Sq. Meter
[RETURN] 32722
[NUMBER]:  500
[UNIT]:  Sq. Yards
[RETURN] 4500
[NUMBER]:  167
[UNIT]:  Sq. Meter
[RETURN] 1798
[NUMBER]:  315
[UNIT]:  Sq. Yards
[RETURN] 2835
[NUMBER]:  3
[UNIT]:  Cents
[RETURN] 1307
[NUMBER]:  188.89
[UNIT]:  Sq. Yards
[RETURN] 1700
[NUMBER]:  204
[UNIT]:  Sq. Meter
[RETURN] 2196
[NUMBER]:  45
[UNIT]:  Sq. Yards
[RETURN] 405
[NUMBER]:  133.3
[UNIT]:  Sq. Ya

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00
