In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the listings CSV into `dt`.
# NOTE(review): this is the same path insert_data_nam() writes to, so after one
# full run the file holds the processed output (no 'price' / 'price_unit'
# columns) and preprocess() can no longer run on it — confirm whether the raw
# data should be read from a separate file.
dt = pd.read_csv('../data/nam_processed_data.csv')

In [4]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Encode price, house type and status, then drop rows with missing data.

    The input frame is left untouched; a new frame is returned.

    Steps:
      * ``price_in_USD`` = ``price`` * unit multiplier * 0.012, where the
        multiplier is 10,000,000 for ``price_unit == 'Cr'`` and 100,000 for
        ``'L'`` (presumably crore / lakh and a fixed currency rate — confirm).
        Any other unit yields NaN, so the row is removed by the final dropna.
      * ``house_type``: Apartment -> 0, Independent House -> 1,
        Studio Apartment -> 2, anything else -> 3.
      * ``status``: "Ready to move" -> 0, anything else -> 1.
      * ``price_unit``, ``price``, ``locality`` and ``type`` are dropped.
      * Remaining cells equal to "Unknown" become NaN and every row that
        contains a NaN is dropped.

    NOTE(review): ``type`` and ``status`` are encoded before the "Unknown"
    replacement, so an "Unknown" (or missing) value in those two columns is
    encoded as 3 / 1 rather than causing the row to be dropped. Kept as-is to
    preserve existing results — confirm this is intended.

    Args:
        df: Frame with at least the columns ``price``, ``price_unit``,
            ``type``, ``status`` and ``locality``.

    Returns:
        A new DataFrame with the encoded columns; the original index labels of
        the surviving rows are kept.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Vectorised replacement for the former row-by-row .loc loop.
    unit_multiplier = df['price_unit'].map({'Cr': 10000000, 'L': 100000})

    house_type = (
        df['type']
        .map({'Apartment': 0, 'Independent House': 1, 'Studio Apartment': 2})
        .fillna(3)  # every other value (including missing) is category 3
        .astype(int)
    )

    # Missing status compares unequal, hence 1 — same as the old else-branch.
    status = (df['status'] != 'Ready to move').astype(int)

    encoded = df.assign(
        # Same evaluation order as before: (price * multiplier) * 0.012
        price_in_USD=df['price'] * unit_multiplier * 0.012,
        house_type=house_type,
        status=status,
    ).drop(columns=['price_unit', 'price', 'locality', 'type'])

    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    return encoded.replace('Unknown', np.nan).dropna()

def remove_std_outliers(df) -> pd.DataFrame:
    """Keep, per region, only rows whose price per area is within one std of the mean.

    For every ``region`` group the ratio ``price_in_USD / area`` is computed
    and a row is kept when ``mean - std < ratio <= mean + std``, with ``std``
    being the population standard deviation (ddof=0) of that group. A group
    with a single row therefore keeps nothing (std is 0 and the lower bound is
    exclusive).

    The input frame is not modified; the helper ratio column exists only on an
    internal copy.

    Args:
        df: Frame with ``region``, ``price_in_USD`` and ``area`` columns.

    Returns:
        A new DataFrame with the same columns as ``df``, rows ordered by
        region group and a fresh 0..n-1 index. Empty if nothing survives.
    """
    work = df.assign(price_per_sqft=df['price_in_USD'] / df['area'])

    # Collect the surviving slice of each group and concatenate once, instead
    # of growing a frame with pd.concat inside the loop.
    kept = []
    for _, group in work.groupby('region'):
        ratio = group['price_per_sqft']
        mean = ratio.mean()
        std = ratio.std(ddof=0)  # ddof=0 matches np.std's default
        kept.append(group[(ratio > (mean - std)) & (ratio <= (mean + std))])

    if kept:
        result = pd.concat(kept, ignore_index=True)
    else:
        # No groups at all (e.g. empty input): keep the schema, return no rows.
        result = work.iloc[0:0].reset_index(drop=True)

    return result.drop(columns=['price_per_sqft'])


def process_region(df, max_rare_count: int = 10) -> pd.DataFrame:
    """Strip region names and relabel rarely occurring regions as 'other'.

    Whitespace is stripped from ``region`` first, so variants such as
    ``' Airoli'`` and ``'Airoli'`` are counted together. Every region that then
    occurs ``max_rare_count`` times or fewer is replaced by the literal
    ``'other'``. Missing region values are left missing.

    The ``region`` column of ``df`` is updated in place and the same frame is
    returned.

    Args:
        df: Frame with a string ``region`` column.
        max_rare_count: Largest occurrence count that is still treated as
            rare (inclusive). Defaults to 10.

    Returns:
        ``df`` itself, with the cleaned ``region`` column.
    """
    # .str.strip() tolerates missing values, unlike calling x.strip() per row.
    region = df['region'].str.strip()

    # Occurrence count of each row's own region, aligned with the rows.
    occurrences = region.map(region.value_counts())

    # mask() only touches rows where the condition is True, so rows with a
    # missing region (count is NaN, comparison is False) stay untouched.
    df['region'] = region.mask(occurrences <= max_rare_count, 'other')
    return df


def process_age(df) -> pd.DataFrame:
    """Replace ``age`` with a binary ``new`` flag and drop the 'other' regions.

    ``new`` is 1 where ``age == "New"`` and 0 for every other value. The
    ``age`` column is removed afterwards. Rows whose ``region`` equals
    ``'other'`` are dropped as well (by index label).

    The input frame is not modified.

    Args:
        df: Frame with ``age`` and ``region`` columns.

    Returns:
        A new DataFrame without ``age``, with an int64 ``new`` column and
        without the ``'other'`` region rows; surviving index labels are kept.
    """
    flagged = df.assign(
        # Explicit int64 keeps the dtype the former per-row lambda produced.
        new=(df['age'] == 'New').astype('int64')
    ).drop(columns=['age'])

    other_labels = flagged.index[flagged['region'] == 'other']
    return flagged.drop(other_labels)


# Convert DataFrame to csv file
def insert_data_nam(df, path='../data/nam_processed_data.csv'):
    """Write ``df`` to ``path`` as CSV, without the index column.

    Args:
        df: Frame to persist.
        path: Destination file. Defaults to the processed-data file this
            notebook has always written to; an existing file is overwritten.
    """
    df.to_csv(path, index=False)

In [5]:
def apply_one_hot_encoding(df):
    """Return a copy of ``df`` with ``region`` replaced by indicator columns.

    One ``region_<value>`` column is produced per distinct region value and
    appended after the remaining columns; the original ``region`` column is
    not part of the result. The input frame is left as it was.
    """
    # Indicator columns, one per distinct region value
    region_flags = pd.get_dummies(df['region'], prefix='region')
    # Everything except the now-encoded source column
    remaining = df.drop(columns=['region'])
    # Side-by-side: original features first, indicators last
    return pd.concat([remaining, region_flags], axis=1)


In [6]:
if __name__ == '__main__':
    # Run the cleaning stages in order; each stage receives the previous
    # stage's result.
    dt = (
        dt
        .pipe(preprocess)
        .pipe(process_region)
        .pipe(remove_std_outliers)
        .pipe(process_age)
    )

    # Persist the cleaned frame, then show a quick summary of it.
    insert_data_nam(dt)
    print(dt.shape)
    dt.info()
    print(dt.head())

(45343, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45343 entries, 0 to 45342
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   bhk           45343 non-null  int64  
 1   area          45343 non-null  int64  
 2   region        45343 non-null  object 
 3   status        45343 non-null  int32  
 4   price_in_USD  45343 non-null  float64
 5   house_type    45343 non-null  int32  
 6   new           45343 non-null  int64  
dtypes: float64(1), int32(2), int64(3), object(1)
memory usage: 2.1+ MB
   bhk  area    region  status  price_in_USD  house_type  new
0    2   650  Agripada       1      314400.0           0    0
1    1   803  Agripada       0      270000.0           0    0
2    1   684  Agripada       0      204000.0           0    0
3    1   545  Agripada       0      192000.0           0    0
4    2  1650  Agripada       0      420000.0           0    0


In [7]:
# Replace the `region` column with one indicator column per region value.
# NOTE: not re-runnable on its own — the call removes `region` from the result,
# so running this cell a second time raises KeyError on df['region'].
dt = apply_one_hot_encoding(dt)

In [8]:
# Preview the first rows of `dt` after one-hot encoding.
print(dt.head())

   bhk  area  status  price_in_USD  house_type  new  region_Agripada  \
0    2   650       1      314400.0           0    0             True   
1    1   803       0      270000.0           0    0             True   
2    1   684       0      204000.0           0    0             True   
3    1   545       0      192000.0           0    0             True   
4    2  1650       0      420000.0           0    0             True   

   region_Airoli  region_Ambernath East  region_Ambernath West  ...  \
0          False                  False                  False  ...   
1          False                  False                  False  ...   
2          False                  False                  False  ...   
3          False                  False                  False  ...   
4          False                  False                  False  ...   

   region_Vangani  region_Vasai  region_Vashi  region_Vikhroli  \
0           False         False         False            False   
1       