# In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

# Data cleaning and preprocessing functions
def parse_amount(amount_str):
    """Convert a textual amount such as ``"45 Lac"`` or ``"1.5 Cr"`` to a float.

    The input is stringified, lower-cased and stripped of spaces. If it
    contains ``lac`` the remaining number is multiplied by 100000; otherwise,
    if it contains ``cr`` it is multiplied by 10000000; otherwise the text is
    parsed as a plain number.

    Returns:
        The parsed amount as a float, or ``np.nan`` when the numeric part
        cannot be parsed.
    """
    text = str(amount_str).strip().lower().replace(" ", "")

    # Checked in order: 'lac' takes precedence over 'cr' when both appear.
    for suffix, multiplier in (('lac', 100000), ('cr', 10000000)):
        if suffix in text:
            text = text.replace(suffix, '')
            break
    else:
        multiplier = 1

    try:
        return float(text) * multiplier
    except ValueError:
        return np.nan

# Multipliers that convert one unit of the keyed area measure to square feet.
# NOTE(review): 20 marla at 272.25 sqft would be 5445 sqft, whereas 'kanal' is
# 4500 here (which matches a 225 sqft marla) - confirm the intended regional
# definitions before relying on kanal/marla conversions.
CONVERSION_FACTORS = {
    'sqft': 1,
    'sqyrd': 9,
    'sqm': 10.764,
    'kanal': 4500,
    'acre': 43560,
    'marla': 272.25,
    'hectare': 107639.104
}

# Leading number (optional decimal part) followed by a unit made of letters
# and dots, e.g. "1200 sqft" or "100 sq.m".
_AREA_PATTERN = re.compile(r'(\d+\.?\d*)\s*([a-z.]+)')


def convert_area_to_sqft(area_str):
    """Convert an area string such as ``"1200 sqft"`` to square feet.

    The text is stripped and lower-cased, then split into a leading number
    and a unit. Dots are removed from the unit before it is looked up in
    ``CONVERSION_FACTORS``. A string with no recognisable unit is parsed as a
    plain number and returned unchanged.

    Args:
        area_str: The raw area text.

    Returns:
        The area in square feet as a float, or ``np.nan`` when the input is
        missing, not a string, blank, uses an unknown unit, or cannot be
        parsed.
    """
    if pd.isna(area_str) or not isinstance(area_str, str) or not area_str.strip():
        return np.nan

    area_str_clean = area_str.strip().lower()
    match = _AREA_PATTERN.match(area_str_clean)
    if match is None:
        # No "<number><unit>" shape: accept a bare number as-is.
        try:
            return float(area_str_clean)
        except ValueError:
            return np.nan

    unit = match.group(2).replace('.', '')
    factor = CONVERSION_FACTORS.get(unit)
    if factor is None:
        return np.nan
    return float(match.group(1)) * factor

def parse_parking_details(parking_str):
    """Extract the parking type from text such as ``"1 Covered"`` or ``"2 Open"``.

    Commas are removed and the text is lower-cased before searching for a
    count followed by ``covered`` or ``open``.

    Args:
        parking_str: The raw parking description.

    Returns:
        ``'Covered'`` or ``'Open'`` when such a pattern is found, otherwise
        ``'Not Available'`` (including for missing, non-string or blank input).
    """
    default_type = 'Not Available'
    if pd.isna(parking_str) or not isinstance(parking_str, str) or not parking_str.strip():
        return default_type

    text = parking_str.strip().lower().replace(',', '')
    # A count must precede the type; a literal "not available" has no count
    # and therefore falls through to the default.
    match = re.search(r'(\d+)\s*(covered|open)', text)
    if match:
        return match.group(2).capitalize()
    return default_type

# Numeric level assigned to a named floor in "<floor> out of <total>" text.
_NAMED_LEVELS_OUT_OF = {
    'ground': 0.0,
    'upper basement': -1.0,
    'lower basement': -2.0,
}

# "<floor> out of <total>", where <floor> is a number or one of the names above.
_FLOOR_OUT_OF_PATTERN = re.compile(
    r'(\d+|ground|upper basement|lower basement)\s*out of\s*(\d+)'
)

# Stand-alone labels (no total given) and the level each one maps to.
# 'lower basement' is -2.0 so that it agrees with the "out of" form.
_STANDALONE_LEVELS = {
    'ground floor': 0.0, 'ground': 0.0, 'g': 0.0,
    'first floor': 1.0, 'first': 1.0, '1st': 1.0,
    'second floor': 2.0, 'second': 2.0, '2nd': 2.0,
    'basement': -1.0, 'b': -1.0, 'upper basement': -1.0,
    'lower basement': -2.0,
}

# Labels that are recognised but carry no usable numeric level.
_UNRANKED_LABELS = frozenset(
    ['top floor', 'penthouse', 'multiple floors', 'duplex', 'triplex']
)


def parse_floor_info(floor_str):
    """Parse a floor description into ``(flat_floor, total_floors)``.

    Recognised forms, checked in this order after stripping and lower-casing:

    * ``"<n> out of <m>"``, and the same with ``ground`` (0), ``upper
      basement`` (-1) or ``lower basement`` (-2) in place of ``<n>``.
    * Stand-alone labels such as ``"ground floor"``, ``"1st"``, ``"basement"``.
    * Labels with no numeric level (``"penthouse"``, ``"duplex"``, ...).
    * A plain number.
    * A range ``"a-b"`` of two non-negative integers, reported as their mean.

    Args:
        floor_str: The raw floor text.

    Returns:
        A ``(flat_floor, total_floors)`` tuple of floats. Either element is
        ``np.nan`` when it cannot be determined; ``total_floors`` is only set
        for the ``"... out of <m>"`` forms.
    """
    if pd.isna(floor_str) or not isinstance(floor_str, str) or not floor_str.strip():
        return np.nan, np.nan

    text = floor_str.strip().lower()

    out_of = _FLOOR_OUT_OF_PATTERN.match(text)
    if out_of:
        level, total = out_of.groups()
        if level in _NAMED_LEVELS_OUT_OF:
            return _NAMED_LEVELS_OUT_OF[level], float(total)
        return float(level), float(total)

    if text in _STANDALONE_LEVELS:
        return _STANDALONE_LEVELS[text], np.nan

    if text in _UNRANKED_LABELS:
        return np.nan, np.nan

    try:
        return float(text), np.nan
    except ValueError:
        pass

    # A range such as "2-3" is collapsed to its midpoint.
    parts = text.split('-')
    if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
        return (float(parts[0]) + float(parts[1])) / 2.0, np.nan

    return np.nan, np.nan

def clean_Bathroom_count(Bathroom_str):
    """Convert a bathroom-count value to a float.

    Args:
        Bathroom_str: The raw value. Strings are stripped and lower-cased;
            ``"> 10"`` (with or without the space) is reported as 11.0 and
            ``"not available"`` as missing. Real numbers (Python or NumPy
            ints/floats) are accepted directly, so a column that was read
            with a numeric dtype is not discarded.

    Returns:
        The count as a float, or ``np.nan`` for missing, boolean,
        unsupported or unparsable input.
    """
    # bool is a subclass of int; it is not a meaningful count.
    if isinstance(Bathroom_str, bool):
        return np.nan
    if isinstance(Bathroom_str, (int, float, np.integer, np.floating)):
        return float(Bathroom_str)  # a NaN input stays NaN
    if not isinstance(Bathroom_str, str):
        return np.nan

    text = Bathroom_str.strip().lower()
    if text == 'not available':
        return np.nan
    if text.replace(' ', '') == '>10':
        return 11.0
    try:
        return float(text)
    except ValueError:
        return np.nan

def clean_balcony_count(balcony_str):
    """Convert a balcony-count value to a float.

    Args:
        balcony_str: The raw value. Strings are stripped and lower-cased;
            ``"> 10"`` (with or without the space) is reported as 11.0 and
            ``"not available"`` as missing. Real numbers (Python or NumPy
            ints/floats) are accepted directly, so a numeric placeholder such
            as 0 written into the column is kept instead of becoming NaN.

    Returns:
        The count as a float, or ``np.nan`` for missing, boolean,
        unsupported or unparsable input.
    """
    # bool is a subclass of int; it is not a meaningful count.
    if isinstance(balcony_str, bool):
        return np.nan
    if isinstance(balcony_str, (int, float, np.integer, np.floating)):
        return float(balcony_str)  # a NaN input stays NaN
    if not isinstance(balcony_str, str):
        return np.nan

    text = balcony_str.strip().lower()
    if text == 'not available':
        return np.nan
    if text.replace(' ', '') == '>10':
        return 11.0
    try:
        return float(text)
    except ValueError:
        return np.nan

# Load and preprocess data
df = pd.read_csv('house price prediction/house_prices_small.csv')
df = df.rename(columns={'Amount(in rupees)': 'Amount', 'Price (in rupees)': 'Price'})

# Discard columns that are not used as features.
df = df.drop(['Index', 'Dimensions', 'Plot Area', 'Society', 'Title', 'Description', 'Price', 'Super Area'], axis=1)

# Rows missing any of these required fields are removed outright.
df = df.dropna(subset=['Amount', 'Status', 'Floor', 'Furnishing', 'Bathroom', 'Transaction', 'Carpet Area'])

# Optional fields get an explicit placeholder instead of NaN.
df['facing'] = df['facing'].fillna('Unknown')
df['overlooking'] = df['overlooking'].fillna('Not Available')
df['Ownership'] = df['Ownership'].fillna('Not Available')
df['Car Parking'] = df['Car Parking'].fillna('Not Available')
df['Balcony'] = df['Balcony'].fillna(0)

# Parse the textual amount in place.
df['Amount'] = df['Amount'].apply(parse_amount)

# Replace each raw area column with a "<name>_sqft" numeric column.
column_to_convert = ['Carpet Area']
for col in column_to_convert:
    df[f'{col}_sqft'] = df[col].apply(convert_area_to_sqft)
df = df.drop(columns=column_to_convert)

df['parking_type'] = df['Car Parking'].apply(parse_parking_details)
df = df.drop('Car Parking', axis=1)

# Build both floor columns from one pass over the parsed tuples; this avoids
# constructing a pd.Series object for every row.
floor_info = pd.DataFrame(
    df['Floor'].map(parse_floor_info).tolist(),
    index=df.index,
    columns=['flat_floor', 'total_floors'],
)
df[['flat_floor', 'total_floors']] = floor_info
df = df.drop('Floor', axis=1)

# Collapse different orderings of the same set of views into one spelling.
df['overlooking'] = df['overlooking'].replace({'Pool, Garden/Park, Main Road':'Garden/Park, Pool, Main Road',
                                              'Main Road, Garden/Park':'Garden/Park, Main Road',
                                              'Main Road, Garden/Park, Pool':'Garden/Park, Pool, Main Road',
                                              'Pool, Garden/Park':'Garden/Park, Pool',
                                              'Garden/Park, Main Road, Pool':'Garden/Park, Pool, Main Road',
                                              'Main Road, Pool, Garden/Park':'Garden/Park, Pool, Main Road',
                                              'Pool, Main Road, Garden/Park':'Garden/Park, Pool, Main Road',
                                              'Main Road, Pool':'Pool, Main Road'})

df['Bathroom_numeric'] = df['Bathroom'].apply(clean_Bathroom_count)
df = df.drop('Bathroom', axis=1)
df['Balcony_numeric'] = df['Balcony'].apply(clean_balcony_count)
df = df.drop('Balcony', axis=1)

# Every parsed numeric column must be present: a NaN left in any of them
# would otherwise be carried into the feature matrix.
df = df.dropna(subset=['Amount', 'Bathroom_numeric', 'Balcony_numeric',
                       'flat_floor', 'total_floors', 'Carpet Area_sqft'])

df = df.rename(columns={'Bathroom_numeric':'Bathrooms','Balcony_numeric':'Balcony','flat_floor':'Flat_floor','total_floors':'Total_floors',
                       'location':'Location','facing':'Facing','Carpet Area_sqft':'Carpet_Area','parking_type':'Parking_type'})

# Keep only rows whose numeric features all lie within 1.5 * IQR of the
# quartiles. NaN values compare False on both sides, so they are not treated
# as outliers.
columns_to_check = ['Carpet_Area', 'Flat_floor', 'Total_floors', 'Bathrooms', 'Balcony']
numeric_view = df[columns_to_check]
first_quartile = numeric_view.quantile(0.25)
third_quartile = numeric_view.quantile(0.75)
quartile_spread = third_quartile - first_quartile
below_range = numeric_view.lt(first_quartile - 1.5 * quartile_spread)
above_range = numeric_view.gt(third_quartile + 1.5 * quartile_spread)
non_outlier_mask = ~(below_range | above_range).any(axis=1)
df = df[non_outlier_mask].copy()

# One-hot encode the categorical features; 'Status' is discarded rather than
# encoded.
categorical_columns = [
    'Location',
    'Transaction',
    'Furnishing',
    'Facing',
    'overlooking',
    'Ownership',
    'Parking_type',
]
df_encoded = pd.get_dummies(
    df.drop(columns='Status'),
    columns=categorical_columns,
    drop_first=False,
)

# Save the preprocessed data
df_encoded.to_csv('preprocessed_data.csv', index=False)

# Train the model
# The encoded frame mixes float features with indicator columns from
# get_dummies (boolean in recent pandas releases). Casting everything to
# float32 gives Keras a single homogeneous numeric array instead of an
# object-dtype one that it cannot convert to a tensor.
X = df_encoded.drop(columns='Amount').astype('float32')
y = df_encoded['Amount']

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build and train the model: three ReLU hidden layers feeding a single linear
# output, trained with mean squared error.
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# Save the model
model.save('house_price_model.h5')
