In [41]:
print(df.columns)

Index(['Index', 'Title', 'Description', 'Amount(in rupees)',
       'Price (in rupees)', 'location', 'Carpet Area', 'Status', 'Floor',
       'Transaction', 'Furnishing', 'facing', 'overlooking', 'Society',
       'Bathroom', 'Balcony', 'Car Parking', 'Ownership', 'Super Area',
       'Dimensions', 'Plot Area'],
      dtype='object')


In [42]:
# Handle missing values for 'Status' (instead of 'Possession Status')
if 'Status' in df.columns:
    df['Status'] = df['Status'].fillna(df['Status'].mode()[0])

# Similarly, for other columns like 'Availability' (if it exists in your dataset)
# Check for other columns and fill missing values as required

# Handle missing values for other columns based on data type:
df['Carpet Area'] = df['Carpet Area'].fillna(df['Carpet Area'].median())
df['Price (in rupees)'] = df['Price (in rupees)'].fillna(df['Price (in rupees)'].median())
df['Amount(in rupees)'] = df['Amount(in rupees)'].fillna(df['Amount(in rupees)'].median())
df['Floor'] = df['Floor'].fillna(df['Floor'].mode()[0])  # Most frequent value for categorical data

# For categorical columns, fill missing values with the mode (most frequent value)
# For example, 'Furnishing' and 'facing' (if required)

df['Furnishing'] = df['Furnishing'].fillna(df['Furnishing'].mode()[0])
df['facing'] = df['facing'].fillna(df['facing'].mode()[0])

# Check the shape of the dataframe after filling missing values
print(df.shape)


(187531, 21)


In [44]:
# Remove non-numeric characters (like ' sqft') and convert the values to float
df['Super Area'] = df['Super Area'].replace(r'\D', '', regex=True).astype(float)

# Now, attempt to convert the other columns again
df['Carpet Area'] = df['Carpet Area'].astype(float)
df['Price (in rupees)'] = df['Price (in rupees)'].astype(float)
df['Amount(in rupees)'] = df['Amount(in rupees)'].astype(float)
df['Plot Area'] = df['Plot Area'].astype(float)


In [None]:
# Handle missing values for categorical columns
def fill_with_mode_or_default(column):
    mode_value = column.mode()
    if not mode_value.empty:
        return column.fillna(mode_value[0])
    else:
        return column.fillna('Unknown')  # or another appropriate default value

# Apply this function to categorical columns
df['Status'] = fill_with_mode_or_default(df['Status'])
df['Floor'] = fill_with_mode_or_default(df['Floor'])
df['Furnishing'] = fill_with_mode_or_default(df['Furnishing'])
df['facing'] = fill_with_mode_or_default(df['facing'])
df['Society'] = fill_with_mode_or_default(df['Society'])
df['Bathroom'] = fill_with_mode_or_default(df['Bathroom'])
df['Balcony'] = fill_with_mode_or_default(df['Balcony'])
df['Car Parking'] = fill_with_mode_or_default(df['Car Parking'])
df['Ownership'] = fill_with_mode_or_default(df['Ownership'])
df['Transaction'] = fill_with_mode_or_default(df['Transaction'])
df['Dimensions'] = fill_with_mode_or_default(df['Dimensions'])

# Check for any remaining missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])


Description      3023
overlooking     81436
Plot Area      187531
dtype: int64


In [47]:
# Fill 'Description' with a placeholder
df['Description'] = df['Description'].fillna('No description available')

# Fill 'overlooking' with the mode (most frequent value)
df['overlooking'] = df['overlooking'].fillna(df['overlooking'].mode()[0])

# Fill 'Plot Area' with the median (or mean if more appropriate)
df['Plot Area'] = df['Plot Area'].fillna(df['Plot Area'].median())  # or use mean()

# Check for any remaining missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Plot Area    187531
dtype: int64


In [48]:
# Drop 'Plot Area' column since it contains only missing values
df = df.drop(columns=['Plot Area'])

# Check if any columns still have missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])


Series([], dtype: int64)


In [49]:
# Drop duplicate rows if any
df = df.drop_duplicates()

# Check if any duplicates remain
print(df.duplicated().sum())


0


In [51]:
# Detecting outliers using IQR (Interquartile Range)
Q1 = df['Price (in rupees)'].quantile(0.25)
Q3 = df['Price (in rupees)'].quantile(0.75)
IQR = Q3 - Q1
outlier_threshold_lower = Q1 - 1.5 * IQR
outlier_threshold_upper = Q3 + 1.5 * IQR

# Filter out outliers
df = df[(df['Price (in rupees)'] >= outlier_threshold_lower) & (df['Price (in rupees)'] <= outlier_threshold_upper)]


In [52]:
from sklearn.preprocessing import StandardScaler

# Select numerical columns to scale
numerical_columns = ['Price (in rupees)', 'Amount(in rupees)', 'Super Area', 'Carpet Area']

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the data
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Verify the scaling
print(df[numerical_columns].head())



   Price (in rupees)  Amount(in rupees)  Super Area  Carpet Area
0          -0.099417           0.125157   -0.019607    -0.246424
3          -0.085880          -0.435147   -0.019607    -0.233863
5           0.146636           0.224034   -0.436424    -0.037074
6          -1.477794          -0.715299   -0.019607    -0.225489
7           1.666354           0.718420   -0.513206    -0.037074


In [53]:
# Perform one-hot encoding on categorical columns
df_encoded = pd.get_dummies(df, columns=['Ownership', 'Transaction', 'Furnishing', 'facing', 'Society', 'Bathroom', 'Balcony', 'Car Parking'], drop_first=True)

# Verify the encoding
print(df_encoded.head())


   Index                                              Title  \
0      0  1 BHK Ready to Occupy Flat for sale in Srushti...   
3      3        1 BHK Ready to Occupy Flat for sale Kasheli   
5      5  1 BHK Ready to Occupy Flat for sale in Virat A...   
6      6         1 BHK Ready to Occupy Flat for sale Mumbra   
7      7          1 BHK Ready to Occupy Flat for sale Kalwa   

                                         Description  Amount(in rupees)  \
0  Bhiwandi, Thane has an attractive 1 BHK Flat f...           0.125157   
3  This beautiful 1 BHK Flat is available for sal...          -0.435147   
5  Creatively planned and constructed is a 1 BHK ...           0.224034   
6  This magnificent 1 BHK Flat is available for s...          -0.715299   
7  Creatively planned and constructed is a 1 BHK ...           0.718420   

   Price (in rupees) location  Carpet Area         Status            Floor  \
0          -0.099417    thane    -0.246424  Ready to Move     10 out of 11   
3          -0.

In [54]:
# Define target and features
X = df_encoded.drop('Price (in rupees)', axis=1)  # Features
y = df_encoded['Price (in rupees)']  # Target

# Check the shapes
print(X.shape, y.shape)


(171490, 9906) (171490,)


In [55]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the shapes
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(137192, 9906) (34298, 9906) (137192,) (34298,)


In [58]:
print(df.columns)

Index(['Index', 'Title', 'Amount(in rupees)', 'Price (in rupees)', 'location',
       'Carpet Area', 'Status', 'Floor', 'Transaction', 'Furnishing', 'facing',
       'overlooking', 'Society', 'Bathroom', 'Balcony', 'Car Parking',
       'Ownership', 'Super Area', 'Dimensions'],
      dtype='object')


In [59]:
# Drop the 'Title' column if it's not useful for prediction
df = df.drop(columns=['Title'])

# Now, you can proceed with your model training


In [60]:
# Separate features (X) and target variable (y)
X = df.drop(columns=['Price (in rupees)'])  # Assuming 'Price (in rupees)' is your target variable
y = df['Price (in rupees)']

# Split the dataset into training and test sets (80% training, 20% testing)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the resulting datasets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(137192, 17) (34298, 17) (137192,) (34298,)


In [62]:
# Check the data types of each column
categorical_columns = X.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_columns)


Categorical columns: Index(['location', 'Status', 'Floor', 'Transaction', 'Furnishing', 'facing',
       'overlooking', 'Society', 'Bathroom', 'Balcony', 'Car Parking',
       'Ownership', 'Dimensions'],
      dtype='object')


In [63]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Apply label encoding to categorical columns
categorical_cols_to_encode = ['Status', 'Furnishing', 'Transaction', 'facing', 'Bathroom', 'Balcony', 'Car Parking']

for col in categorical_cols_to_encode:
    X[col] = label_encoder.fit_transform(X[col])

# Check if the encoding was successful
print(X.head())


   Index  Amount(in rupees) location  Carpet Area  Status            Floor  \
0      0           0.125157    thane    -0.246424       0     10 out of 11   
3      3          -0.435147    thane    -0.233863       0       1 out of 3   
5      5           0.224034    thane    -0.037074       0       2 out of 7   
6      6          -0.715299    thane    -0.225489       0       4 out of 5   
7      7           0.718420    thane    -0.037074       0  Ground out of 7   

   Transaction  Furnishing  facing             overlooking  \
0            3           2       0               Main Road   
3            3           2       0               Main Road   
5            3           2       0  Garden/Park, Main Road   
6            3           2       0               Main Road   
7            3           0       0               Main Road   

                               Society  Bathroom  Balcony  Car Parking  \
0  Srushti Siddhi Mangal Murti Complex         0        2            0   
3         

In [64]:
# Apply one-hot encoding to columns with many unique categories
X = pd.get_dummies(X, columns=['location', 'overlooking', 'Society', 'Ownership', 'Dimensions'], drop_first=True)

# Check the encoded dataframe
print(X.head())


   Index  Amount(in rupees)  Carpet Area  Status            Floor  \
0      0           0.125157    -0.246424       0     10 out of 11   
3      3          -0.435147    -0.233863       0       1 out of 3   
5      5           0.224034    -0.037074       0       2 out of 7   
6      6          -0.715299    -0.225489       0       4 out of 5   
7      7           0.718420    -0.037074       0  Ground out of 7   

   Transaction  Furnishing  facing  Bathroom  Balcony  ...  \
0            3           2       0         0        2  ...   
3            3           2       0         0        0  ...   
5            3           2       0         0        0  ...   
6            3           2       0         0        2  ...   
7            3           0       0         0        2  ...   

   Society_varsha meadows  Society_vijaya heritage  Society_vishwaketu 2  \
0                   False                    False                 False   
3                   False                    False          

In [66]:
# Check the unique values in the 'Bathroom' column (or other suspect columns)
print(X['Bathroom'].unique())


[ 0  3  2  4  1  5  6  7  8 10  9]


In [68]:
# Convert all values to string before extracting numbers
X['Bathroom'] = X['Bathroom'].astype(str).str.extract(r'(\d+)').astype(float)

# Check unique cleaned values
print(X['Bathroom'].unique())


[ 0.  3.  2.  4.  1.  5.  6.  7.  8. 10.  9.]


In [69]:
print(X.isnull().sum())


Index                          0
Amount(in rupees)              0
Carpet Area                    0
Status                         0
Floor                          0
                              ..
Society_yash tower             0
Society_yogi complex           0
Ownership_Freehold             0
Ownership_Leasehold            0
Ownership_Power Of Attorney    0
Length: 9747, dtype: int64


In [70]:
X = X.fillna(0)  # or X.dropna() if you want to drop missing rows


In [73]:
import re

def extract_floor(value):
    if isinstance(value, str):
        match = re.search(r'\d+', value)
        return int(match.group()) if match else None
    return value  # In case it's already numeric or NaN

X['Floor'] = X['Floor'].apply(extract_floor)


In [74]:
print(X['Floor'].unique())
print(X['Floor'].isna().sum())  # Check how many NaNs resulted


[ 10.   1.   2.   4.   7.   3.   6.  16.   5.  20.  27.  11.   9.  12.
  14.  21.  15.   8.  13.  17.  23.  24.  19.  18.  22.  50.  28.  25.
  26.  33.  34.  38.  30.  nan  31.  29.  40.  32.  35.  39. 200.]
5


In [75]:
X = X.dropna(subset=['Floor'])


In [77]:
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
X_scaled = scaler.fit_transform(X[numeric_cols])


In [80]:
# First, align X and y to ensure they have the same index
X, y = X.align(y, join='inner', axis=0)

# Then scale only numeric columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
X_scaled = scaler.fit_transform(X[numeric_cols])


In [82]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [86]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Linear Regression model
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = lin_reg_model.predict(X_test)

# Evaluate the model
print("Linear Regression - Mean Squared Error:", mean_squared_error(y_test, y_pred_lr))
print("Linear Regression - R² Score:", r2_score(y_test, y_pred_lr))


Linear Regression - Mean Squared Error: 0.7631589271554344
Linear Regression - R² Score: 0.2387876371160076


In [85]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Decision Tree Regressor
dtr_model = DecisionTreeRegressor(random_state=42)
dtr_model.fit(X_train, y_train)

# Predict on the test set
y_pred_dtr = dtr_model.predict(X_test)

# Evaluate the model
print("DTR - Mean Squared Error:", mean_squared_error(y_test, y_pred_dtr))
print("DTR - R² Score:", r2_score(y_test, y_pred_dtr))


DTR - Mean Squared Error: 0.06992486248612542
DTR - R² Score: 0.9302534925512825
