In [None]:
import pandas as pd

In [None]:
# Read the real-estate listings CSV into the working DataFrame `df`.
csv_path = '/content/RealEstate_price.csv'
df = pd.read_csv(csv_path)

In [None]:
# Normalise the column labels in three steps: trim surrounding whitespace,
# turn spaces into underscores, then drop every character that is not an
# ASCII letter, digit or underscore.
trimmed_labels = df.columns.str.strip()
underscored_labels = trimmed_labels.str.replace(' ', '_')
df.columns = underscored_labels.str.replace(r'[^a-zA-Z0-9_]', '', regex=True)

In [None]:
# Preview the first five rows as a quick sanity check of the loaded data.
df.head(n=5)

Unnamed: 0,Home,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick,Neighborhood,Sales_date
0,1,114300.0,1790,2.0,2,2,No,East,15-01-2021
1,2,114200.0,2030,4.0,2,3,No,East,21-09-2022
2,3,114800.0,1740,3.0,2,1,No,East,13-03-2022
3,4,94700.0,1980,3.0,2,3,No,East,31-08-2021
4,5,119800.0,2130,3.0,3,3,No,East,31-08-2021


In [None]:
# Show the column labels currently set on the frame.
column_labels = df.columns
print(column_labels)

Index(['Home', 'Price', 'SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Brick',
       'Neighborhood', 'Sales_date'],
      dtype='object')


In [None]:
# Count the missing entries in each column (isna is an alias of isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

Home            0
Price           4
SqFt            0
Bedrooms        4
Bathrooms       0
Offers          0
Brick           0
Neighborhood    0
Sales_date      0
dtype: int64


In [None]:
# Impute the missing values: 'Price' gets its median and 'Bedrooms' gets its
# most frequent value (mode).
#
# The filled Series is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`. That chained-inplace form operates on
# an intermediate object, raises a FutureWarning in current pandas, and will
# no longer modify `df` at all in pandas 3.0.
price_median = df['Price'].median()
bedrooms_mode = df['Bedrooms'].mode()[0]  # first entry when several values tie

df['Price'] = df['Price'].fillna(price_median)
df['Bedrooms'] = df['Bedrooms'].fillna(bedrooms_mode)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Price'].fillna(df['Price'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Bedrooms'].fillna(df['Bedrooms'].mode()[0], inplace=True)


In [None]:
# Re-count missing values per column to confirm the imputation left none.
remaining_missing = df.isna().sum()
print(remaining_missing)

Home            0
Price           0
SqFt            0
Bedrooms        0
Bathrooms       0
Offers          0
Brick           0
Neighborhood    0
Sales_date      0
dtype: int64


In [None]:
# Parse the day-month-year strings (e.g. 15-01-2021) into datetime values so
# the column can be compared against dates.
parsed_sale_dates = pd.to_datetime(df['Sales_date'], format='%d-%m-%Y')
df['Sales_date'] = parsed_sale_dates

In [None]:
# Keep properties in the 'East' neighborhood sold on or after 1 January 2021.
# NOTE(review): the earlier wording said "after January 2021", but this cutoff
# also keeps sales made during January 2021 - confirm which one is intended.
sold_since_cutoff = df['Sales_date'] >= '2021-01-01'
located_in_east = df['Neighborhood'] == 'East'
filtered_df = df[sold_since_cutoff & located_in_east]

# Show the first rows of the filtered selection.
print(filtered_df.head())

   Home     Price  SqFt  Bedrooms  Bathrooms  Offers Brick Neighborhood  \
0     1  114300.0  1790       2.0          2       2    No         East   
1     2  114200.0  2030       4.0          2       3    No         East   
2     3  114800.0  1740       3.0          2       1    No         East   
3     4   94700.0  1980       3.0          2       3    No         East   
4     5  119800.0  2130       3.0          3       3    No         East   

  Sales_date  
0 2021-01-15  
1 2022-09-21  
2 2022-03-13  
3 2021-08-31  
4 2021-08-31  


In [None]:
# One-hot encode 'Neighborhood'; drop_first=True omits the first category's
# indicator column so the remaining indicators are not redundant.
df_encoded = pd.get_dummies(df, columns=['Neighborhood'], drop_first=True)

# Binary-encode 'Brick': 1 where the value is exactly 'Yes', 0 for anything else.
brick_is_yes = df_encoded['Brick'] == 'Yes'
df_encoded['Brick'] = brick_is_yes.astype('int64')



In [None]:
# Mean sale price within each neighborhood.
prices_by_neighborhood = df.groupby('Neighborhood')['Price']
average_price_by_neighborhood = prices_by_neighborhood.mean()

# Print the per-neighborhood averages.
print(average_price_by_neighborhood)

Neighborhood
East     124680.000000
North    111131.818182
West     159294.871795
Name: Price, dtype: float64


In [None]:
# Flag 'Price' outliers with the interquartile-range (IQR) rule: anything more
# than 1.5 * IQR below the first quartile or above the third quartile.
prices = df['Price']
Q1 = prices.quantile(0.25)
Q3 = prices.quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a price counts as an outlier.
fence_multiplier = 1.5
lower_bound = Q1 - fence_multiplier * IQR
upper_bound = Q3 + fence_multiplier * IQR

# Keep only rows whose price lies within the fences (both ends inclusive).
within_fences = prices.between(lower_bound, upper_bound)
df_no_outliers = df[within_fences]

# Show the first rows of the outlier-free data.
print(df_no_outliers.head())


   Home     Price  SqFt  Bedrooms  Bathrooms  Offers Brick Neighborhood  \
0     1  114300.0  1790       2.0          2       2    No         East   
1     2  114200.0  2030       4.0          2       3    No         East   
2     3  114800.0  1740       3.0          2       1    No         East   
3     4   94700.0  1980       3.0          2       3    No         East   
4     5  119800.0  2130       3.0          3       3    No         East   

  Sales_date  
0 2021-01-15  
1 2022-09-21  
2 2022-03-13  
3 2021-08-31  
4 2021-08-31  
