In [1]:
import pandas as pd

In [2]:
# Load the real-estate listings; the path is resolved relative to the working directory.
DATA_PATH = "RealEstate_Prices.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Print each column's dtype and non-null count to spot columns with missing data.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Property ID               100 non-null    int64  
 1   Property Type             100 non-null    object 
 2   Location                  100 non-null    object 
 3   Bedrooms                  100 non-null    int64  
 4   Bathrooms                 100 non-null    int64  
 5   Square Feet               100 non-null    int64  
 6   Year Built                80 non-null     float64
 7   Sale Price                90 non-null     float64
 8   Neighborhood              100 non-null    object 
 9   Amenities Distance Miles  100 non-null    float64
dtypes: float64(3), int64(4), object(3)
memory usage: 7.9+ KB


In [4]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,Property ID,Property Type,Location,Bedrooms,Bathrooms,Square Feet,Year Built,Sale Price,Neighborhood,Amenities Distance Miles
0,1,House,Downtown,2,3,1148,2009.0,807870.0,Neighborhood A,4.745561
1,2,Apartment,Suburb,5,1,2663,2019.0,776389.0,Neighborhood C,1.453618
2,3,House,Downtown,4,3,2957,,,Neighborhood A,2.480775
3,4,House,Rural,4,3,2721,2001.0,186148.0,Neighborhood C,0.141468
4,5,Apartment,Downtown,2,1,2780,2009.0,405378.0,Neighborhood B,0.305964


In [5]:
# Normalise column labels to snake_case: trim surrounding whitespace,
# lowercase, and turn spaces into underscores.
df.columns = [label.strip().lower().replace(' ', '_') for label in df.columns]
df.head()

Unnamed: 0,property_id,property_type,location,bedrooms,bathrooms,square_feet,year_built,sale_price,neighborhood,amenities_distance_miles
0,1,House,Downtown,2,3,1148,2009.0,807870.0,Neighborhood A,4.745561
1,2,Apartment,Suburb,5,1,2663,2019.0,776389.0,Neighborhood C,1.453618
2,3,House,Downtown,4,3,2957,,,Neighborhood A,2.480775
3,4,House,Rural,4,3,2721,2001.0,186148.0,Neighborhood C,0.141468
4,5,Apartment,Downtown,2,1,2780,2009.0,405378.0,Neighborhood B,0.305964


In [6]:
# Count missing values per column.
df.isna().sum()

property_id                  0
property_type                0
location                     0
bedrooms                     0
bathrooms                    0
square_feet                  0
year_built                  20
sale_price                  10
neighborhood                 0
amenities_distance_miles     0
dtype: int64

In [7]:
# Impute missing numeric values with the column mean.
# numeric_only=True restricts the mean to numeric columns: the frame also holds
# string columns, for which a mean is undefined (older pandas emits a
# FutureWarning here, pandas >= 2.0 raises TypeError).
# Rebinding instead of inplace=True keeps the step an explicit expression.
df = df.fillna(df.mean(numeric_only=True))

  df.fillna(df.mean(), inplace=True)


In [8]:
# Re-count missing values per column to confirm the imputation left none.
df.isna().sum()

property_id                 0
property_type               0
location                    0
bedrooms                    0
bathrooms                   0
square_feet                 0
year_built                  0
sale_price                  0
neighborhood                0
amenities_distance_miles    0
dtype: int64

In [9]:
# Keep only Downtown listings that have more than two bedrooms.
filtered_df = df.loc[df['location'].eq('Downtown') & df['bedrooms'].gt(2)]
filtered_df

Unnamed: 0,property_id,property_type,location,bedrooms,bathrooms,square_feet,year_built,sale_price,neighborhood,amenities_distance_miles
2,3,House,Downtown,4,3,2957,2005.5,568244.222222,Neighborhood A,2.480775
9,10,House,Downtown,3,3,1375,2003.0,381195.0,Neighborhood A,3.55065
11,12,Apartment,Downtown,3,1,1697,2009.0,808658.0,Neighborhood C,2.967904
15,16,Apartment,Downtown,3,3,1938,2001.0,780931.0,Neighborhood C,1.068397
55,56,Condo,Downtown,4,1,2477,2005.5,651071.0,Neighborhood A,1.648236
62,63,House,Downtown,3,2,2139,2010.0,374938.0,Neighborhood A,0.996957
73,74,Condo,Downtown,3,1,1009,2019.0,551868.0,Neighborhood B,0.861997
74,75,Condo,Downtown,4,2,1297,1997.0,449542.0,Neighborhood B,1.601626
80,81,Apartment,Downtown,4,2,1851,2005.0,616370.0,Neighborhood B,0.280778
81,82,Condo,Downtown,4,2,1133,2009.0,201803.0,Neighborhood A,2.156767


In [10]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so that level is represented by all-zero indicators.
df1 = pd.get_dummies(
    data=df,
    columns=['property_type', 'location', 'neighborhood'],
    drop_first=True,
)
df1

Unnamed: 0,property_id,bedrooms,bathrooms,square_feet,year_built,sale_price,amenities_distance_miles,property_type_Condo,property_type_House,location_Rural,location_Suburb,neighborhood_Neighborhood B,neighborhood_Neighborhood C
0,1,2,3,1148,2009.0,807870.000000,4.745561,0,1,0,0,0,0
1,2,5,1,2663,2019.0,776389.000000,1.453618,0,0,0,1,0,1
2,3,4,3,2957,2005.5,568244.222222,2.480775,0,1,0,0,0,0
3,4,4,3,2721,2001.0,186148.000000,0.141468,0,1,1,0,0,1
4,5,2,1,2780,2009.0,405378.000000,0.305964,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,5,1,1111,2013.0,626304.000000,1.954056,1,0,0,1,0,0
96,97,3,3,2654,1997.0,568244.222222,3.216269,0,0,1,0,1,0
97,98,3,2,2740,2006.0,262080.000000,0.187464,1,0,1,0,0,0
98,99,2,2,1233,2005.5,686411.000000,3.524208,1,0,0,0,1,0


In [11]:
# Mean sale price for each neighborhood.
average_sale_price_by_neighborhood = (
    df.groupby('neighborhood')['sale_price']
    .agg('mean')
)
average_sale_price_by_neighborhood

neighborhood
Neighborhood A    533672.207207
Neighborhood B    542188.843137
Neighborhood C    642901.030651
Name: sale_price, dtype: float64

In [12]:
def handle_outliers_with_iqr(data, column, factor=1.5):
    """Split ``data`` into inlier and outlier rows using the IQR rule.

    A row is an outlier when its value in ``column`` lies strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``column``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. It is not modified.
    column : str
        Name of the numeric column the rule is applied to.
    factor : float, default 1.5
        Multiplier of the IQR that sets the fence width.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(inliers, outliers)``. Every input row appears in exactly one of
        the two frames; rows whose value is NaN are kept with the inliers.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Build one mask and use its complement for the inliers. NaN compares
    # False on both sides, so NaN rows are not flagged and stay with the
    # inliers instead of silently disappearing from both results.
    is_outlier = (values < lower_bound) | (values > upper_bound)

    outliers = data[is_outlier]
    inliers = data[~is_outlier]

    return inliers, outliers

In [13]:
# Split off sale_price outliers using the IQR rule.
# NOTE: this rebinds `df` to the filtered frame, so re-running the cell applies
# the filter to already-filtered data rather than to the original frame.
df, outliers = handle_outliers_with_iqr(df, 'sale_price')
outliers

Unnamed: 0,property_id,property_type,location,bedrooms,bathrooms,square_feet,year_built,sale_price,neighborhood,amenities_distance_miles
