# Practice Set

# Mastering Data Cleaning and Data Preparation

NAME: Harshita Pandey

## Load dataset

In [1]:
import pandas as pd

In [2]:
df = pd. read_csv("C:/Users/hp/Downloads/Real_estate Dataset - real_estate_data.csv")

In [3]:
#Displaying 5 rows

df.head(5)

Unnamed: 0,Property_ID,Location,Property_Type,Price,Size_sqft,Bedrooms
0,P001,Mumbai,Villa,9000000.0,1136.0,2.0
1,P002,Chennai,Apartment,5400000.0,1828.0,
2,P003,Hyderabad,Plot,,897.0,3.0
3,P004,Delhi,Studio,7500000.0,,2.0
4,P005,Delhi,Apartment,8900000.0,2906.0,2.0


## Check Missing Values

In [5]:
missing_values = df.isnull().sum()
missing_values

Property_ID      0
Location         0
Property_Type    0
Price            7
Size_sqft        7
Bedrooms         7
dtype: int64

## Handle Missing Values

In [8]:
mean_bedrooms = df['Bedrooms'].mean()
mean_bedrooms

np.float64(2.6)

In [9]:
median_bedrooms = df['Bedrooms'].median()
median_bedrooms

np.float64(3.0)

In [13]:
mode_bedrooms = df['Bedrooms'].mode()
mode_bedrooms

0    3.0
Name: Bedrooms, dtype: float64

In [21]:
# Fill missing Bedrooms with median

df['Bedrooms'].fillna(df['Bedrooms'].median(), inplace=True)
df.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Bedrooms'].fillna(df['Bedrooms'].median(), inplace=True)


Unnamed: 0,Property_ID,Location,Property_Type,Price,Size_sqft,Bedrooms
0,P001,Mumbai,Villa,9000000.0,1136.0,2.0
1,P002,Chennai,Apartment,5400000.0,1828.0,3.0
2,P003,Hyderabad,Plot,,897.0,3.0
3,P004,Delhi,Studio,7500000.0,,2.0
4,P005,Delhi,Apartment,8900000.0,2906.0,2.0
5,P006,Delhi,Apartment,3600000.0,900.0,3.0
6,P007,Mumbai,Plot,11200000.0,2672.0,3.0
7,P008,Pune,Plot,,2029.0,4.0
8,P009,Mumbai,Villa,7300000.0,1600.0,3.0
9,P010,Pune,Apartment,4300000.0,1246.0,1.0


In [26]:
mean_size = df['Size_sqft'].mean()
mean_size

np.float64(1536.0192307692307)

In [27]:
median_size = df['Size_sqft'].median()
median_size

np.float64(1433.0)

In [23]:
# Fill missing Size_sqft with median

df['Size_sqft'].fillna(df['Size_sqft'].median(), inplace=True)
df.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Size_sqft'].fillna(df['Size_sqft'].median(), inplace=True)


Unnamed: 0,Property_ID,Location,Property_Type,Price,Size_sqft,Bedrooms
0,P001,Mumbai,Villa,9000000.0,1136.0,2.0
1,P002,Chennai,Apartment,5400000.0,1828.0,3.0
2,P003,Hyderabad,Plot,,897.0,3.0
3,P004,Delhi,Studio,7500000.0,1433.0,2.0
4,P005,Delhi,Apartment,8900000.0,2906.0,2.0
5,P006,Delhi,Apartment,3600000.0,900.0,3.0
6,P007,Mumbai,Plot,11200000.0,2672.0,3.0
7,P008,Pune,Plot,,2029.0,4.0
8,P009,Mumbai,Villa,7300000.0,1600.0,3.0
9,P010,Pune,Apartment,4300000.0,1246.0,1.0


In [28]:
# Check if missing values remain in these columns

print(df[['Bedrooms', 'Size_sqft']].isnull().sum())

Bedrooms     0
Size_sqft    0
dtype: int64


## Handle Missing Price

In [34]:
#To handle missing Price values, drop all rows where the Price column has missing (NaN) values.

# Drop rows where Price is missing
df = df.dropna(subset=['Price'])
df

Unnamed: 0,Property_ID,Location,Property_Type,Price,Size_sqft,Bedrooms
0,P001,Mumbai,Villa,9000000.0,1136.0,2.0
1,P002,Chennai,Apartment,5400000.0,1828.0,3.0
3,P004,Delhi,Studio,7500000.0,1433.0,2.0
4,P005,Delhi,Apartment,8900000.0,2906.0,2.0
5,P006,Delhi,Apartment,3600000.0,900.0,3.0
6,P007,Mumbai,Plot,11200000.0,2672.0,3.0
8,P009,Mumbai,Villa,7300000.0,1600.0,3.0
9,P010,Pune,Apartment,4300000.0,1246.0,1.0
10,P011,Hyderabad,Villa,5400000.0,2825.0,3.0
11,P012,Chennai,Apartment,4700000.0,2766.0,3.0


In [32]:
# Check how many rows remain

print("Number of rows remaining after dropping missing Price:", len(df))

Number of rows remaining after dropping missing Price: 45


## Remove Duplicate Records

In [35]:
# Check for duplicate rows

duplicates = df.duplicated().sum()
print("Number of duplicate rows found:", duplicates)

Number of duplicate rows found: 2


In [40]:
# Remove duplicate rows
df = df.drop_duplicates()
df

Unnamed: 0,Property_ID,Location,Property_Type,Price,Size_sqft,Bedrooms
0,P001,Mumbai,Villa,9000000.0,1136.0,2.0
1,P002,Chennai,Apartment,5400000.0,1828.0,3.0
3,P004,Delhi,Studio,7500000.0,1433.0,2.0
4,P005,Delhi,Apartment,8900000.0,2906.0,2.0
5,P006,Delhi,Apartment,3600000.0,900.0,3.0
6,P007,Mumbai,Plot,11200000.0,2672.0,3.0
8,P009,Mumbai,Villa,7300000.0,1600.0,3.0
9,P010,Pune,Apartment,4300000.0,1246.0,1.0
10,P011,Hyderabad,Villa,5400000.0,2825.0,3.0
11,P012,Chennai,Apartment,4700000.0,2766.0,3.0


In [39]:
# Confirm total rows after removing duplicates
print("Total rows after removing duplicates:", len(df))

Total rows after removing duplicates: 43


## Statistical Summary

In [41]:
# Calculate statistics for Price

price_mean = df['Price'].mean()
price_median = df['Price'].median()
price_min = df['Price'].min()
price_max = df['Price'].max()

In [42]:
# Calculate statistics for Size_sqft
size_mean = df['Size_sqft'].mean()
size_median = df['Size_sqft'].median()
size_min = df['Size_sqft'].min()
size_max = df['Size_sqft'].max()

In [45]:
# Display results
print("Statistical Summary:")
print(f"Price  -> Mean: {price_mean:.2f}, Median: {price_median:.2f}, Min: {price_min}, Max: {price_max}")
print(f"Size_sqft -> Mean: {size_mean:.2f}, Median: {size_median:.2f}, Min: {size_min}, Max: {size_max}")

Statistical Summary:/n
Price  -> Mean: 8223255.81, Median: 8200000.00, Min: 3500000.0, Max: 14900000.0
Size_sqft -> Mean: 1528.91, Median: 1433.00, Min: 701.0, Max: 2906.0


## DataFrame from Series

In [47]:
# Create the new DataFrame

city_price_df = pd.DataFrame({
    "City": ["Delhi", "Mumbai", "Bangalore"],
    "Avg_Price": [80, 120, 95]
})

# Display the new DataFrame
city_price_df

Unnamed: 0,City,Avg_Price
0,Delhi,80
1,Mumbai,120
2,Bangalore,95


## Filter Data

In [48]:
mumbai_properties = df[(df['Location'] == 'Mumbai') & (df['Bedrooms'] > 2)]
mumbai_properties

Unnamed: 0,Property_ID,Location,Property_Type,Price,Size_sqft,Bedrooms
6,P007,Mumbai,Plot,11200000.0,2672.0,3.0
8,P009,Mumbai,Villa,7300000.0,1600.0,3.0
39,P040,Mumbai,Villa,3800000.0,2893.0,3.0
47,P048,Mumbai,Villa,14900000.0,1616.0,4.0


## Conditional Filtering

In [50]:
high_value_properties = df[df['Price'] > 10000000]
high_value_properties

Unnamed: 0,Property_ID,Location,Property_Type,Price,Size_sqft,Bedrooms
6,P007,Mumbai,Plot,11200000.0,2672.0,3.0
13,P014,Hyderabad,Studio,14100000.0,1826.0,3.0
15,P016,Chennai,Villa,13900000.0,1150.0,3.0
19,P020,Chennai,Plot,13800000.0,1433.0,3.0
22,P023,Pune,Villa,14500000.0,808.0,3.0
25,P026,Chennai,Plot,13000000.0,1433.0,4.0
27,P028,Pune,Studio,12300000.0,1672.0,2.0
28,P029,Bangalore,Apartment,10100000.0,1433.0,4.0
40,P041,Chennai,Apartment,11600000.0,2264.0,2.0
46,P047,Chennai,Studio,10200000.0,2286.0,2.0


## Merge DataFrames

In [51]:
# Merge main DataFrame with city average DataFrame
merged_df = pd.merge(df, city_price_df, left_on='Location', right_on='City', how='left')
merged_df

Unnamed: 0,Property_ID,Location,Property_Type,Price,Size_sqft,Bedrooms,City,Avg_Price
0,P001,Mumbai,Villa,9000000.0,1136.0,2.0,Mumbai,120.0
1,P002,Chennai,Apartment,5400000.0,1828.0,3.0,,
2,P004,Delhi,Studio,7500000.0,1433.0,2.0,Delhi,80.0
3,P005,Delhi,Apartment,8900000.0,2906.0,2.0,Delhi,80.0
4,P006,Delhi,Apartment,3600000.0,900.0,3.0,Delhi,80.0
5,P007,Mumbai,Plot,11200000.0,2672.0,3.0,Mumbai,120.0
6,P009,Mumbai,Villa,7300000.0,1600.0,3.0,Mumbai,120.0
7,P010,Pune,Apartment,4300000.0,1246.0,1.0,,
8,P011,Hyderabad,Villa,5400000.0,2825.0,3.0,,
9,P012,Chennai,Apartment,4700000.0,2766.0,3.0,,


In [52]:
# Compare actual Price with Avg_Price
merged_df['Price_vs_Avg'] = merged_df['Price'] / 100000  # Convert actual price to lakhs for fair comparison
merged_df['Difference_from_Avg'] = merged_df['Price_vs_Avg'] - merged_df['Avg_Price']

In [55]:
merged_df['Price_vs_Avg'].head()

0    90.0
1    54.0
2    75.0
3    89.0
4    36.0
Name: Price_vs_Avg, dtype: float64

In [56]:
merged_df['Difference_from_Avg'].head()

0   -30.0
1     NaN
2    -5.0
3     9.0
4   -44.0
Name: Difference_from_Avg, dtype: float64

In [57]:
# Display comparison columns

print(merged_df[['Property_ID', 'Location', 'Price_vs_Avg', 'Avg_Price', 'Difference_from_Avg']].head())

  Property_ID Location  Price_vs_Avg  Avg_Price  Difference_from_Avg
0        P001   Mumbai          90.0      120.0                -30.0
1        P002  Chennai          54.0        NaN                  NaN
2        P004    Delhi          75.0       80.0                 -5.0
3        P005    Delhi          89.0       80.0                  9.0
4        P006    Delhi          36.0       80.0                -44.0


## 

## Group Analysis

In [58]:
avg_price_by_type = df.groupby('Property_Type')['Price'].mean().reset_index()
avg_price_by_type

Unnamed: 0,Property_Type,Price
0,Apartment,6807143.0
1,Plot,11240000.0
2,Studio,9188889.0
3,Villa,7960000.0


## Sorting

In [59]:
# Sort the DataFrame by Price in descending order
sorted_df = df.sort_values(by='Price', ascending=False)
sorted_df.head()

Unnamed: 0,Property_ID,Location,Property_Type,Price,Size_sqft,Bedrooms
47,P048,Mumbai,Villa,14900000.0,1616.0,4.0
22,P023,Pune,Villa,14500000.0,808.0,3.0
13,P014,Hyderabad,Studio,14100000.0,1826.0,3.0
15,P016,Chennai,Villa,13900000.0,1150.0,3.0
19,P020,Chennai,Plot,13800000.0,1433.0,3.0
