In [2]:
import pandas as pd
import datetime

# Load the dataset from a CSV file into a DataFrame
# (the path is relative, so it is resolved against the current working directory)
df = pd.read_csv("../data/dataset.csv")

### Q1

# Used to test regular expressions
import re

# Convert the "availability" column to datetime.
# - Entries containing at least one digit are parsed as day-month ("%d-%b");
#   the year is missing in the dataset, so "-2023" is appended before parsing.
# - Entries without any digit are free-text statuses (like "Ready To Move" or
#   "Immediate Possession") and are mapped to the fixed date 01-01-2023.
def _to_availability_date(raw_value):
    """Return the datetime corresponding to one raw availability entry."""
    if re.search(r"\d+", raw_value) is None:
        # No digits at all: not a date, fall back to 01-01-2023
        return datetime.datetime.strptime("01-01-2023", "%d-%m-%Y")
    # Day-month entry: complete it with the year 2023 and parse it
    return datetime.datetime.strptime(f"{raw_value}-2023", "%d-%b-%Y")


df.availability = df.availability.map(_to_availability_date)

### Q2

# Remove rows with a missing (null) value in 'size'.
# The number extraction below ends with astype(int), which cannot convert a
# missing value, and such rows would not be usable to train a model anyway.
df["has_null"] = df["size"].isnull()

# We calculate the % of null values
print(round(100*df["has_null"].sum() / len(df), 2), "% Null values")

print(len(df), "rows before cleaning")

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original DataFrame (avoids SettingWithCopyWarning).
df = df[~df.has_null].copy()
print(len(df), "rows after cleaning")

# We strip the digits to keep only the text label in 'type_of_size'.
# regex=True is passed explicitly: the pattern is a regular expression, and
# pandas >= 2.0 treats the pattern as a literal string by default.
df["type_of_size"] = df["size"].str.replace(r"\d+", "", regex=True).str.strip()

# We now keep only the first run of digits and store it as an integer in 'size'
df["size"] = df["size"].str.extract(r"(\d+)", expand=False).astype(int)

### Q3

# Parse 'total_sqft' as numbers; with errors='coerce', entries that cannot be
# converted are replaced with NaN instead of raising an error
sqft_as_number = pd.to_numeric(df['total_sqft'], errors='coerce')
df['total_sqft'] = sqft_as_number

# # Displaying converted values
# print("Converted values :\n", df.loc[~df['total_sqft'].isna(), 'total_sqft'])

# # Displaying invalid values replaced by NaN
# print("\nInvalid values replaced by NaN :\n", df.loc[df['total_sqft'].isna(), 'total_sqft'])

### Q4
# Print a summary of the cleaned DataFrame: index range, column names,
# non-null counts, dtypes and memory usage
df.info()

0.12 % Null values
13320 rows before cleaning
13304 rows after cleaning
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13304 entries, 0 to 13319
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   area_type     13304 non-null  object        
 1   availability  13304 non-null  datetime64[ns]
 2   location      13303 non-null  object        
 3   size          13304 non-null  int32         
 4   society       7805 non-null   object        
 5   total_sqft    13065 non-null  float64       
 6   bath          13247 non-null  float64       
 7   balcony       12711 non-null  float64       
 8   price         13304 non-null  float64       
 9   has_null      13304 non-null  bool          
 10  type_of_size  13304 non-null  object        
dtypes: bool(1), datetime64[ns](1), float64(4), int32(1), object(4)
memory usage: 1.1+ MB


  df['type_of_size'] = df['size'].str.replace('\d+','').str.strip()
