Splitting the dataset into two subsets: apartments (`df_apartment`) and houses (`df_house`)
Explanation:
str.lower(): Ensures case insensitivity while filtering.
to_csv(): Saves each subset to a separate CSV file without the index column.
This creates two CSV files, `02_house_dataset.csv` and `01_apartment_dataset.csv`, each containing only the rows for its own property type.

In [2]:
import pandas as pd

# Load the original dataset
df = pd.read_csv('cleaned_data.csv')

# Output paths are named once so the files written and the message printed
# below can never disagree.
house_csv = '02_house_dataset.csv'
apartment_csv = '01_apartment_dataset.csv'

# Split the dataset into two separate datasets based on "Type_of_Property".
# Lower-casing once makes both comparisons case-insensitive.
property_type = df['Type_of_Property'].str.lower()
df_house = df[property_type == 'house']
df_apartment = df[property_type == 'apartment']

# Save the datasets into separate CSV files (without the index column)
df_house.to_csv(house_csv, index=False)
df_apartment.to_csv(apartment_csv, index=False)

print(f"Datasets split and saved as '{house_csv}' and '{apartment_csv}'.")


Datasets split and saved as 'house_dataset_full.csv' and 'apartment_dataset_full.csv'.


In [6]:
import pandas as pd


def clean_dataset_with_analysis(file_path, output_path, drop_threshold=0.5):
    """Report missing values in a CSV file, clean it, and save the result.

    Steps, in order:
      1. Print a per-column table of missing-value counts and percentages.
      2. Drop every column whose missing count exceeds
         ``drop_threshold * number_of_rows``.
      3. Drop rows that lack a value in 'Price', 'Type_of_Property' or
         'Locality'.
      4. Fill the remaining gaps: median for numeric columns, most frequent
         value for text columns. A column with no observed value left is
         kept as-is, since there is nothing to impute from.
      5. Write the cleaned frame to ``output_path`` without the index.

    Args:
        file_path: Path of the CSV file to read.
        output_path: Path the cleaned CSV is written to.
        drop_threshold: Fraction of rows (0-1) a column may be missing before
            it is dropped. Defaults to 0.5.

    Returns:
        None. Progress and results are reported with ``print``.

    Note:
        Any exception is caught and printed rather than propagated. Saving is
        the last step, so a failure in an earlier step means nothing is saved.
    """
    try:
        # Load the dataset
        df = pd.read_csv(file_path)

        # Analyze missing values
        print("Analyzing missing values...")
        missing_data = df.isnull().sum()
        missing_percentage = (missing_data / len(df)) * 100
        missing_info = pd.DataFrame({
            'Missing Values': missing_data,
            'Percentage': missing_percentage
        }).sort_values(by='Percentage', ascending=False)
        print(missing_info)

        # Drop columns with > drop_threshold missing data
        threshold = drop_threshold * len(df)
        columns_to_drop = missing_info[missing_info['Missing Values'] > threshold].index
        print(f"Dropping columns: {list(columns_to_drop)}")
        df = df.drop(columns=columns_to_drop)

        # Drop rows where critical columns have missing values.
        # A critical column may be absent from the file or may just have been
        # dropped as too sparse; say so explicitly instead of letting pandas
        # raise a KeyError whose message is only the column list.
        critical_columns = ['Price', 'Type_of_Property', 'Locality']
        absent = [col for col in critical_columns if col not in df.columns]
        if absent:
            raise ValueError(
                f"Critical columns not available for row filtering: {absent}"
            )
        df = df.dropna(subset=critical_columns)

        # Fill missing values, touching only columns that still have gaps.
        for col in df.columns:
            series = df[col]
            if not series.isna().any():
                continue

            if pd.api.types.is_bool_dtype(series):
                # Booleans count as numeric in pandas, but a median is not a
                # meaningful fill value for them.
                continue

            if pd.api.types.is_numeric_dtype(series):
                df[col] = series.fillna(series.median())  # Median for numerical columns
            elif pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
                # Covers plain object columns as well as dedicated string dtypes.
                modes = series.mode()
                # mode() is empty when every remaining value is missing (for
                # example after the critical-row drop removed the only rows
                # that had a value); there is nothing to fill with then.
                if not modes.empty:
                    df[col] = series.fillna(modes.iloc[0])  # Mode for categorical columns

        # Save the cleaned dataset
        df.to_csv(output_path, index=False)
        print(f"Cleaned dataset saved to {output_path}")

    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage:
# The input name matches the file written by the splitting cell
# ('01_apartment_dataset.csv'). The output name keeps its existing spelling
# because later steps may read it — TODO confirm before renaming.
clean_dataset_with_analysis('01_apartment_dataset.csv', '01_appartment_cleaned.csv') # for apartments
#clean_dataset_with_analysis('02_house_dataset.csv', '02_houses_cleaned.csv') # for houses



Analyzing missing values...
                           Missing Values  Percentage
Surface_area_plot_of_land           10966  100.000000
Surface_of_the_Land                 10231   93.297465
Garden_Area                         10231   93.297465
Garden                              10231   93.297465
Terrace_Area                         5145   46.917746
Number_of_Facades                    4423   40.333759
Terrace                              3042   27.740288
State_of_the_Building                1790   16.323181
Living_Area                           373    3.401423
Open_fire                               0    0.000000
Locality                                0    0.000000
Furnished                               0    0.000000
Fully_Equipped_Kitchen                  0    0.000000
Number_of_Rooms                         0    0.000000
Subtype_of_Property                     0    0.000000
Type_of_Property                        0    0.000000
Swimming_Pool                           0    0.000000
