# Kaggle Data Exploration

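This notebook performs an exploratory analysis of the housing-listings dataset (`houses_alava.csv`) obtained from **Kaggle**: it loads the raw CSV, cleans it, splits it into rental ("alquiler") and remaining listings, and generates a profiling report for each part.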

## Objectives
- Analyze the structure and quality of Kaggle data
- Identify trends and patterns in key variables
- Detect potential data issues or inconsistencies
- Generate informative visualizations
- Propose necessary transformations for modeling


In [14]:
!pip3 install git+https://github.com/ydataai/ydata-profiling.git


Collecting git+https://github.com/ydataai/ydata-profiling.git
  Cloning https://github.com/ydataai/ydata-profiling.git to /private/var/folders/7w/8rfjhby15g962r8x7qf54rd00000gn/T/pip-req-build-dr8fzqkm
  Running command git clone --filter=blob:none --quiet https://github.com/ydataai/ydata-profiling.git /private/var/folders/7w/8rfjhby15g962r8x7qf54rd00000gn/T/pip-req-build-dr8fzqkm
  Resolved https://github.com/ydataai/ydata-profiling.git to commit c3ce66ca4cc9564e013f71cbb96ed7b3d8fa5c72
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting matplotlib<=3.10,>=3.5 (from ydata-profiling==0.0.dev0)
  Downloading matplotlib-3.10.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling==0.0.dev0)
  Downloading visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting numpy<2.2,>=1

In [1]:
# Environment Setup

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings

# Notebook-wide display settings
warnings.filterwarnings("ignore")  # silences every warning for the rest of the session
pd.options.display.float_format = '{:.2f}'.format  # render floats with two decimals
sns.set(style="whitegrid", palette="muted", font_scale=1.1)


In [5]:

def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Failures are reported on stdout instead of being raised, so callers
    must check the result for None.

    Args:
        file_path (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame or None: The parsed data, or None if reading failed.
    """
    try:
        frame = pd.read_csv(file_path)
        print("Data loaded successfully.")
    except Exception as err:
        print(f"Error loading the data: {err}")
        return None
    return frame

def clean_data(data):
    """
    Clean and preprocess the raw listings data.

    All work is done on a copy, so the DataFrame passed in is left unmodified
    and the function gives the same result when re-run on the same input.

    Steps, in order:
      1. 'bath_num' / 'room_num': the placeholders 'sin baños' and
         'sin habitación' become '0', then the columns are cast to float.
      2. 'garage': 1 where a value is present, 0 where it is missing.
      3. 'house_type': each distinct value is replaced by an integer code
         (codes follow order of first appearance).
      4. The columns 'ground_size', 'kitchen', 'unfurnished', 'loc_street'
         and 'ad_description' are dropped.
      5. Missing values in 'construct_date', 'm2_useful' and 'lift' are
         filled with the column median.
      6. 'condition', 'heating' and 'orientation' are one-hot encoded.

    Args:
        data (pd.DataFrame): Raw data to be cleaned; must contain every
            column named above.

    Returns:
        tuple: (cleaned DataFrame, dict mapping each original house_type
        value to its integer code). If any step raises, the error is printed
        and (None, None) is returned.
    """
    try:
        # Work on a copy: overwriting columns of the caller's frame would make
        # a second run behave differently (e.g. 'garage' would become all 1s).
        working = data.copy()

        # Replace non-numeric placeholders with 0 and convert to float
        working['bath_num'] = working['bath_num'].replace('sin baños', '0').astype(float)
        working['room_num'] = working['room_num'].replace('sin habitación', '0').astype(float)

        # Convert 'garage' column to binary (0 if empty, 1 if not)
        working['garage'] = working['garage'].notna().astype(int)

        # Identify unique values in 'house_type' and create a mapping to integers
        house_type_values = working['house_type'].unique()
        house_type_mapping = {value: idx for idx, value in enumerate(house_type_values)}
        working['house_type'] = working['house_type'].map(house_type_mapping)

        # Drop unnecessary columns
        data_cleaned = working.drop(
            columns=['ground_size', 'kitchen', 'unfurnished', 'loc_street', 'ad_description']
        )

        # Fill missing values with the column median. The result is assigned
        # back instead of calling fillna(inplace=True) on a column selection:
        # that form operates on a temporary object and does not update the
        # frame when pandas Copy-on-Write is active.
        for column in ('construct_date', 'm2_useful', 'lift'):
            data_cleaned[column] = data_cleaned[column].fillna(data_cleaned[column].median())

        # Use one-hot encoding for categorical columns
        categorical_columns = ['condition', 'heating', 'orientation']
        data_cleaned = pd.get_dummies(data_cleaned, columns=categorical_columns)

        print("Data cleaned successfully.")
        return data_cleaned, house_type_mapping
    except Exception as e:
        print(f"Error cleaning the data: {e}")
        return None, None

def split_data(data_cleaned, house_type_mapping):
    """
    Split the cleaned data into two datasets based on the 'house_type' column.

    A house type counts as 'alquiler' when its original label (a key of
    house_type_mapping) contains the text 'alquiler', case-insensitively.
    Keys that are not strings (for example the NaN key produced when
    'house_type' has missing values) are never treated as 'alquiler', so
    rows with such a code end up in the second dataset.

    Args:
        data_cleaned (pd.DataFrame): Cleaned data whose 'house_type' column
            holds the integer codes from house_type_mapping.
        house_type_mapping (dict): Mapping of house types to integer values.

    Returns:
        tuple: Two DataFrames, one for 'alquiler' and one for others.
        (None, None) is returned when no 'alquiler' type exists in the
        mapping or when an error occurs (a message is printed in both cases).
    """
    try:
        # Identify codes corresponding to any 'alquiler' types. The isinstance
        # guard keeps a non-string key from raising on .lower() and aborting
        # the whole split.
        alquiler_codes = [
            code for key, code in house_type_mapping.items()
            if isinstance(key, str) and 'alquiler' in key.lower()
        ]

        if not alquiler_codes:
            print("No 'alquiler' types found in house_type.")
            return None, None

        # Compute the membership mask once and use it for both halves
        is_alquiler = data_cleaned['house_type'].isin(alquiler_codes)
        alquiler_data = data_cleaned[is_alquiler]
        other_data = data_cleaned[~is_alquiler]
        print("Data split successfully.")
        return alquiler_data, other_data
    except Exception as e:
        print(f"Error splitting the data: {e}")
        return None, None

def save_data(data, output_file_path):
    """
    Write a DataFrame to a CSV file, without the index column.

    Any error raised while writing is caught and printed; nothing is
    returned in either case.

    Args:
        data (pd.DataFrame): Data to be written.
        output_file_path (str): Destination path of the CSV file.
    """
    try:
        data.to_csv(output_file_path, index=False)
        success_message = f"Data saved successfully to {output_file_path}."
        print(success_message)
    except Exception as err:
        print(f"Error saving the data: {err}")

def dict_to_dataframe(dictionary, df_name='Mapping'):
    """
    Turn a dictionary into a two-column pandas DataFrame.

    Each key/value pair becomes one row: keys go to the 'House_Type'
    column and values to the 'Code' column.

    Args:
        dictionary (dict): Dictionary to convert.
        df_name (str): Name given to the index of the resulting DataFrame.

    Returns:
        pd.DataFrame: One row per dictionary entry, in iteration order.
    """
    pairs = list(dictionary.items())
    return pd.DataFrame(pairs, columns=['House_Type', 'Code']).rename_axis(df_name)

def DataCleaningSaving(file_path, output_file_path_rent, output_file_path_sale, output_file_path_mapping):
    """
    Run the full pipeline: load the raw CSV, clean it, split it and save the results.

    The pipeline stops (returning None) as soon as one stage signals failure
    by returning None; this function prints nothing of its own in that case.
    On success it prints the first rows of both splits and writes three CSV
    files.

    Args:
        file_path (str): Path of the raw CSV file to load.
        output_file_path_rent (str): Destination for the 'alquiler' rows.
        output_file_path_sale (str): Destination for all remaining rows.
        output_file_path_mapping (str): Destination for the house-type/code table.
    """
    raw = load_data(file_path)
    if raw is None:
        return

    cleaned_data, house_type_mapping = clean_data(raw)
    if cleaned_data is None:
        return

    alquiler_data, other_data = split_data(cleaned_data, house_type_mapping)
    if alquiler_data is None or other_data is None:
        return

    # Preview both splits
    print("Alquiler Data:")
    print(alquiler_data.head())

    print("\nOther Data:")
    print(other_data.head())

    mapping_df = dict_to_dataframe(house_type_mapping, 'House_Type_Mapping')

    # Write the two splits and the mapping table, in that order
    outputs = (
        (alquiler_data, output_file_path_rent),
        (other_data, output_file_path_sale),
        (mapping_df, output_file_path_mapping),
    )
    for frame, destination in outputs:
        save_data(frame, destination)



In [7]:
# Raw input file
file_path = '../data/raw/data-kaggle/houses_alava.csv'

# Output files written by the pipeline
output_file_path_rent = '../data/processed/houses_alava_cleaned_rent.csv'
output_file_path_sale = '../data/processed/houses_alava_cleaned_sale.csv'
output_file_path_mapping = '../data/processed/houses_type_mapping.csv'

DataCleaningSaving(
    file_path=file_path,
    output_file_path_rent=output_file_path_rent,
    output_file_path_sale=output_file_path_sale,
    output_file_path_mapping=output_file_path_mapping,
)

Data loaded successfully.
Data cleaned successfully.
Data split successfully.
Alquiler Data:
                          ad_last_update  air_conditioner  balcony  bath_num  \
3627  Anuncio actualizado el 29 de marzo                0        0      1.00   
3628       más de 2 meses sin actualizar                0        1      1.00   
3629       más de 2 meses sin actualizar                0        0      2.00   
3630       más de 2 meses sin actualizar                0        0      2.00   
3631  Anuncio actualizado el 21 de marzo                0        0      1.00   

      built_in_wardrobe  chimney  construct_date energetic_certif  \
3627                  0        0         1965.00      no indicado   
3628                  0        0         1993.00              NaN   
3629                  0        0         1993.00      no indicado   
3630                  0        0         1993.00      no indicado   
3631                  0        0         1993.00      no indicado   

           

In [None]:
from ydata_profiling import ProfileReport

def generate_profile_report(data, report_title="Data Profile Report"):
    """
    Build a ydata_profiling report for a dataset and write it to an HTML file.

    The output file name is derived from the title: lower-cased, with every
    space replaced by an underscore and ".html" appended. No directory
    component is added to it.

    Any error raised along the way is caught and printed.

    Args:
        data (pd.DataFrame): The dataset to profile.
        report_title (str): Title for the profile report.

    Returns:
        None
    """
    try:
        report = ProfileReport(data, title=report_title)

        # e.g. "Rent Data Profile Report" -> "rent_data_profile_report.html"
        slug = "_".join(report_title.lower().split(" "))
        report_file_path = f"{slug}.html"
        report.to_file(report_file_path)

        print(f"Profile report generated and saved to {report_file_path}.")
    except Exception as err:
        print(f"Error generating profile report: {err}")


In [None]:
# Profile the two cleaned datasets. They are read from ../data/processed/, the
# location the cleaning pipeline writes them to; the bare file names used
# previously pointed at the notebook's working directory, where nothing saves them.
output_file_path_rent = '../data/processed/houses_alava_cleaned_rent.csv'
output_file_path_sale = '../data/processed/houses_alava_cleaned_sale.csv'

for csv_path, title in (
    (output_file_path_rent, "Rent Data Profile Report"),
    (output_file_path_sale, "Sales Data Profile Report"),
):
    profile_input = load_data(csv_path)
    # load_data prints the error and returns None when the file cannot be read;
    # skip the report in that case instead of profiling None.
    if profile_input is not None:
        generate_profile_report(profile_input, report_title=title)