### Importamos las librerias que usaremos en el codigo

In [1]:
# Import the libraries to confirm they are installed 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

print("Workshop is clean. Tools are sharp. Ready to work.")


Workshop is clean. Tools are sharp. Ready to work.


## Primeros pasos
- Convertir a data frame y analizar los datos

In [2]:
# Define the path to the dataset
# Using a variable makes the code cleaner and easier to modify later
file_path = 'data/housing.csv'

# Load the dataset into a pandas DataFrame
# The variable 'df' (short for DataFrame) is a standard convention
housing_df = pd.read_csv(file_path)

# --- INITIAL INSPECTION ---

# 1. Look at the first 5 rows to understand the columns and data format
print("--- First 5 Rows (head) ---")
print(housing_df.head())

# 2. Get a concise summary of the DataFrame
# This shows column data types, non-null values, and memory usage. It's our best tool for spotting missing data.
print("\n--- DataFrame Info ---")
housing_df.info()

# 3. Generate descriptive statistics for numerical columns
# This includes count, mean, standard deviation, min, max, and quartiles. Great for spotting outliers.
print("\n--- Descriptive Statistics (describe) ---")
print(housing_df.describe())

--- First 5 Rows (head) ---
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

--- DataFrame Info ---
<class 'pandas.core.fra

## Procedemos a la limpieza de datos, HANDLING MISSING VALUES



In [3]:

# 1. Calculate the median of the 'total_bedrooms' column.
# We choose the median because it is robust to outliers, which we suspect exist in this column.
median_total_bedrooms = housing_df['total_bedrooms'].median()
print(f"The calculated median for total_bedrooms is: {median_total_bedrooms}")

# 2. Impute (fill) the missing values in the 'total_bedrooms' column with the median.
# The 'inplace=True' argument modifies the DataFrame directly, without needing to reassign it.
# e.g., housing_df = housing_df.fillna(...)
housing_df['total_bedrooms'].fillna(median_total_bedrooms, inplace=True)

# --- VERIFICATION ---

# 3. Run .info() again to confirm that 'total_bedrooms' now has no missing values.
# The count for total_bedrooms should now match the other columns (20640 entries).
print("\n--- DataFrame Info After Imputation ---")
housing_df.info()

The calculated median for total_bedrooms is: 435.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  housing_df['total_bedrooms'].fillna(median_total_bedrooms, inplace=True)



--- DataFrame Info After Imputation ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
