# Linear Regression Model: Analysis of house prices


In [6]:
# IMPORT LIBS
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### LOAD THE DATASET
We first load the dataset and then take the following steps:
- `check null values`: this is important for knowing which columns won't give us enough information
- `handle columns with a high null-value rate`: if a column's null-value rate is 5% or higher, the column on its own won't give us enough information, so its null values are replaced with 0

In [7]:
# READ THE RAW DATA INTO A DATAFRAME
# (a relative path: it is resolved against the current working directory)
df = pd.read_csv(filepath_or_buffer="csv_files/01_original_data.csv")

In [13]:
# PERCENTAGE OF NULL VALUES PER COLUMN, LISTING ONLY COLUMNS THAT CONTAIN ANY NULLS
null_percent = df.isna().mean().mul(100)
null_columns = null_percent.loc[null_percent.gt(0)]
print(null_columns)

MasVnrArea      0.547945
BsmtQual        2.534247
BsmtCond        2.534247
BsmtExposure    2.602740
BsmtFinType1    2.534247
BsmtFinType2    2.602740
Electrical      0.068493
dtype: float64


In [12]:
# FOR COLUMNS WHOSE NULL RATE IS AT LEAST 5%, REPLACE THE NULLS WITH 0
# (the columns themselves are kept; only their missing entries change)
cols_to_fill = df.columns[df.isna().mean().ge(0.05)]
df[cols_to_fill] = df[cols_to_fill].fillna(value=0)

### CLEAN THE DATASET
We need to transform the data into a cleaner version; the final product must be a fully numerical dataset so that the model can understand it.
- `binary values`: yes/no to 1/0
- `one-hot encoding`: nominal values will be replaced by binary indicator columns using the one-hot encoding method
- `replace remaining nulls with median`: the nulls that are still left are replaced by the median of their column

In [14]:
# BINARY VALUES
# Encode Y/N columns as 1/0.
# Series.map() turns every value that is not a key of the mapping into NaN,
# and the later null-imputation cell would then silently overwrite it.
# A column is therefore encoded only when all of its non-null values are keys
# of binary_map. Otherwise it is left untouched (string categories are then
# picked up by the one-hot encoding cell) and a notice is printed.
# The same guard makes the cell safe to re-run: already-encoded 1/0 values
# are not mapped a second time.
binary_map = {'Y': 1, 'N': 0}
for binary_col in ['CentralAir', 'PavedDrive']:
    unexpected = set(df[binary_col].dropna().unique()) - set(binary_map)
    if unexpected:
        print(f"{binary_col}: not binary-encoded, found values outside Y/N: {sorted(unexpected, key=str)}")
        continue
    df[binary_col] = df[binary_col].map(binary_map)

In [15]:
# ONE-HOT ENCODING
# Replace every categorical column with indicator columns; drop_first=True
# removes the first level of each encoded column.
df = pd.get_dummies(df, drop_first=True)

# get_dummies may return boolean indicator columns (the default in recent
# pandas versions). Pick them out by dtype and cast them to 1/0 integers in
# one vectorised step instead of stringifying every single cell.
# bool_columns is kept because the imputation cell excludes these columns.
bool_columns = df.select_dtypes(include='bool').columns.tolist()
if bool_columns:
    # int64 matches the dtype names the imputation cell checks for
    df[bool_columns] = df[bool_columns].astype('int64')

In [16]:
# REPLACE REMAINING NULLS WITH THE MEDIAN OF THEIR COLUMN
# Only int64/float64 columns that are not one-hot indicator columns are imputed.
indicator_names = set(bool_columns)
numerical_columns = [
    name
    for name in df.columns
    if name not in indicator_names and df[name].dtype in ['int64', 'float64']
]
column_medians = df[numerical_columns].median()
df[numerical_columns] = df[numerical_columns].fillna(column_medians)

In [None]:
# VERIFY: preview the first five rows of the cleaned dataframe
print(df.head(n=5))

### CORRELATION MATRIX
We now have a lot of columns; let's see which ones are worth keeping.
- `100% data`: too much data, unclear conclusions
- `15% data`: very high correlation with price, can give us clear conclusions. 31 columns remaining

In [None]:
# OVERALL CORRELATION MATRIX ACROSS ALL COLUMNS
# corr_matrix is reused by the feature-selection cell further down.
corr_matrix = df.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(data=corr_matrix, annot=False, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# too many variables at once: nothing useful can be read from this plot

In [None]:
# KEEP THE 15% OF VARIABLES WITH THE HIGHEST ABSOLUTE CORRELATION TO SalePrice
# The fraction is a named constant so the cut-off is explicit and easy to tune.
# A fraction of 1 keeps every column, i.e. performs no filtering at all.
TOP_FEATURE_FRACTION = 0.15

# Rank all columns by |correlation with SalePrice|, strongest first.
# SalePrice has correlation 1 with itself, so it ranks at the top and stays
# in the selection.
corr_target = corr_matrix["SalePrice"].abs().sort_values(ascending=False)

# Always keep at least one column, even for a very small frame.
n_selected = max(1, int(len(corr_target) * TOP_FEATURE_FRACTION))
selected_features = corr_target.index[:n_selected].tolist()
df_filtered = df[selected_features]

plt.figure(figsize=(16, 14))
sns.heatmap(df_filtered.corr(), cmap='coolwarm', annot=False)
plt.title("Correlation Matrix of Selected Features")
plt.show()

In [None]:
# VERIFY: preview the first five rows of the selected features
print(df_filtered.head(n=5))

In [None]:
# WRITE THE SELECTED FEATURES TO DISK
# index=False leaves the dataframe's row index out of the file.
df_filtered.to_csv(path_or_buf="csv_files/02_filtered_data.csv", index=False)