In [41]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, VotingRegressor, AdaBoostRegressor
from sklearn.model_selection import LearningCurveDisplay, ShuffleSplit
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
import pingouin as pg

In [38]:
house_train_EDA=pd.read_csv('BBDD\\House_prices-Advanced_Regression_Techniques\\train.csv')
mssubclass_dict = {
    20: 'A',30: 'B',40: 'C',45: 'D',50: 'E',60: 'F',70: 'G',75: 'H',80: 'I',85: 'J',90: 'K',120: 'L',150: 'M',160: 'N',180: 'O',190: 'P'
}
house_train_EDA['MSSubClass']=house_train_EDA['MSSubClass'].map(mssubclass_dict)
house_train_EDA.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,F,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,A,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,F,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,G,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,F,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### EDA

In this project we have to work with a dataset from KAGGLE called ***House Prices - Advanced Regression Techniques***.
This data set has 80 features for each house and the selling price. In this first part of the project we will study how related are the different features with the final price of the house.

-For the numerical features, we will obtain the mutual information between the feature and the target variable.
-For categorical features we will use the Chi-square test or ANOVA test to determine if there is statistically significant relationship between each categorical feature and the final price.

Each feature will be analyzed after removing the NaN values. Once we have a first idea of how related is each feature with the final price, we will study wether to fill or to remove the NaN values of the dataset. We will consider that a feature is strongly related with the target variable if the p-value of the test is p<0.05.

In [39]:
house_train_EDA.drop(['Id'], axis=1, inplace=True) 

# Numerical features:
numerical_columns=house_train_EDA.select_dtypes(include=['number']).columns
print("Numerical columns:", numerical_columns,'  ', len(numerical_columns))

# Categorical features
categorical_columns = house_train_EDA.select_dtypes(include=['object', 'category']).columns
print("Categorical columns:", categorical_columns,'  ', len(categorical_columns))
print('total length:',len(numerical_columns)+len(categorical_columns))

Numerical columns: Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')    36
Categorical columns: Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinTyp

In [None]:
numerical_df =house_train_EDA.select_dtypes(include=['number'])
categorical_df = house_train_EDA.select_dtypes(include=['object', 'category', 'bool']) 
categorical_df['SalePrice']=house_train_EDA['SalePrice']

def compute_mutual_information_numerical_features(df, target_column):
    # Separate the target variable
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Initialize an empty dictionary to store mutual information scores
    mi_scores = {}

    # Loop over each feature and compute mutual information
    for column in X.columns:
            mi_score = mutual_info_regression(X[[column]], y)
            mi_scores[column] = mi_score[0]

    # Convert the mutual information scores to a pandas DataFrame for easier viewing
    mi_df = pd.DataFrame(list(mi_scores.items()), columns=['Feature', 'Mutual Information'])
    mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)
    
    return mi_df

def compute_ANOVA_test_categorical_features(df, target_column):
    X= df.drop(columns=[target_column])
    y= df[target_column]
     # Initialize a dictionary to store the p-values
    pvalues = {}

    # Loop over each feature and compute the ANOVA p-value
    for column in X.columns:
        anova_result = pg.anova(data=df, dv=target_column, between=column)
        pvalues[column] = anova_result['p-unc'][0]

    # Convert the p-values dictionary into a DataFrame
    pvalues_df = pd.DataFrame(list(pvalues.items()), columns=['Feature', 'P-Value'])
    pvalues_df = pvalues_df.sort_values(by='P-Value', ascending=True)
    
    return pvalues_df

mutual_info=compute_mutual_information_numerical_features(numerical_df, 'SalePrice')
ANOVA_p_vals=compute_ANOVA_test_categorical_features(categorical_df, 'SalePrice')


mutual_info.head()

ValueError: Input X contains NaN.

In [None]:
ANOVA_p_vals.head()