In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

In [3]:
# Load the training data; the path is relative to the notebook's working directory.
dataset = pd.read_csv('train_HP.csv')

In [4]:
# Preview the first rows of the freshly loaded frame.
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
def getNanColNames(df):
    """Return the labels of the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Column labels, in ``df.columns`` order, whose null count is
        greater than zero. Empty when no column has missing values.
    """
    return [name for name in df.columns if df[name].isnull().sum() > 0]

In [7]:
# List the columns of the loaded dataset that have at least one missing value.
getNanColNames(dataset)

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [9]:
def getCategoryVars(df):
    """Return the labels of the columns of ``df`` stored with ``object`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Column labels, in ``df.columns`` order, whose dtype compares equal
        to ``'object'``.
    """
    return [name for name in df.columns if df[name].dtype == 'object']
# Collect the object-dtype column names of the dataset and display them.
cat_cols = getCategoryVars(dataset)
cat_cols

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

## Variables Categóricas

## Frequency Encoding 

In [11]:
def FreqEncoding(df, map_type='freq'):
    """Replace each object-dtype column of ``df`` by how often its values occur.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns are overwritten with their encoding.
    map_type : str, default 'freq'
        ``'freq'`` maps every category to its occurrence count (the counts
        are divided by 1). Any other value divides the counts by the
        column length, giving a proportion of all rows.

    Notes
    -----
    ``value_counts`` leaves missing values out of the mapping, so they
    remain missing after the encoding.
    """
    for name in df.columns:
        column = df[name]
        if column.dtype != 'object':
            continue
        divisor = 1 if map_type == 'freq' else len(column)
        encoding = (column.value_counts() / divisor).to_dict()
        df[name] = column.map(encoding)
            
# Encode every object column of `dataset` with the default map_type ('freq').
# NOTE(review): this overwrites `dataset` in place, so the original category
# labels are gone from here on; reload the CSV to get them back.
FreqEncoding(dataset)
dataset

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,1151.0,65.0,8450,1454.0,,925.0,1311.0,1459.0,...,0,,,,0,2,2008,1267.0,1198.0,208500
1,2,20,1151.0,80.0,9600,1454.0,,925.0,1311.0,1459.0,...,0,,,,0,5,2007,1267.0,1198.0,181500
2,3,60,1151.0,68.0,11250,1454.0,,484.0,1311.0,1459.0,...,0,,,,0,9,2008,1267.0,1198.0,223500
3,4,70,1151.0,60.0,9550,1454.0,,484.0,1311.0,1459.0,...,0,,,,0,2,2006,1267.0,101.0,140000
4,5,60,1151.0,84.0,14260,1454.0,,484.0,1311.0,1459.0,...,0,,,,0,12,2008,1267.0,1198.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,1151.0,62.0,7917,1454.0,,925.0,1311.0,1459.0,...,0,,,,0,8,2007,1267.0,1198.0,175000
1456,1457,20,1151.0,85.0,13175,1454.0,,925.0,1311.0,1459.0,...,0,,157.0,,0,2,2010,1267.0,1198.0,210000
1457,1458,70,1151.0,66.0,9042,1454.0,,925.0,1311.0,1459.0,...,0,,59.0,49.0,2500,5,2010,1267.0,1198.0,266500
1458,1459,20,1151.0,68.0,9717,1454.0,,925.0,1311.0,1459.0,...,0,,,,0,4,2010,1267.0,1198.0,142125


## Variables Numéricas

In [12]:
def getContinuesCols(df, unique_threshold=30):
    """Return the non-object columns of ``df`` with many distinct values.

    A column is reported when its dtype is not ``object`` and it holds
    strictly more than ``unique_threshold`` distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    unique_threshold : int, default 30
        Number of distinct values a column must exceed to be reported.
        The default keeps the cut-off that was previously hard-coded.

    Returns
    -------
    list
        Column labels, in ``df.columns`` order, that satisfy both conditions.
    """
    numeric_continues_vars = []
    for col in df.columns:
        column = df[col]
        # Test the dtype first so object columns never pay for a distinct-value scan.
        if column.dtype == 'object':
            continue
        # unique() keeps NaN as a value of its own, so a column with missing
        # entries counts one extra distinct value.
        if len(column.unique()) > unique_threshold:
            numeric_continues_vars.append(col)
    return numeric_continues_vars
# Collect the non-object columns with more than 30 distinct values and display them.
# NOTE(review): 'Id' appears in the result; it looks like a row identifier
# rather than a feature - confirm whether it should be excluded.
numeric_cont_vars = getContinuesCols(dataset)
numeric_cont_vars

['Id',
 'LotFrontage',
 'LotArea',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'GarageYrBlt',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 'ScreenPorch',
 'SalePrice']

In [29]:
# Take an explicit copy of the two columns of interest: new columns are
# assigned onto `dataset_temp` in later cells, and the copy guarantees those
# writes neither touch `dataset` nor raise a chained-assignment warning.
dataset_temp = dataset.loc[:, ['LotArea', 'SalePrice']].copy()
dataset_temp.head()

Unnamed: 0,LotArea,SalePrice
0,8450,208500
1,9600,181500
2,11250,223500
3,9550,140000
4,14260,250000


In [35]:
# Natural-log transform of LotArea, read from `dataset` and stored on `dataset_temp`.
dataset_temp['LotArea_log'] = np.log(dataset['LotArea'])


In [36]:
# Reciprocal transform of LotArea.
dataset_temp['LotArea_inv'] = 1 / dataset['LotArea']

In [37]:
# Squared transform of LotArea.
dataset_temp['LotArea_quadratic'] = dataset['LotArea'] ** 2

In [38]:
# Correlation coefficient between the squared LotArea and SalePrice
# (the off-diagonal entry of the 2x2 matrix returned by np.corrcoef).
np.corrcoef(dataset_temp['LotArea_quadratic'], dataset_temp['SalePrice'])[0, 1]

0.1144684470003984

In [39]:
# Box-Cox transform of LotArea; the second value returned by stats.boxcox is
# kept in `lambdaX`, which no other cell visible here reads.
# NOTE(review): scipy's boxcox expects strictly positive input - confirm
# LotArea has no zero or negative entries.
dataset_temp['LotArea_boxCox'], lambdaX = stats.boxcox(dataset_temp['LotArea'])

In [40]:
# Write the transformed columns to CSV without the index column.
# NOTE(review): the destination is an absolute, user-specific Windows path,
# so this cell fails on any machine without that directory; consider a path
# relative to the notebook, as is done for the input CSV.
dataset_temp.to_csv('C:/Users/Klopez/Desktop/Statistical Learning/tarea_4.csv',index=False)