## Let's analyze each column

- what are the unique values
- what is the value range
- how many empty values are there

In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
# NOTE: this import rebinds the name `round`, shadowing the built-in `round`
# for every cell that runs after this one.
from sigfig import round
# Show full cell contents in DataFrame displays instead of truncating long values.
pd.set_option('display.max_colwidth', None)
from IPython.display import display, HTML
# Inject CSS that forces elements with class "container" to 100% width
# (presumably to widen the classic Notebook page -- confirm for other frontends).
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# Load train.csv (path is relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv("train.csv")

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
# Show every column name in the loaded DataFrame.
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [9]:
def summarize(col):
    """Return a compact, printable summary of a single column.

    Low-cardinality columns (at most 20 distinct values, where all missing
    values together count as one) are summarized as a list of
    ``"value (count)"`` strings in ``value_counts()`` order, followed by
    ``"None (n)"`` when ``n`` values are missing.

    Any other column is summarized as a string containing the first three
    and last three distinct values (sorted as strings, so numbers appear in
    lexicographic order and missing values show up as ``'nan'``), the
    maximum and minimum of the non-missing values, and the null count.

    Args:
        col: pandas Series to summarize.

    Returns:
        list[str] for low-cardinality columns, otherwise str.
    """
    null_count = col.isna().sum()

    # Count distinct values with nunique() rather than len(set(col)):
    # iterating a float column yields a fresh NaN object per row and
    # NaN != NaN, so a set would count every missing value as distinct and
    # push low-cardinality columns with many NaNs into the wrong branch.
    if col.nunique(dropna=False) <= 20:
        counts = col.value_counts()
        output = [f"{val} ({count})" for val, count in counts.items()]
        if null_count:
            output.append(f"None ({null_count})")
        return output

    distinct = sorted(set(map(str, col)))

    # Take max/min over non-missing values only: comparisons with NaN are
    # always False, so builtin max()/min() on the raw column return NaN or a
    # real value depending on where the NaN happens to sit. The builtins are
    # kept (instead of Series.max/min) so the result is a plain Python scalar
    # with a clean repr inside the list.
    present = col.dropna()
    sample = distinct[:3] + ["...", max(present), min(present), "..."] + distinct[-3:]
    return f'{sample}, Null count: {null_count}/{len(col)}'

# Print one "<column name>: <summary>" line per column, in column order.
for column_name in data.columns:
    print(f"{column_name}: {summarize(data[column_name])}")

Id: ['1', '10', '100', '...', 1460, 1, '...', '997', '998', '999'], Null count: 0/1460
MSSubClass: ['20 (536)', '60 (299)', '50 (144)', '120 (87)', '30 (69)', '160 (63)', '70 (60)', '80 (58)', '90 (52)', '190 (30)', '85 (20)', '75 (16)', '45 (12)', '180 (10)', '40 (4)']
MSZoning: ['RL (1151)', 'RM (218)', 'FV (65)', 'RH (16)', 'C (all) (10)']
LotFrontage: ['100.0', '101.0', '102.0', '...', 313.0, 21.0, '...', '98.0', '99.0', 'nan'], Null count: 259/1460
LotArea: ['10000', '10004', '10005', '...', 215245, 1300, '...', '9967', '9986', '9991'], Null count: 0/1460
Street: ['Pave (1454)', 'Grvl (6)']
Alley: ['Grvl (50)', 'Pave (41)', 'None (1369)']
LotShape: ['Reg (925)', 'IR1 (484)', 'IR2 (41)', 'IR3 (10)']
LandContour: ['Lvl (1311)', 'Bnk (63)', 'HLS (50)', 'Low (36)']
Utilities: ['AllPub (1459)', 'NoSeWa (1)']
LotConfig: ['Inside (1052)', 'Corner (263)', 'CulDSac (94)', 'FR2 (47)', 'FR3 (4)']
LandSlope: ['Gtl (1382)', 'Mod (65)', 'Sev (13)']
Neighborhood: ['Blmngtn', 'Blueste', 'BrDale',

In [7]:
# Last five (value, count) pairs of the value counts for the column at index 3.
list(data[data.columns[3]].value_counts().items())[-5:]

[(182.0, 1), (160.0, 1), (152.0, 1), (153.0, 1), (46.0, 1)]

In [8]:
# Scratch check: zero-pad 7 to width 2; slicing to 7 characters leaves the
# two-character result unchanged.
format(7, "02")[:7]

'07'