## Setup

### Notebook Settings

In [4]:
# Increase width of cells
from IPython.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))

# set working directory
import os
curr_dir = os.getcwd()
os.chdir(curr_dir)

### Package Imports

In [89]:
# package imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# data cleaning / processing
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# feature selection
from sklearn.feature_selection import VarianceThreshold
from feature_engine.selection import DropDuplicateFeatures, DropCorrelatedFeatures, SmartCorrelatedSelection
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# statistics
import scipy.stats as stats
import math as math

## Load Data

In [32]:
# Load data
data_url = 'https://storage.googleapis.com/kagglesdsdata/datasets/545987/1020485/train.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220426%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220426T225446Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=5950d93d3465eb8acb0f571cb2ece3289c4d61300451986a0b252ef2f7a4df9aadd4a28b9b216270a367b7e3a40737484990e7a8b40c1012eb6e32144a731364b2ba5d92629416ecb79677750d9c6a1c068a666bd7121748678664884110da47524765875a075d18a6b2b915b623034d9e9c2efb303f45287ab8bbb207a6403b128dbfc671a2475f39edd0d2086bd1310010cf30f0060f09df0f014c53860024495a58dd06979c3e123990c7853dcebb2b6e80749c17420cc5413babc3c07497116ff56065e673e16c4293c46939e03126243e1b6490a5c374ae5399fe75971362abe56b7cbc7385a10958249a37ba4287a05f5039e0123f59b0df7692088fac'
df = pd.read_csv(data_url).rename(columns=lambda x: x.replace(" ", "_").lower())
df.shape

(2197, 82)

In [8]:
# save df
df.to_pickle('../data/aimes_train_data.pkl')

In [28]:
# make a copy
df_copy = df.copy()

In [9]:
# view head
df.head()

Unnamed: 0,order,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,saleprice
0,534,531363010,20,RL,80.0,9605,Pave,,Reg,Lvl,...,0,,,,0,4,2009,WD,Normal,159000
1,803,906203120,20,RL,90.0,14684,Pave,,IR1,Lvl,...,0,,,,0,6,2009,WD,Normal,271900
2,956,916176030,20,RL,,14375,Pave,,IR1,Lvl,...,0,,,,0,1,2009,COD,Abnorml,137500
3,460,528180130,120,RL,48.0,6472,Pave,,Reg,Lvl,...,0,,,,0,4,2009,WD,Normal,248500
4,487,528290030,80,RL,61.0,9734,Pave,,IR1,Lvl,...,0,,,,0,5,2009,WD,Normal,167000


## Data Inspection

In [10]:
# info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197 entries, 0 to 2196
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   order            2197 non-null   int64  
 1   pid              2197 non-null   int64  
 2   ms_subclass      2197 non-null   int64  
 3   ms_zoning        2197 non-null   object 
 4   lot_frontage     1835 non-null   float64
 5   lot_area         2197 non-null   int64  
 6   street           2197 non-null   object 
 7   alley            143 non-null    object 
 8   lot_shape        2197 non-null   object 
 9   land_contour     2197 non-null   object 
 10  utilities        2197 non-null   object 
 11  lot_config       2197 non-null   object 
 12  land_slope       2197 non-null   object 
 13  neighborhood     2197 non-null   object 
 14  condition_1      2197 non-null   object 
 15  condition_2      2197 non-null   object 
 16  bldg_type        2197 non-null   object 
 17  house_style   

In [12]:
# count features by type
print('No of Categorical Predictors: ', len(df.drop('saleprice', axis=1).select_dtypes(include='object').columns))
print('No of Numeric Predictors: ', len(df.drop('saleprice', axis=1).select_dtypes(exclude='object').columns))

No of Categorical Predictors:  43
No of Numeric Predictors:  38


In [13]:
# check for missing values
nulls = []
for feature in df.columns:
    if df[feature].isnull().sum() > 0:
        nulls.append(feature)

features_with_nulls_df = df[nulls].isnull().sum() / np.float(len(df))
features_with_nulls_df

lot_frontage      0.164770
alley             0.934911
mas_vnr_type      0.010014
mas_vnr_area      0.010014
bsmt_qual         0.030496
bsmt_cond         0.030496
bsmt_exposure     0.031406
bsmtfin_type_1    0.030496
bsmtfin_sf_1      0.000455
bsmtfin_type_2    0.030951
bsmtfin_sf_2      0.000455
bsmt_unf_sf       0.000455
total_bsmt_sf     0.000455
electrical        0.000455
bsmt_full_bath    0.000455
bsmt_half_bath    0.000455
fireplace_qu      0.485207
garage_type       0.054620
garage_yr_blt     0.055530
garage_finish     0.055530
garage_cars       0.000455
garage_area       0.000455
garage_qual       0.055530
garage_cond       0.055530
pool_qc           0.994538
fence             0.809285
misc_feature      0.963587
dtype: float64

In [33]:
# list of features with null proportion of null values >= 0.2
features_to_drop_names = features_with_nulls_df[features_with_nulls_df >= 0.2].index.tolist()
features_to_drop_names.extend(['order', 'pid'])
features_to_drop_names

['alley', 'fireplace_qu', 'pool_qc', 'fence', 'misc_feature', 'order', 'pid']

In [34]:
# drop unwanted features
df.drop(features_to_drop_names, axis=1, inplace=True)
df.shape

(2197, 75)

## Split Data

In [35]:
# get x and y features
X = df.drop('saleprice', axis=1)
y = df['saleprice']

In [36]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
X_train.shape, X_test.shape

((1537, 74), (660, 74))

In [37]:
# make copy of X_train and X_test
X_train_original = X_train.copy()
X_test_original = X_test.copy()

## Data Cleaning

### Impute Missing Values

In [38]:
# split train data into numeric and categorical features
num_df = X_train.select_dtypes(exclude='object')
cat_df = X_train.select_dtypes(include='object')

In [39]:
# numeric featues: impute missing values with median
for feature in num_df.columns:
    num_df[feature] = num_df[feature].fillna(num_df[feature].median())

In [40]:
# categorical featues: impute missing with model
for feature in cat_df.columns:
    cat_df[feature] = cat_df[feature].fillna(cat_df[feature].mode()[0])

## Feature Selection 1

### Drop Constants & Quasi - Contstants

In [41]:
# step 1: encode categorical variables
oe = OrdinalEncoder(handle_unknown='error')

# fit oe to categorical data
oe.fit(cat_df)

# fit categorical data
cat_df_enc = oe.transform(cat_df)

# convert to dataframe
cat_df_enc = pd.DataFrame(cat_df_enc, columns=cat_df.columns)
cat_df_enc = cat_df_enc.set_index(cat_df.index) 

In [42]:
# constants and quasi constants for numeric features
vt = VarianceThreshold(threshold=0.01)
vt.fit(num_df)
num_constants = num_df.columns[~vt.get_support()].tolist()
num_constants

[]

There are no numeric features that are constants or quasi-constants based on a 0.01 variance threshold.

In [43]:
# constants and quasi constants for numeric featues
vt.fit(cat_df_enc)
cat_constants = cat_df_enc.columns[~vt.get_support()].tolist()
cat_constants

['street', 'utilities']

In [44]:
# drop constants & quasi constants
X_train.drop(cat_constants, axis=1, inplace=True)
X_test.drop(cat_constants, axis=1, inplace=True)
X_train.shape, X_test.shape

((1537, 72), (660, 72))

In [52]:
# make copy of train and test set
X_train_1 = X_train.copy()
X_test_1 = X_test.copy()
X_train_1.shape, X_test_1.shape

((1537, 72), (660, 72))

### Drop Correlated Features (Numeric Features - Brute Force Method)

In [62]:
# set up selector
num_corr_sel = DropCorrelatedFeatures(threshold=0.80, method='pearson', missing_values='ignore')

# fit selector
num_corr_sel.fit(num_df)

# view correlated features
num_corr_sel.correlated_feature_sets_

[{'1st_flr_sf', 'total_bsmt_sf'},
 {'gr_liv_area', 'totrms_abvgrd'},
 {'garage_area', 'garage_cars'}]

In [67]:
# view dropped correlated features
num_corr_features_to_drop_names = list(num_corr_sel.features_to_drop_)
num_corr_features_to_drop_names

['1st_flr_sf', 'garage_area', 'totrms_abvgrd']

In [68]:
# drop correlated features 
X_train_2 = X_train_1.drop(num_corr_features_to_drop_names, axis=1)
X_test_2 = X_test_1.drop(num_corr_features_to_drop_names, axis=1)
X_train_2.shape, X_test_2.shape

((1537, 69), (660, 69))

### Categorical Variables to Drop TTest - 2 Levels

In [77]:
# get a list of categorical features with 2 levels

cat_featues_2level = []
for feature in cat_df.columns:
    if cat_df[feature].nunique() == 2:
        cat_featues_2level.append(feature)    
        print(feature, ':', cat_df[feature].nunique())

street : 2
central_air : 2


In [98]:
# perform ttest
feat    = []
stat    = []
ttest_p_value = []

for feature in cat_featues_2level:
    x = cat_df[[feature]].assign(sale_price = y_train)
    l1 = x[feature].unique()[0]
    l2 = x[feature].unique()[1]
    g1 = x[cat_df[feature]==l1].iloc[:, 1].values
    g2 = x[cat_df[feature]==l2].iloc[:, 1].values
    res = stats.ttest_ind(a=g1, b=g2, equal_var=False)
    feat.append(feature)
    stat.append(res[0])
    ttest_p_value.append(res[1])

In [115]:
ttest_res = pd.DataFrame({'feature':feat, 'stat':stat, 'p_value':ttest_p_value}).assign(sig = lambda x: np.where(x['p_value'] < 0.05, 'yes', 'no'))
ttest_res

Unnamed: 0,feature,stat,p_value,sig
0,street,5.27204,0.0006774355,yes
1,central_air,22.042847,1.4594910000000001e-55,yes


T Test for categorical features with 2 levels shows that mean of the 2 levels are not equal to 0. We reject the null hypothesis, both features are significant in predicting the outcome. 

In [112]:
ttest_res

Unnamed: 0,feature,stat,p_value,sig
0,street,5.27204,0.0006774355,yes
1,central_air,22.042847,1.4594910000000001e-55,yes


In [106]:
ttest_p_value < 0.05

TypeError: '<' not supported between instances of 'list' and 'float'

In [186]:
data = df[['ms_zoning', 'saleprice']]

In [124]:
for level in z['central_air'].unique():
    c = z[z['central_air']==level].to_dict()

In [228]:
for level in data.groupby('central_air'):
    samples = [central_air[1] for central_air in level[1].groupby('central_air')['saleprice']]
    f_val, p_val = stats.f_oneway(*samples)
    print('Name: {}, F value: {:.3f}, p value: {:.3f}'.format(level[0], f_val, p_val))

KeyError: 'central_air'

In [153]:
data.groupby('central_air').mean('saleprice')

Unnamed: 0_level_0,saleprice
central_air,Unnamed: 1_level_1
N,99737.932886
Y,188389.155273


In [227]:
data.groupby('ms_zoning')['saleprice']

IndexError: Column(s) saleprice already selected

In [126]:
z = pd.DataFrame({'Names': np.random.randint(1, 10, 1000), 
                   'value': np.random.randn(1000), 
                   'condition': np.random.choice(['NON', 'YES', 'RE'], 1000)})

In [146]:
data.head(2)

Unnamed: 0,central_air,saleprice
0,Y,159000
1,Y,271900


In [229]:
for name_group in z.groupby('Names'):
    samples = [condition[1] for condition in name_group[1].groupby('condition')['value']]

In [230]:
print(len(samples))

3


In [147]:
z

Unnamed: 0,Names,value,condition
0,9,-0.677821,NON
1,5,0.572780,NON
2,5,0.491275,YES
3,7,-0.886643,RE
4,6,-0.661630,NON
...,...,...,...
995,6,-0.298533,YES
996,2,0.738579,RE
997,2,0.248714,RE
998,5,-0.591187,RE


In [233]:
data.head(3)

Unnamed: 0,ms_zoning,saleprice
0,RL,159000
1,RL,271900
2,RL,137500


In [232]:
data.ms_zoning.unique()

array(['RL', 'RM', 'FV', 'RH', 'C (all)', 'I (all)', 'A (agr)'],
      dtype=object)

In [231]:
for name_group in data.groupby('ms_zoning'):
    s = []

('A (agr)',      ms_zoning  saleprice
989    A (agr)      13100
1789   A (agr)      81500)
('C (all)',      ms_zoning  saleprice
180    C (all)      78000
256    C (all)     119600
306    C (all)      80900
441    C (all)      65000
580    C (all)     102776
585    C (all)      34900
775    C (all)      89000
917    C (all)     115000
1293   C (all)      58500
1396   C (all)      85000
1425   C (all)      50138
1438   C (all)      81000
1457   C (all)     133900
1501   C (all)      51689
1859   C (all)      93369
1910   C (all)      44000
2104   C (all)      35311
2140   C (all)     124900
2144   C (all)      64500)
('FV',      ms_zoning  saleprice
51          FV     146000
70          FV     185000
84          FV     252678
87          FV     225000
129         FV     260000
...        ...        ...
2117        FV     170000
2133        FV     227875
2146        FV     168500
2150        FV     210000
2156        FV     176400

[113 rows x 2 columns])
('I (all)',     ms_zoning  salep

In [234]:
for i in 

(1,      Names     value condition
14       1  0.639113       NON
25       1 -0.965303       NON
26       1  1.159274       NON
30       1 -0.101353       NON
35       1  0.367069        RE
..     ...       ...       ...
948      1 -0.393525        RE
949      1  0.012219        RE
967      1  0.040480       YES
979      1 -0.386490       YES
999      1  1.272051       YES

[134 rows x 3 columns])
(2,      Names     value condition
7        2  0.335899       YES
11       2 -1.154009       NON
34       2  0.457625       YES
77       2 -0.443663        RE
94       2 -0.335391        RE
..     ...       ...       ...
950      2  1.760111       NON
963      2  0.709199        RE
988      2 -0.471353       YES
996      2  0.738579        RE
997      2  0.248714        RE

[96 rows x 3 columns])
(3,      Names     value condition
24       3 -0.229314       NON
38       3 -0.037697       YES
39       3 -0.359658       YES
44       3 -0.085213        RE
45       3  0.461825       YES
..     ..

In [235]:
z.Names.unique()

array([9, 5, 7, 6, 2, 4, 8, 1, 3])

In [236]:
samples

[0     -0.677821
 17     0.505216
 18     2.499657
 23     0.606519
 48    -0.480612
 51    -0.795110
 59    -0.157278
 67    -0.566520
 85     1.039364
 109    1.055146
 111    0.944766
 185    0.411331
 207   -1.036829
 228   -0.116404
 252   -0.116078
 305    0.754425
 382   -0.572061
 389   -1.077958
 434   -0.349825
 470   -1.126353
 500    0.501745
 551   -0.291591
 589   -1.480072
 602    0.411166
 605    0.586643
 631   -1.162619
 672   -1.361869
 679   -0.222387
 699   -0.600744
 729    2.215035
 733    1.794085
 773   -0.524503
 790    0.841649
 791    1.096891
 840   -2.118302
 868    0.688554
 894   -0.382277
 968    1.790932
 Name: value, dtype: float64,
 6     -0.493586
 52     1.690489
 56     0.368973
 68    -0.433754
 79    -0.012699
 92    -1.762657
 98     0.237963
 120   -0.161025
 145    0.894795
 204   -0.180786
 209    0.899694
 219    1.739933
 275   -1.560000
 276    0.505550
 298    0.962392
 354    0.684708
 364    0.044276
 373   -0.395700
 413    0.595108
 

In [238]:
z[z['Names'] == 9]

Unnamed: 0,Names,value,condition
0,9,-0.677821,NON
6,9,-0.493586,RE
17,9,0.505216,NON
18,9,2.499657,NON
23,9,0.606519,NON
...,...,...,...
954,9,-2.716333,RE
968,9,1.790932,NON
969,9,-0.366249,YES
972,9,0.651489,YES


In [165]:
data

Unnamed: 0,central_air,saleprice
0,Y,159000
1,Y,271900
2,Y,137500
3,Y,248500
4,Y,167000
...,...,...
2192,Y,220000
2193,Y,160000
2194,Y,225000
2195,N,83000


In [None]:
for saleprice in 

In [166]:
data.iloc[:, 1]

0       159000
1       271900
2       137500
3       248500
4       167000
         ...  
2192    220000
2193    160000
2194    225000
2195     83000
2196    250000
Name: saleprice, Length: 2197, dtype: int64

In [172]:
data.set_index('central_air').loc['Y'].values

array([[159000],
       [271900],
       [137500],
       ...,
       [160000],
       [225000],
       [250000]])

In [184]:
ft = 'ms_zoning'

In [187]:
for level in data[ft].unique():
    levels = data[ft].nunique()
    dataframe = data.set_index(ft)
    

In [188]:
levels

7

In [189]:
dataframe

Unnamed: 0_level_0,saleprice
ms_zoning,Unnamed: 1_level_1
RL,159000
RL,271900
RL,137500
RL,248500
RL,167000
...,...
RL,220000
RH,160000
RL,225000
RL,83000


In [198]:
fvalue, pvalue = stats.f_oneway([1,2,3], [4,5,6])

In [199]:
fvalue

13.5

In [197]:
my_list = [[1,2,3], [4,5,6]]

In [200]:
for i in my_list:
    print(i)

[1, 2, 3]
[4, 5, 6]


In [218]:
for i in range(len(my_list)):
    fvalue, pvalue = stats.f_oneway(my_list[i])

ValueError: zero-dimensional arrays cannot be concatenated

In [221]:
my_list[0] + ',' + my_list[1]

TypeError: can only concatenate list (not "str") to list

In [205]:
for i in range(len(my_list)):
    print(i)

0
1


In [204]:
len(my_list)

2