In [20]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer as Imputer
from xgboost import XGBRegressor

In [44]:
df_train = pd.read_csv('train.csv', index_col='Id')
df_test = pd.read_csv('test.csv', index_col='Id')
combined_data_list = [df_train, df_test]
combined_data = pd.concat([df_train.drop(columns='SalePrice'), df_test])

In [47]:
def isnull_stats(df, isnull_text_addition):
    df = pd.DataFrame(df.isnull().sum())/df.shape[0] * 100
    df = df.reset_index()
    df.columns = ['feature', 'isnull_' + isnull_text_addition + '_%']
    df['isnull_' + isnull_text_addition + '_%'] = df['isnull_' + isnull_text_addition + '_%'].apply(lambda x: int(x*100) / 100)
    df = df[df['isnull_' + isnull_text_addition + '_%']!=0].sort_values('isnull_' + isnull_text_addition + '_%', ascending=False)
    return df

def isnull_train_test_stats(df_train, df_test):
    df_train_missing_values = isnull_stats(df=df_train, isnull_text_addition='train')
    df_test_missing_values = isnull_stats(df=df_test, isnull_text_addition='test')
    df_together_missing_values = df_train_missing_values.merge(df_test_missing_values, on='feature', how='outer').fillna(0)
    col_names = df_together_missing_values.columns
    df_together_missing_values['abs_diff_%'] = abs(df_together_missing_values[col_names[1]] - df_together_missing_values[col_names[2]])

    df_dtypes = df_train.dtypes.reset_index()
    df_dtypes.columns = ['feature', 'dtype']

    df_together_missing_values = df_together_missing_values.merge(df_dtypes, on='feature')

    return df_together_missing_values

df_missing_values = isnull_train_test_stats(df_train, df_test)
df_missing_values

Unnamed: 0,feature,isnull_train_%,isnull_test_%,abs_diff_%,dtype
0,PoolQC,99.52,99.79,0.27,object
1,MiscFeature,96.3,96.5,0.2,object
2,Alley,93.76,92.66,1.1,object
3,Fence,80.75,80.12,0.63,object
4,FireplaceQu,47.26,50.03,2.77,object
5,LotFrontage,17.73,15.55,2.18,float64
6,GarageType,5.54,5.2,0.34,object
7,GarageYrBlt,5.54,5.34,0.2,float64
8,GarageFinish,5.54,5.34,0.2,object
9,GarageQual,5.54,5.34,0.2,object


In [77]:
def value_names_not_in_missing_values_threshold(df_missing_values, combined_data, missing_val_threshold):
    values_to_drop_due_threshold = df_missing_values[df_missing_values['isnull_train_%'] >= missing_val_threshold*100]
    values_to_drop_due_threshold = list(values_to_drop_due_threshold.feature)
    print('drop due threshhold ' + str(missing_val_threshold) + ': ', values_to_drop_due_threshold)
    #! we can detect 1 if value is not missing or 0 in value is missing, smth like this
    col_names_to_keep = [x for x in combined_data.columns if x not in values_to_drop_due_threshold]
    return col_names_to_keep

list_of_values_to_keep = value_names_not_in_missing_values_threshold(df_missing_values, combined_data, missing_val_threshold=0.7)

X = df_train[list_of_values_to_keep]
y = df_train['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

drop due threshhold 0.7:  ['PoolQC', 'MiscFeature', 'Alley', 'Fence']


In [78]:
# now we make a pipeline
# need to extract text and numeric columns
num_cols = X.select_dtypes(include=np.number).columns.tolist()
obj_cols = [_ for _ in X.columns if _ not in num_cols]

# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer
# Obtain the text data: get_text_data
get_text_data = FunctionTransformer(lambda x: x[obj_cols], validate=False)
# Obtain the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[num_cols], validate=False)
# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(df_train)
# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(df_train)
# Print head to check results
print('Text Data')
print(just_text_data.columns)
print('\nNumeric Data')
print(just_numeric_data.columns)

# Import FeatureUnion
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder
# Split using ALL data in sample_df


mapper = DataFrameMapper(
    [(d, LabelEncoder()) for d in obj_cols]
)

# Create a FeatureUnion with nested pipeline: process_and_join_features
process_and_join_features = FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer(strategy='mean'))
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    # ('imputer', Imputer(strategy='most_frequent')),
                    ('mapper', mapper),
                    ('onehot', OneHotEncoder())
                ]))
             ]
        )
# Instantiate nested pipeline: pl
pl = Pipeline([
        ('union', process_and_join_features),
        ('reg', XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8))
    ])
# Fit pl to the training data
pl.fit(X_train, y_train)

Text Data
Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

Numeric Data
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 Pipeline(steps=[('selector',
                                                                  FunctionTransformer(func=<function <lambda> at 0x0000019B99429820>)),
                                                                 ('imputer',
                                                                  SimpleImputer())])),
                                                ('text_features',
                                                 Pipeline(steps=[('selector',
                                                                  FunctionTransformer(func=<function <lambda> at 0x0000019B99429940>)),
                                                                 ('mapper',
                                                                  DataFrameMapper(drop_cols=[]...
                              colsample_bytree=0.8, eta=0.1, gamma=0, gpu

In [83]:
combined_data['SaleType'].mode()

0    WD
dtype: object

2525

In [86]:
combined_data['SaleType'].mode()[0]

'WD'