In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn import linear_model 
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
import warnings
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings('ignore')

In [191]:
# housing_pp5 has only normal partial and abnormal sales
df1 = pd.read_csv('housing_pp5.csv', index_col=0)
y1 = df1.SalePrice
X1 = df1.drop(['SalePrice', 'PID'], axis =1).copy()


In [224]:
df1.SaleCondition.unique()

array(['Normal', 'Partial', 'Abnorml'], dtype=object)

In [228]:
import plotly.express as px
fig = px.box(df1, x = "OverallQual", y="SalePrice", points="all")
fig.show()

In [288]:
quality = range(1,11)
indexes_ = []

for qual in quality:
    outliers = find_outliers_IQR(df1.loc[df1.OverallQual == qual].SalePrice)
    indexes_.extend(list(outliers.index))

In [289]:
len(indexes_)

80

In [297]:
df1_ = df1.copy()

In [298]:
df1_.drop(indexes_, inplace=True)


In [299]:
y1_ = df1_.SalePrice
X1_ = df1_.drop(['SalePrice', 'PID'], axis =1).copy()

In [192]:
# housing_pp4 has all the sale conditons that are not normal dropped
df2 = pd.read_csv('housing_pp4.csv', index_col=0)
y2 = df2.SalePrice
X2 = df2.drop(['SalePrice', 'PID'], axis =1).copy()

In [225]:
df2.SaleCondition.unique()

array(['Normal'], dtype=object)

In [229]:
fig = px.box(df2, x = "OverallQual", y="SalePrice", points="all")
fig.show()

In [233]:
df2.loc[df2.OverallQual == 8].SalePrice

4       227000
10      219500
26      201000
67      254000
69      284500
         ...  
2520    170000
2521    255500
2536    257500
2556    240000
2579    215000
Name: SalePrice, Length: 256, dtype: int64

In [234]:
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

In [264]:
outliers = find_outliers_IQR(df2.loc[df2.OverallQual == 1].SalePrice)

print("number of outliers:" + str(len(outliers)))

print("max outlier value:" + str(outliers.max()))

print("min outlier value:" + str(outliers.min()))

number of outliers:0
max outlier value:nan
min outlier value:nan


In [285]:
outliers = find_outliers_IQR(df2.SalePrice)
print("number of outliers:" + str(len(outliers)))

number of outliers:95


In [265]:
indexes = list(outliers.index)
indexes

[]

In [290]:
quality = range(1,11)
indexes = []

for qual in quality:
    outliers = find_outliers_IQR(df2.loc[df2.OverallQual == qual].SalePrice)
    indexes.extend(list(outliers.index))
    

In [291]:
len(indexes)

80

In [278]:
df3 = df2.copy()

In [279]:
# df3 is now all non normal and then outliers of saleprice removed 
# based on outliers located at each overall qual 
df3.drop(indexes, inplace=True)

In [280]:
fig = px.box(df3, x = "OverallQual", y="SalePrice", points="all")
fig.show()

In [306]:
fig = px.box(df3, x = "OverallQual", y="GrLivArea", points="all")
fig.show()

In [305]:
outliers = find_outliers_IQR(df3.GrLivArea)
outliers

23      2640
228     2810
244     2956
300     3005
342     2643
487     2646
524     3447
595     2784
624     2787
635     2855
729     2758
781     2826
848     2687
930     2727
956     2787
962     2668
1007    4316
1099    2715
1144    2772
1146    2794
1162    3493
1203    3082
1243    2730
1310    2687
1317    3078
1325    3627
1405    2673
1437    2786
1497    2787
1586    2840
1639    2674
1668    2726
1686    2872
1968    2690
1999    2646
2004    3608
2028    2726
2128    2790
2168    2775
2204    2650
2208    2654
2218    2704
2282    2640
2398    2978
2404    2799
Name: GrLivArea, dtype: int64

In [281]:
y3 = df3.SalePrice
X3 = df3.drop(['SalePrice', 'PID'], axis =1).copy()

In [230]:
fig = px.box(df2, x = "GrLivArea", y="SalePrice", points="all")
fig.show()

In [210]:
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns1 = numerical_columns_selector(X1)
categorical_columns1 = categorical_columns_selector(X1)
numerical_columns2 = numerical_columns_selector(X2)
categorical_columns2 = categorical_columns_selector(X2)

#categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", drop= 'first')
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor1 = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns1),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns1),
    ]
)

preprocessor2 = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns2),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns2),
    ]
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

lasso = linear_model.Lasso()

lasso_pipe1 = Pipeline(steps=[("Preprocess", preprocessor1), ("Lasso", lasso)])

lasso_pipe2 = Pipeline(steps=[("Preprocess", preprocessor2), ("Lasso", lasso)])

param_grid = {
    "regressor__Lasso__alpha": [0.002, 0.00205263, 0.00210526, 0.00215789, 0.00221053,
       0.00226316, 0.00231579, 0.00236842, 0.00242105, 0.00247368,
       0.00252632, 0.00257895, 0.00263158, 0.00268421, 0.00273684,
       0.00278947, 0.00284211, 0.00289474, 0.00294737, 0.003    ]
}

lasso_regr1 = compose.TransformedTargetRegressor(regressor= lasso_pipe1,
                                                func=np.log, inverse_func=np.exp)
lasso_regr2 = compose.TransformedTargetRegressor(regressor= lasso_pipe2,
                                                func=np.log, inverse_func=np.exp)

search1 = GridSearchCV(lasso_regr1, param_grid, n_jobs=2, cv = kf)

search2 = GridSearchCV(lasso_regr2, param_grid, n_jobs=2, cv = kf)

In [221]:
ridge = linear_model.Ridge()

ridge_pipe1 = Pipeline(steps=[("Preprocess", preprocessor1), ("Ridge", ridge)])

ridge_pipe2 = Pipeline(steps=[("Preprocess", preprocessor2), ("Ridge", ridge)])

param_gridR = {
    "regressor__Ridge__alpha": [193.        , 193.57894737, 194.15789474, 194.73684211,
       195.31578947, 195.89473684, 196.47368421, 197.05263158,
       197.63157895, 198.21052632, 198.78947368, 199.36842105,
       199.94736842, 200.52631579, 201.10526316, 201.68421053,
       202.26315789, 202.84210526, 203.42105263, 204.         ] 
}

ridge_regr1 = compose.TransformedTargetRegressor(regressor= ridge_pipe1,
                                                func=np.log, inverse_func=np.exp)
ridge_regr2 = compose.TransformedTargetRegressor(regressor= ridge_pipe2,
                                                func=np.log, inverse_func=np.exp)

Rsearch1 = GridSearchCV(ridge_regr1, param_gridR, n_jobs=2, cv = kf)

Rsearch2 = GridSearchCV(ridge_regr2, param_gridR, n_jobs=2, cv = kf)

In [195]:
search1.fit(X1, y1)
print(search1.best_score_)
print(search1.best_params_)

0.8846743626007652
{'regressor__Lasso__alpha': 0.001}


In [300]:
search1.fit(X1_, y1_)
print(search1.best_score_)
print(search1.best_params_)

0.8811766190752245
{'regressor__Lasso__alpha': 0.002}


In [211]:
search2.fit(X2, y2)
print(search2.best_score_)
print(search2.best_params_)

0.9344881872898778
{'regressor__Lasso__alpha': 0.00273684}


In [283]:
# this is the group with all the non normal sales dropped AND
# outliers withtin each set of overall quality dropped 
search2.fit(X3, y3)
print(search2.best_score_)
print(search2.best_params_)

0.9427332327956236
{'regressor__Lasso__alpha': 0.002}


In [399]:
search2.fit(X3_, y3)
print(search2.best_score_)
print(search2.best_params_)

0.9442205918319043
{'regressor__Lasso__alpha': 0.002}


In [None]:
0.8825164812004702
0.9308283561200676

## Ridge

In [197]:
Rsearch1.fit(X1, y1)
print(Rsearch1.best_score_)
print(Rsearch1.best_params_)

0.8922082306532779
{'regressor__Ridge__alpha': 100}


In [222]:
Rsearch2.fit(X2, y2)
print(Rsearch2.best_score_)
print(Rsearch2.best_params_)

0.9350065918339736
{'regressor__Ridge__alpha': 196.47368421}


In [282]:
Rsearch2.fit(X3, y3)
print(Rsearch2.best_score_)
print(Rsearch2.best_params_)

0.9437810435256058
{'regressor__Ridge__alpha': 193.0}


In [397]:
X3_ = X3.copy()

In [403]:
X3_.OverallQual = (X3.OverallQual)**2
X3_.KitchenQual = (X3.KitchenQual)**3
X3_.ExterQual = (X3.ExterQual)**3

In [410]:
Rsearch2.fit(X3_, y3)
print(Rsearch2.best_score_)
print(Rsearch2.best_params_)

0.9456613704599027
{'regressor__Ridge__alpha': 193.0}


In [405]:
Rsearch2.fit(X3_, y3)
print(Rsearch2.score)
print(Rsearch2.best_params_)

0.9456613704599027
{'regressor__Ridge__alpha': 193.0}


In [411]:
Rsearch2.cv_results_

{'mean_fit_time': array([0.03900299, 0.03050685, 0.03004951, 0.02954063, 0.03290691,
        0.03274708, 0.03245897, 0.03728008, 0.03675928, 0.03293667,
        0.03150077, 0.03140016, 0.03098817, 0.03145566, 0.03131652,
        0.02897711, 0.03150721, 0.02849078, 0.02827048, 0.02790256]),
 'std_fit_time': array([0.00712204, 0.00048426, 0.00125859, 0.00062857, 0.00126649,
        0.00231798, 0.00097935, 0.00378914, 0.00374915, 0.00108677,
        0.00064864, 0.00042039, 0.00074533, 0.00117392, 0.00080181,
        0.00103502, 0.0010683 , 0.00048087, 0.00089658, 0.00074127]),
 'mean_score_time': array([0.00998106, 0.00932865, 0.00972662, 0.00877805, 0.00932155,
        0.00899653, 0.00836215, 0.00911484, 0.01031284, 0.00916352,
        0.00870476, 0.00879478, 0.0084229 , 0.00889435, 0.0081141 ,
        0.00792618, 0.00902386, 0.00840802, 0.00813155, 0.00829988]),
 'std_score_time': array([0.00037026, 0.00058659, 0.00101557, 0.00065785, 0.00204423,
        0.00061848, 0.00027865, 0.000533

### Try different models 

In [307]:
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor

numerical_columns1 = numerical_columns_selector(X1)
categorical_columns1 = categorical_columns_selector(X1)

#categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", drop= 'first')
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)


kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [309]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X3, y3, random_state=42
)

In [355]:
regressor = DecisionTreeRegressor()

numerical_columns = numerical_columns_selector(X3)
categorical_columns = categorical_columns_selector(X3)

preprocessor_tree = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        
    ]
)

In [356]:
tree_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("tree", regressor)])
tree_reg = compose.TransformedTargetRegressor(regressor= tree_pipe,
                                                func=np.log, inverse_func=np.exp)

In [357]:
tree_reg.fit(X_train, y_train)

In [358]:
#slightly better but bad 
tree_reg.score(X_test, y_test)

0.8112224735799738

In [359]:
tree_pipe.fit(X_train, y_train)
tree_pipe.score(X_test, y_test)

0.7911928424836605

In [360]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
gbr_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("boost", gbr)])
gbr_reg = compose.TransformedTargetRegressor(regressor= gbr_pipe,
                                                func=np.log, inverse_func=np.exp)

In [361]:
gbr_pipe.fit(X_train, y_train)
gbr_pipe.score(X_test, y_test)

0.925694386269265

In [362]:
gbr_reg.fit(X_train, y_train)
gbr_reg.score(X_test, y_test)

0.9301187660874297

In [537]:
df_all = pd.read_csv('df_all.csv', index_col=0)
y1 = df_all.SalePrice
X1 = df_all.drop(['SalePrice', 'PID'], axis =1).copy()

df_all_normal = pd.read_csv('df_all_normal.csv', index_col=0)
y2 = df_all_normal.SalePrice
X2 = df_all_normal.drop(['SalePrice', 'PID'], axis =1).copy()

df_faa = pd.read_csv('df_faa.csv', index_col=0)
y3 = df_faa.SalePrice
X3 = df_faa.drop(['SalePrice', 'PID'], axis =1).copy()

df_normal_quality = pd.read_csv('df_normal_quality.csv', index_col=0)
y4 = df_normal_quality.SalePrice
X4 = df_normal_quality.drop(['SalePrice', 'PID'], axis =1).copy()

df_quality = pd.read_csv('df_quality.csv', index_col=0)
y5 = df_quality.SalePrice
X5 = df_quality.drop(['SalePrice', 'PID'], axis =1).copy()

df_some_quality = pd.read_csv('df_some_quality.csv', index_col=0)
y6 = df_some_quality.SalePrice
X6 = df_some_quality.drop(['SalePrice', 'PID'], axis =1).copy()

df = pd.read_csv('df.csv', index_col=0)
y7 = df.SalePrice
X7 = df.drop(['SalePrice', 'PID'], axis =1).copy()

In [548]:
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(X1)
categorical_columns = categorical_columns_selector(X1)

categorical_preprocessor_drop = OneHotEncoder(handle_unknown="ignore", drop= 'first')
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor_drop = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor_drop, categorical_columns),
    ]
)

preprocessor = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

#ridge pipe
ridge = linear_model.Ridge()

ridge_pipe_drop = Pipeline(steps=[("Preprocess", preprocessor_drop), ("Ridge", ridge)])

ridge_pipe = Pipeline(steps=[("Preprocess", preprocessor), ("Ridge", ridge)])


ridge_regr_drop = compose.TransformedTargetRegressor(regressor= ridge_pipe_drop,
                                                func=np.log, inverse_func=np.exp)

ridge_regr = compose.TransformedTargetRegressor(regressor= ridge_pipe,
                                                func=np.log, inverse_func=np.exp)

param_gridR = {
    "regressor__Ridge__alpha": [10, 15, 20, 25, 30] 
}

ridge_search_drop = GridSearchCV(ridge_regr_drop, param_gridR, n_jobs=2, cv = kf)

ridge_search = GridSearchCV(ridge_regr, param_gridR, n_jobs=2, cv = kf)

# lasso pipe
lasso = linear_model.Lasso()
lasso_pipe_drop = Pipeline(steps=[("Preprocess", preprocessor_drop), ("Lasso", lasso)])
lasso_pipe = Pipeline(steps=[("Preprocess", preprocessor), ("Lasso", lasso)])

lasso_regr_drop = compose.TransformedTargetRegressor(regressor= lasso_pipe_drop,
                                                func=np.log, inverse_func=np.exp)
lasso_regr = compose.TransformedTargetRegressor(regressor= lasso_pipe,
                                                func=np.log, inverse_func=np.exp)
param_gridL = {
    "regressor__Lasso__alpha": [0, .01, .001, .0001, .00001] 
}
lasso_search_drop = GridSearchCV(lasso_regr_drop, param_gridL, n_jobs=2, cv = kf)
lasso_search = GridSearchCV(lasso_regr, param_gridL, n_jobs=2, cv = kf)

ols = linear_model.LinearRegression()
ols_pipe = Pipeline(steps=[("Preprocess", preprocessor_drop), ("Ols", ols)])
ols_regr = compose.TransformedTargetRegressor(regressor= ols_pipe,
                                                func=np.log, inverse_func=np.exp)



In [554]:
count = np.isinf(X4).values.sum() 
print(count) 

TypeError: ufunc 'isinf' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [549]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(ols_regr, X3, y3, cv = 5, error_score= 'raise')
scores

ValueError: Input contains infinity or a value too large for dtype('float64').

In [446]:
A = [X7, X1, X2, X3, X4, X5, X6]
B = [y7, y1, y2, y3, y4, y5, y6]
names = ['only massive home removed',
        'all outliers removed',
        'only normal sale condition',
        'Family, AdjLand, Alloca sales removed',
        'all non-normal removed and outliers within quality groups',
        'outliers within quality groups removed',
        'outliers within quality groups removed and Family, AdjLand, Alloca']

for a, b, names in zip(A, B, names):
    
    ridge_search.fit(a,b)
    ridge_search_drop.fit(a,b)
    lasso_search.fit(a,b)
    lasso_search_drop.fit(a,b)
    print('Ridge', names, ridge_search.best_score_, ridge_search.best_params_)
    print('Ridge_drop', names, ridge_search_drop.best_score_, ridge_search_drop.best_params_)
    print('Lasso', names, lasso_search.best_score_, lasso_search.best_params_)
    print('Lasso_drop', names, lasso_search_drop.best_score_, lasso_search_drop.best_params_)
    print('******************')

Ridge only massive home removed 0.9302900774188668 {'regressor__Ridge__alpha': 30}
Ridge_drop only massive home removed 0.9295762318805686 {'regressor__Ridge__alpha': 30}
Lasso only massive home removed 0.9302652671392021 {'regressor__Lasso__alpha': 0.001}
Lasso_drop only massive home removed 0.9301705698457059 {'regressor__Lasso__alpha': 0.001}
******************
Ridge all outliers removed 0.9174710003615459 {'regressor__Ridge__alpha': 20}
Ridge_drop all outliers removed 0.9171449136063655 {'regressor__Ridge__alpha': 15}
Lasso all outliers removed 0.9170080312325768 {'regressor__Lasso__alpha': 0.0001}
Lasso_drop all outliers removed 0.917342677699342 {'regressor__Lasso__alpha': 0.0001}
******************
Ridge only normal sale condition 0.9310480965495499 {'regressor__Ridge__alpha': 30}
Ridge_drop only normal sale condition 0.9302979936852978 {'regressor__Ridge__alpha': 30}
Lasso only normal sale condition 0.9331572108449813 {'regressor__Lasso__alpha': 0.001}
Lasso_drop only normal sa

In [454]:
Data = []
Ridge = []
Ridge_drop = []
Lasso = []
Lasso_drop = []

A = [X7, X1, X2, X3, X4, X5, X6]
B = [y7, y1, y2, y3, y4, y5, y6]
names = ['Over 4000',
        'All',
        'All non-normal',
        'Family, AdjLand, Alloca Sales',
        'Non-normal & within quality groups',
        'Within quality groups',
        'Within quality & Family, AdjLand, Alloca']

for a, b, names in zip(A, B, names):
    
    ridge_search.fit(a,b)
    ridge_search_drop.fit(a,b)
    lasso_search.fit(a,b)
    lasso_search_drop.fit(a,b)
    ols_scores = cross_val_score(ols_regr,a, b, cv=kf)
    
    Data.append(names)
    Ridge.append(ridge_search.best_score_)
    Ridge_drop.append(ridge_search_drop.best_score_)
    Lasso.append(lasso_search.best_score_)
    Lasso_drop.append(lasso_search_drop.best_score_)
    
scores = {}
scores['Outliers Removed'] = Data
scores['Ridge Score'] = Ridge
scores['Ridge drop Score'] = Ridge_drop
scores['Lasso Score'] = Lasso
scores['Lasso drop Score'] = Lasso_drop

pd.DataFrame.from_dict(scores)

Unnamed: 0,Outliers Removed,Ridge Score,Ridge drop Score,Lasso Score,Lasso drop Score
0,Over 4000,0.93029,0.929576,0.930265,0.930171
1,All,0.917471,0.917145,0.917008,0.917343
2,All non-normal,0.931048,0.930298,0.933157,0.932939
3,"Family, AdjLand, Alloca Sales",0.935069,0.934044,0.933835,0.933441
4,Non-normal & within quality groups,0.949678,0.949513,0.949658,0.949622
5,Within quality groups,0.946657,0.946473,0.945962,0.946116
6,"Within quality & Family, AdjLand, Alloca",0.946958,0.946809,0.947077,0.946925


In [451]:
Ridge

[0.9302900774188668,
 0.9174710003615459,
 0.9310480965495499,
 0.9350686182912279,
 0.9496780813318221,
 0.9466568992608474,
 0.9469576757398135]

In [452]:
scores = {}
scores['Group'] = Data
scores['Ridge Score'] = Ridge
scores

{'Group': ['only massive home removed',
  'all outliers removed',
  'only normal sale condition',
  'Family, AdjLand, Alloca sales removed',
  'all non-normal removed and outliers within quality groups',
  'outliers within quality groups removed',
  'outliers within quality groups removed and Family, AdjLand, Alloca'],
 'Ridge Score': [0.9302900774188668,
  0.9174710003615459,
  0.9310480965495499,
  0.9350686182912279,
  0.9496780813318221,
  0.9466568992608474,
  0.9469576757398135]}

In [453]:
pd.DataFrame.from_dict(scores)

Unnamed: 0,Group,Ridge Score
0,only massive home removed,0.93029
1,all outliers removed,0.917471
2,only normal sale condition,0.931048
3,"Family, AdjLand, Alloca sales removed",0.935069
4,all non-normal removed and outliers within qua...,0.949678
5,outliers within quality groups removed,0.946657
6,outliers within quality groups removed and Fam...,0.946958


In [456]:
scores['Ridge Score']

[0.9302900774188668,
 0.9174710003615459,
 0.9310480965495499,
 0.9350686182912279,
 0.9496780813318221,
 0.9466568992608474,
 0.9469576757398135]