In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn import linear_model 
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
import warnings
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings('ignore')

In [191]:
# housing_pp5.csv keeps only the Normal, Partial and Abnorml sale conditions.
df1 = pd.read_csv('housing_pp5.csv', index_col=0)
# Target is the SalePrice column; the features are every remaining column
# except PID.
y1 = df1['SalePrice']
X1 = df1.drop(columns=['SalePrice', 'PID']).copy()


In [224]:
# Show the distinct SaleCondition values present in df1.
df1.SaleCondition.unique()

array(['Normal', 'Partial', 'Abnorml'], dtype=object)

In [228]:
# NOTE(review): this import sits mid-notebook; later plotting cells reuse `px`.
import plotly.express as px
# SalePrice distribution per OverallQual level as a box plot, with all
# individual points drawn as well (points="all").
fig = px.box(df1, x = "OverallQual", y="SalePrice", points="all")
fig.show()

In [288]:
# Collect the row labels of SalePrice outliers inside each OverallQual level,
# using Tukey's fences (q1 - 1.5*IQR, q3 + 1.5*IQR).
#
# The fences are computed inline on purpose: find_outliers_IQR is defined
# further down the notebook, so calling it from this cell raises NameError
# when the notebook is run top to bottom on a fresh kernel.
quality = range(1, 11)
indexes_ = []

for qual in quality:
    prices = df1.loc[df1.OverallQual == qual, "SalePrice"]
    q1, q3 = prices.quantile(0.25), prices.quantile(0.75)
    iqr = q3 - q1
    # Strict comparisons: values sitting exactly on a fence are kept.
    outliers = prices[(prices < q1 - 1.5 * iqr) | (prices > q3 + 1.5 * iqr)]
    indexes_.extend(outliers.index.tolist())

In [289]:
# Number of row labels collected across all OverallQual levels.
len(indexes_)

80

In [297]:
# Independent copy of df1, so later changes to df1_ leave df1 untouched.
df1_ = df1.copy()

In [298]:
# Build the outlier-free frame from df1 in a single non-mutating step.
# An in-place drop on df1_ raises KeyError when this cell is re-run, because
# the labels were already removed by the first run; deriving df1_ from df1
# each time makes the cell idempotent.
df1_ = df1.drop(index=indexes_)


In [299]:
# Target / feature split for the outlier-free version of df1.
y1_ = df1_['SalePrice']
X1_ = df1_.drop(columns=['SalePrice', 'PID']).copy()

In [192]:
# housing_pp4.csv has every sale whose condition is not Normal dropped.
df2 = pd.read_csv('housing_pp4.csv', index_col=0)
# Target is the SalePrice column; the features are every remaining column
# except PID.
y2 = df2['SalePrice']
X2 = df2.drop(columns=['SalePrice', 'PID']).copy()

In [225]:
# Show the distinct SaleCondition values present in df2.
df2.SaleCondition.unique()

array(['Normal'], dtype=object)

In [229]:
# SalePrice distribution per OverallQual level for df2, all points shown.
fig = px.box(data_frame=df2, x="OverallQual", y="SalePrice", points="all")
fig.show()

In [233]:
# SalePrice of the df2 rows whose OverallQual equals 8.
df2.loc[df2.OverallQual == 8].SalePrice

4       227000
10      219500
26      201000
67      254000
69      284500
         ...  
2520    170000
2521    255500
2536    257500
2556    240000
2579    215000
Name: SalePrice, Length: 256, dtype: int64

In [234]:
def find_outliers_IQR(df, factor=1.5):
    """Return the entries of ``df`` lying outside Tukey's IQR fences.

    Parameters
    ----------
    df : pandas.Series
        Values to screen. Despite the parameter name, the calls in this
        notebook pass a single column (a Series).
    factor : float, default 1.5
        Multiple of the inter-quartile range used to place the fences.
        The default reproduces the classic 1.5 * IQR rule.

    Returns
    -------
    pandas.Series
        The entries of ``df`` that are strictly below ``q1 - factor * IQR``
        or strictly above ``q3 + factor * IQR``, keeping their original
        index labels. Empty when nothing falls outside the fences.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr

    # Strict inequalities: a value sitting exactly on a fence is not an outlier.
    return df[(df < lower_fence) | (df > upper_fence)]

In [264]:
# Screen SalePrice for IQR outliers among the OverallQual == 1 rows of df2.
outliers = find_outliers_IQR(df2.loc[df2.OverallQual == 1, "SalePrice"])

# With no outliers, max()/min() of the empty result print as nan.
print("number of outliers:", len(outliers), sep="")
print("max outlier value:", outliers.max(), sep="")
print("min outlier value:", outliers.min(), sep="")

number of outliers:0
max outlier value:nan
min outlier value:nan


In [285]:
# Same IQR screen, this time over every SalePrice in df2 at once.
outliers = find_outliers_IQR(df2["SalePrice"])
print("number of outliers:", len(outliers), sep="")

number of outliers:95


In [265]:
# Row labels of whatever `outliers` currently holds (the result of the most
# recently executed find_outliers_IQR call).
# NOTE(review): a later cell rebinds `indexes` to a fresh list before it is
# used to drop rows, so this value is only for inspection.
indexes = list(outliers.index)
indexes

[]

In [290]:
# Gather the row labels of SalePrice outliers found separately within each
# OverallQual level (1 through 10) of df2.
indexes = []
quality = range(1, 11)

for qual in quality:
    qual_prices = df2.loc[df2.OverallQual == qual].SalePrice
    outliers = find_outliers_IQR(qual_prices)
    indexes += outliers.index.tolist()
    

In [291]:
# Number of row labels collected across all OverallQual levels.
len(indexes)

80

In [278]:
# Independent copy of df2, so later changes to df3 leave df2 untouched.
df3 = df2.copy()

In [279]:
# df3 = df2 (Normal sales only) minus the SalePrice outliers that were found
# separately within each OverallQual level.
# Built with a non-mutating drop so the cell can be re-run safely; an in-place
# drop raises KeyError on a second run because the labels are already gone.
df3 = df2.drop(index=indexes)

In [280]:
# SalePrice per OverallQual level after the per-level outliers were removed.
fig = px.box(data_frame=df3, x="OverallQual", y="SalePrice", points="all")
fig.show()

In [306]:
# GrLivArea distribution per OverallQual level in df3, all points shown.
fig = px.box(data_frame=df3, x="OverallQual", y="GrLivArea", points="all")
fig.show()

In [305]:
# GrLivArea values outside the IQR fences, computed over all of df3 at once.
outliers = find_outliers_IQR(df3["GrLivArea"])
outliers

23      2640
228     2810
244     2956
300     3005
342     2643
487     2646
524     3447
595     2784
624     2787
635     2855
729     2758
781     2826
848     2687
930     2727
956     2787
962     2668
1007    4316
1099    2715
1144    2772
1146    2794
1162    3493
1203    3082
1243    2730
1310    2687
1317    3078
1325    3627
1405    2673
1437    2786
1497    2787
1586    2840
1639    2674
1668    2726
1686    2872
1968    2690
1999    2646
2004    3608
2028    2726
2128    2790
2168    2775
2204    2650
2208    2654
2218    2704
2282    2640
2398    2978
2404    2799
Name: GrLivArea, dtype: int64

In [281]:
# Target / feature split for df3.
y3 = df3['SalePrice']
X3 = df3.drop(columns=['SalePrice', 'PID']).copy()

In [230]:
# GrLivArea is a continuous variable, so a box plot grouped on it degenerates
# into one tiny box per distinct area value. A scatter plot shows the
# GrLivArea / SalePrice relationship directly.
fig = px.scatter(df2, x="GrLivArea", y="SalePrice")
fig.show()

In [210]:
# --- Column groups ------------------------------------------------------------
# Object-dtype columns are handled as categorical; every other dtype is
# handled as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns1, categorical_columns1 = (
    numerical_columns_selector(X1),
    categorical_columns_selector(X1),
)
numerical_columns2, categorical_columns2 = (
    numerical_columns_selector(X2),
    categorical_columns_selector(X2),
)

# --- Preprocessing ------------------------------------------------------------
# Standardise the numerical columns and one-hot encode the categorical ones.
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

preprocessor1 = ColumnTransformer(transformers=[
    ("standard_scaler", numerical_preprocessor, numerical_columns1),
    ("one-hot-encoder", categorical_preprocessor, categorical_columns1),
])
preprocessor2 = ColumnTransformer(transformers=[
    ("standard_scaler", numerical_preprocessor, numerical_columns2),
    ("one-hot-encoder", categorical_preprocessor, categorical_columns2),
])

# --- Cross-validation scheme (also used by the Ridge searches) ----------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- Lasso pipelines ----------------------------------------------------------
lasso = linear_model.Lasso()
lasso_pipe1 = Pipeline([("Preprocess", preprocessor1), ("Lasso", lasso)])
lasso_pipe2 = Pipeline([("Preprocess", preprocessor2), ("Lasso", lasso)])

# Candidate alphas: 20 values between 0.002 and 0.003.
# The key addresses TransformedTargetRegressor.regressor -> step "Lasso" -> alpha.
lasso_alphas = [
    0.002, 0.00205263, 0.00210526, 0.00215789, 0.00221053,
    0.00226316, 0.00231579, 0.00236842, 0.00242105, 0.00247368,
    0.00252632, 0.00257895, 0.00263158, 0.00268421, 0.00273684,
    0.00278947, 0.00284211, 0.00289474, 0.00294737, 0.003,
]
param_grid = {"regressor__Lasso__alpha": lasso_alphas}

# Fit on log(target); predictions are mapped back with exp.
lasso_regr1 = compose.TransformedTargetRegressor(
    regressor=lasso_pipe1, func=np.log, inverse_func=np.exp
)
lasso_regr2 = compose.TransformedTargetRegressor(
    regressor=lasso_pipe2, func=np.log, inverse_func=np.exp
)

search1 = GridSearchCV(estimator=lasso_regr1, param_grid=param_grid, n_jobs=2, cv=kf)
search2 = GridSearchCV(estimator=lasso_regr2, param_grid=param_grid, n_jobs=2, cv=kf)

In [221]:
# --- Ridge pipelines ----------------------------------------------------------
# Same preprocessing and CV folds as the Lasso searches; only the final
# estimator and the alpha grid differ.
ridge = linear_model.Ridge()
ridge_pipe1 = Pipeline([("Preprocess", preprocessor1), ("Ridge", ridge)])
ridge_pipe2 = Pipeline([("Preprocess", preprocessor2), ("Ridge", ridge)])

# Candidate alphas: 20 values between 193 and 204.
# The key addresses TransformedTargetRegressor.regressor -> step "Ridge" -> alpha.
ridge_alphas = [
    193.0, 193.57894737, 194.15789474, 194.73684211,
    195.31578947, 195.89473684, 196.47368421, 197.05263158,
    197.63157895, 198.21052632, 198.78947368, 199.36842105,
    199.94736842, 200.52631579, 201.10526316, 201.68421053,
    202.26315789, 202.84210526, 203.42105263, 204.0,
]
param_gridR = {"regressor__Ridge__alpha": ridge_alphas}

# Fit on log(target); predictions are mapped back with exp.
ridge_regr1 = compose.TransformedTargetRegressor(
    regressor=ridge_pipe1, func=np.log, inverse_func=np.exp
)
ridge_regr2 = compose.TransformedTargetRegressor(
    regressor=ridge_pipe2, func=np.log, inverse_func=np.exp
)

Rsearch1 = GridSearchCV(estimator=ridge_regr1, param_grid=param_gridR, n_jobs=2, cv=kf)
Rsearch2 = GridSearchCV(estimator=ridge_regr2, param_grid=param_gridR, n_jobs=2, cv=kf)

In [195]:
# Lasso grid search on X1 / y1: report the best score and the alpha behind it.
search1.fit(X1, y1)
print(search1.best_score_, search1.best_params_, sep="\n")

0.8846743626007652
{'regressor__Lasso__alpha': 0.001}


In [300]:
# Lasso grid search on the outlier-free X1_ / y1_.
# This re-fits the same search1 object, replacing the results of any earlier fit.
search1.fit(X1_, y1_)
print(search1.best_score_, search1.best_params_, sep="\n")

0.8811766190752245
{'regressor__Lasso__alpha': 0.002}


In [211]:
# Lasso grid search on X2 / y2: report the best score and the alpha behind it.
search2.fit(X2, y2)
print(search2.best_score_, search2.best_params_, sep="\n")

0.9344881872898778
{'regressor__Lasso__alpha': 0.00273684}


In [283]:
# X3 / y3: all non-Normal sales dropped AND the SalePrice outliers within each
# OverallQual level dropped.
# This re-fits the same search2 object, replacing the results of any earlier fit.
search2.fit(X3, y3)
print(search2.best_score_, search2.best_params_, sep="\n")

0.9427332327956236
{'regressor__Lasso__alpha': 0.002}


In [None]:
# NOTE(review): bare float literals in a cell that was never executed;
# presumably scores pasted from earlier runs. Confirm what they refer to and
# move them into a markdown note, or delete the cell.
0.8825164812004702
0.9308283561200676

In [197]:
# Ridge grid search on X1 / y1: report the best score and the alpha behind it.
Rsearch1.fit(X1, y1)
print(Rsearch1.best_score_, Rsearch1.best_params_, sep="\n")

0.8922082306532779
{'regressor__Ridge__alpha': 100}


In [222]:
# Ridge grid search on X2 / y2: report the best score and the alpha behind it.
Rsearch2.fit(X2, y2)
print(Rsearch2.best_score_, Rsearch2.best_params_, sep="\n")

0.9350065918339736
{'regressor__Ridge__alpha': 196.47368421}


In [282]:
# Ridge grid search on the outlier-free X3 / y3.
# This re-fits the same Rsearch2 object, replacing the results of any earlier fit.
Rsearch2.fit(X3, y3)
print(Rsearch2.best_score_, Rsearch2.best_params_, sep="\n")

0.9437810435256058
{'regressor__Ridge__alpha': 193.0}
