In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from  sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score , mean_squared_error


# 1. Housing Dataset

In [None]:
# Public Google Drive share link for the housing data (train.csv).
url = 'https://drive.google.com/file/d/1ekP1Q-479fnlf1qzgTm5NAYBERSzA8Vs/view?usp=sharing'

# The Drive file id is the second-to-last "/"-separated segment of the share
# link; it is appended to the `uc?export=download` URL that pandas reads from.
file_id = url.split('/')[-2]
path = f'https://drive.google.com/uc?export=download&id={file_id}'

house = pd.read_csv(path)
house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# 2. Define X and Y

In [None]:
# Target: the sale price column.
y = house['SalePrice']

# Features: every column except the row identifier and the target itself.
X = house.drop(columns=['Id', 'SalePrice'])

# selecting only numerical features

In [None]:
# Keep only the numeric columns of the feature frame.
X = X.select_dtypes(include=["number"])

# 3. Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# 4. Check for missing values in the dataset

In [None]:
# Count of missing values per column in the training features.
X_train.isnull().sum()

MSSubClass         0
LotFrontage      212
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         6
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       58
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

In [None]:
# Count of missing values per column in the held-out features.
X_test.isnull().sum()

MSSubClass        0
LotFrontage      47
LotArea           0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
MasVnrArea        2
BsmtFinSF1        0
BsmtFinSF2        0
BsmtUnfSF         0
TotalBsmtSF       0
1stFlrSF          0
2ndFlrSF          0
LowQualFinSF      0
GrLivArea         0
BsmtFullBath      0
BsmtHalfBath      0
FullBath          0
HalfBath          0
BedroomAbvGr      0
KitchenAbvGr      0
TotRmsAbvGrd      0
Fireplaces        0
GarageYrBlt      23
GarageCars        0
GarageArea        0
WoodDeckSF        0
OpenPorchSF       0
EnclosedPorch     0
3SsnPorch         0
ScreenPorch       0
PoolArea          0
MiscVal           0
MoSold            0
YrSold            0
dtype: int64

In [None]:
# Number of missing values in the target.
y.isnull().sum()

0

In [None]:
# (rows, columns) of the training feature frame.
X_train.shape

(1168, 36)

# 5. Automated Approach Using Pipelines: using Decision Tree Regressor
### data preprocessing
5.1. Impute missing values in the numerical features

5.2. Scaling if required

5.3. VarianceThreshold


### Model Preparation
5.4. Decision Tree Regressor


In [None]:
# Decision-tree pipeline: mean-impute missing values, min-max scale the
# features, drop zero-variance (constant) features, then fit the tree.
# VarianceThreshold is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.
pipeline_tree = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy="mean")),
    ('minmaxscaler', MinMaxScaler()),
    ('varaince' , VarianceThreshold(threshold=0.0)),
    # Fixed seed: the tree permutes features randomly at each split, so
    # without it the fitted model (and its score) can change between runs.
    ('deciontree_regressor', DecisionTreeRegressor(random_state=0))
   ])


# Fit the pipeline to the training data
pipeline_tree.fit(X_train, y_train)

# 6. Automated Approach Using Pipelines: using KNeighborsRegressor
### data preprocessing
6.1. Impute missing values in the numerical features

6.2. Scaling if required

6.3. VarianceThreshold


### Model Preparation
6.4. KNeighborsRegressor

In [None]:
# k-nearest-neighbours pipeline: mean imputation -> min-max scaling ->
# zero-variance filter -> regressor averaging over 10 neighbours.
knn_steps = [
    ('simpleimputer', SimpleImputer(strategy="mean")),
    ('minmaxscaler', MinMaxScaler()),
    ('varaince', VarianceThreshold(threshold=0.0)),
    ('knearestneighbor_regressor', KNeighborsRegressor(n_neighbors=10)),
]
pipeline_knn = Pipeline(knn_steps)

# Train on the training split; the fitted pipeline is the cell's output.
pipeline_knn.fit(X_train, y_train)

# 7. Automated Approach Using Pipelines: using LinearRegression
### data preprocessing
7.1. Impute missing values in the numerical features

7.2. Scaling if required

7.3. VarianceThreshold


### Model Preparation
7.4. LinearRegression

In [None]:
# Linear-regression pipeline, assembled from individually named stages.
lr_imputer = SimpleImputer(strategy="mean")      # fill gaps with the column mean
lr_scaler = MinMaxScaler()                       # rescale each feature
lr_selector = VarianceThreshold(threshold=0.0)   # drop constant features
lr_model = LinearRegression()

pipeline_lr = Pipeline(steps=[
    ('simpleimputer', lr_imputer),
    ('minmaxscaler', lr_scaler),
    ('varaince', lr_selector),
    ('linear_regressor', lr_model),
])

# Train on the training split; the fitted pipeline is the cell's output.
pipeline_lr.fit(X_train, y_train)

# 8. Automated Approach Using Pipelines: using RandomForestRegressor
### data preprocessing
8.1. Impute missing values in the numerical features

8.2. Scaling if required

8.3. VarianceThreshold

### Model Preparation
8.4. RandomForestRegressor

In [None]:
# Random-forest pipeline: mean-impute missing values, min-max scale the
# features, drop zero-variance (constant) features, then fit a 30-tree forest.
pipeline_rf = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy="mean")),
    ('minmaxscaler', MinMaxScaler()),
    ('varaince',  VarianceThreshold(threshold=0.0)),
    # Fixed seed: bootstrap sampling and per-split feature sampling are
    # random, so without it the score and the exported predictions differ
    # on every run.
    ('randomforest_regressor', RandomForestRegressor(n_estimators = 30, random_state=0))
   ])


# Fit the pipeline to the training data
pipeline_rf.fit(X_train, y_train)

## calculate r2_score

In [None]:
# Predict on the held-out split with each fitted pipeline, in the order
# tree, k-NN, linear regression, random forest.
y_pred_tree, y_pred_knn, y_pred_lr, y_pred_rf = (
    fitted_pipeline.predict(X_test)
    for fitted_pipeline in (pipeline_tree, pipeline_knn, pipeline_lr, pipeline_rf)
)

In [None]:
# Empty results table: one column per model, rows are added as runs are scored.
performance_columns = [
    "performance_tree",
    "performance_knn",
    "performance_linear",
    "performance_random",
]
performances = pd.DataFrame(columns=performance_columns)

In [None]:
# Record the R^2 score of each baseline model on the held-out split.
# The row is labelled after the feature-selection step the pipelines actually
# use (VarianceThreshold); none of them applies PCA.
row_label = "VarianceThreshold"
predictions_by_column = {
    "performance_tree": y_pred_tree,
    "performance_knn": y_pred_knn,
    "performance_linear": y_pred_lr,
    "performance_random": y_pred_rf,
}
for column_name, predicted in predictions_by_column.items():
    performances.loc[row_label, column_name] = r2_score(y_test, predicted)

performances

Unnamed: 0,performance_tree,performance_knn,performance_linear,performance_random
PCA,0.775779,0.747005,0.634591,0.848835


# challenge

In [None]:
# Public Google Drive share link for the challenge data.
# NOTE(review): the original comment called this file "train.csv", but the Ids
# in the exported predictions start at 1461 — presumably this is the
# competition test set; confirm.
url = 'https://drive.google.com/file/d/1J74X6lVngUWHOc-qIoQsH5qo5jqLoVGq/view?usp=share_link'

# The Drive file id is the second-to-last "/"-separated segment of the share
# link; it is appended to the `uc?export=download` URL that pandas reads from.
file_id = url.split('/')[-2]
path = f'https://drive.google.com/uc?export=download&id={file_id}'

# NOTE: this rebinds X_test, replacing the held-out split created by
# train_test_split earlier in the notebook.
X_test = pd.read_csv(path)

In [None]:
# Column labels of the challenge data that was just loaded into X_test.
X_test.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [None]:
# Column labels the pipelines were fitted on (numeric features only).
X_train.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [None]:
# Keep the row identifiers for the submission file.
# Read the column instead of pop()-ing it: pop() mutates X_test in place, so
# re-running this cell would raise KeyError. "Id" is dropped from the feature
# frame anyway when X_test is restricted to the training columns.
test_ids = X_test["Id"].copy()

In [None]:
# Restrict the challenge data to the columns the pipelines were trained on,
# in the same order.
X_test = X_test.loc[:, X_train.columns]

In [None]:
# Predict sale prices for the challenge data with the random-forest pipeline.
preds = pipeline_rf.predict(X_test)

In [None]:
# Name of the CSV file the predictions are written to.
file_name = "Variancethreshold_randomforest.csv"

In [None]:
# Assemble the submission table (one row per Id with its predicted price)
# and write it to disk without the index column.
submission_columns = {'Id': test_ids, 'SalePrice': preds}
Variancethreshold_randomforest = pd.DataFrame(submission_columns)
Variancethreshold_randomforest.to_csv(file_name, index=False)

In [None]:
# Display the submission table (Id and predicted SalePrice).
Variancethreshold_randomforest

Unnamed: 0,Id,SalePrice
0,1461,128108.333333
1,1462,160211.666667
2,1463,183208.800000
3,1464,175366.666667
4,1465,203393.333333
...,...,...
1454,2915,86563.100000
1455,2916,88656.666667
1456,2917,155031.666667
1457,2918,109833.333333


In [None]:
# Trigger a browser download of the submission file.
# `google.colab` is only importable inside Google Colab, so the import is
# guarded to keep "Restart & Run All" working in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; look for '{file_name}' in the working directory.")
else:
    files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>