In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
categorical_cols = ['MSZoning','Street','LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
    'SaleCondition']
train_set = pd.read_csv("test.csv")
train_set.dropna()
train_set = pd.get_dummies(train_set, columns = categorical_cols)
test_set = pd.read_csv("test.csv")

In [3]:
print(train_set.columns.values.tolist())

['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'Alley', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_Bnk', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'Utilities_AllPub', 'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'LandSlope_Gtl', 'LandSlope_Mod', 'LandSlope_Sev', 'Neighborhood_Blmngtn', 'Neighborhood_Blueste', 'Neighborhood_BrDale

In [4]:
print(train_set)

        Id  MSSubClass  LotFrontage  LotArea Alley  OverallQual  OverallCond  \
0     1461          20         80.0    11622   NaN            5            6   
1     1462          20         81.0    14267   NaN            6            6   
2     1463          60         74.0    13830   NaN            5            5   
3     1464          60         78.0     9978   NaN            6            6   
4     1465         120         43.0     5005   NaN            8            5   
...    ...         ...          ...      ...   ...          ...          ...   
1454  2915         160         21.0     1936   NaN            4            7   
1455  2916         160         21.0     1894   NaN            4            5   
1456  2917          20        160.0    20000   NaN            5            7   
1457  2918          85         62.0    10441   NaN            5            5   
1458  2919          60         74.0     9627   NaN            7            5   

      YearBuilt  YearRemodAdd  MasVnrAr

In [5]:
#Finding correlation between the features
print(train_set.corr())

                             Id  MSSubClass  LotFrontage   LotArea  \
Id                     1.000000    0.004993     0.009930  0.051492   
MSSubClass             0.004993    1.000000    -0.450505 -0.359537   
LotFrontage            0.009930   -0.450505     1.000000  0.644608   
LotArea                0.051492   -0.359537     0.644608  1.000000   
OverallQual           -0.064887    0.034690     0.182870  0.107989   
...                         ...         ...          ...       ...   
SaleCondition_AdjLand  0.080414    0.007814    -0.054958 -0.034914   
SaleCondition_Alloca  -0.019315    0.044657     0.036500  0.059127   
SaleCondition_Family  -0.003666   -0.044474     0.017601  0.075032   
SaleCondition_Normal  -0.115884    0.030260    -0.096869 -0.084070   
SaleCondition_Partial  0.131951   -0.039430     0.100149  0.060139   

                       OverallQual  OverallCond  YearBuilt  YearRemodAdd  \
Id                       -0.064887     0.009553  -0.058647     -0.079437   
MSSubCl

SalePrice is correlated to OverallQual, YearBuilt, YearRemodAdd, TotalBsmtSF, 1stFlrSF, GrLivArea, FullBath, TotRmsAbvGrd, GarageCars, GarageArea (over 0.5 or under -0.5)

In [6]:
# For highly correlated
X = train_set.loc[:, train_set.columns != 'Street'].values
y = train_set.iloc[:, -1].values
print(X)

[[1461 20 80.0 ... 0 1 0]
 [1462 20 81.0 ... 0 1 0]
 [1463 60 74.0 ... 0 1 0]
 ...
 [2917 20 160.0 ... 0 0 0]
 [2918 85 62.0 ... 0 1 0]
 [2919 60 74.0 ... 0 1 0]]


In [7]:
import math

def roundingArray(number_array):
    results = []
    for number in number_array:
        if number % 10 < 5:
            number = int(number / 10) * 10
        else:
            number = int((number + 10) / 10) * 10
        results.append(number)
    return results

In [8]:
# Linear Regression Model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

linear_regression_model = LinearRegression()

In [9]:
# Predict with highly-correlated set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)
linear_regression_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'Pave'

In [None]:
y_pred = linear_regression_model.predict(X_test)
y_pred = roundingArray(y_pred)

In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
# Polynomial Regression Model
from sklearn.preprocessing import PolynomialFeatures

poly_reg=PolynomialFeatures(degree=3)

In [None]:
X_poly = poly_reg.fit_transform(X_train)
poly_reg.fit(X_poly,y_train)

In [None]:
lin_reg2=LinearRegression()
lin_reg2.fit(X_poly,y_train)

In [None]:
y_pred = lin_reg2.predict(poly_reg.fit_transform(X_test))
y_pred = roundingArray(y_pred)

In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
from sklearn.svm import SVC
svm_model = SVC(kernel='linear')

In [None]:
svm_model.fit(X_train, y_train)

In [None]:
y_pred = svm_model.predict(X_test)
y_pred = roundingArray(y_pred)

In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 50, random_state = 0)

In [None]:
regressor.fit(X_train, y_train)

In [None]:
y_pred = regressor.predict(X_test)
y_pred = roundingArray(y_pred)

In [None]:
mean_squared_error(y_test, y_pred)