In [None]:
#History
#20230227 try various methods
#20230228 add label encoding and use MultipleLinearRegression

# Importing Necessary Libraries

In [None]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import LabelEncoder

# Statistics
from scipy import stats
from scipy.stats import norm

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Data Loading

In [None]:
# Read the training set from the competition input folder and report its dimensions
train_df = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv'
)
print(f'Shape of train data: {train_df.shape}')
train_df.head(n=5)

In [None]:
# Read the test set from the competition input folder and report its dimensions
test_df = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/test.csv'
)
print(f'Shape of test data: {test_df.shape}')
test_df.head(n=5)

In [None]:
# Read the sample submission file; it is used further down as the template for the submission
sample_submission = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/sample_submission.csv'
)
sample_submission.head(n=5)

In [None]:
# Duplicates check: count Id values that repeat an earlier row of the training data
duplicate_ids = train_df['Id'].duplicated().sum()
print(f'Total Duplicate Ids = {duplicate_ids}')

## Missing Values

In [None]:
# Missing values check (train): count the nulls per column, then show only
# the columns that have at least one missing value.
missing_train_values = train_df.isna().sum()
print(missing_train_values.loc[missing_train_values > 0])


In [None]:
# Missing values check (test): count the nulls per column, then show only
# the columns that have at least one missing value.
missing_test_values = test_df.isna().sum()
print(missing_test_values.loc[missing_test_values > 0])

Drop the following columns, because they contain missing values:<br>
["MSZoning", "LotFrontage","Alley","Utilities","Exterior1st","Exterior2nd","MasVnrType","MasVnrArea","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinSF1","BsmtFinType2","BsmtFinSF2","BsmtUnfSF","Electrical","TotalBsmtSF","BsmtFullBath","BsmtHalfBath","KitchenQual","Functional","FireplaceQu","GarageType","GarageYrBlt","GarageFinish","GarageCars","GarageArea","GarageQual","GarageCond","PoolQC","Fence","MiscFeature","SaleType"]

In [None]:
# Omit the columns which include missing data.
# The list is written once so that the train and test frames always lose exactly
# the same columns (previously the same literal was repeated for each frame).
columns_with_missing = [
    "MSZoning", "LotFrontage", "Alley", "Utilities", "Exterior1st", "Exterior2nd",
    "MasVnrType", "MasVnrArea", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF",
    "Electrical", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath", "KitchenQual",
    "Functional", "FireplaceQu", "GarageType", "GarageYrBlt", "GarageFinish",
    "GarageCars", "GarageArea", "GarageQual", "GarageCond", "PoolQC", "Fence",
    "MiscFeature", "SaleType",
]
# drop() returns new frames; train_df and test_df themselves are left untouched
train_omitted_df = train_df.drop(columns=columns_with_missing)
test_omitted_df = test_df.drop(columns=columns_with_missing)

In [None]:
# Preview the first rows of the training frame after the columns were dropped
train_omitted_df.head(n=5)

In [None]:
# Preview the first rows of the test frame after the columns were dropped
test_omitted_df.head(n=5)

In [None]:
# Re-check the reduced training frame: print the columns that still contain nulls
missing_train_omitted_values = train_omitted_df.isna().sum()
print(missing_train_omitted_values.loc[missing_train_omitted_values > 0])

In [None]:
# Re-check the reduced test frame: print the columns that still contain nulls
missing_test_omitted_values = test_omitted_df.isna().sum()
print(missing_test_omitted_values.loc[missing_test_omitted_values > 0])

## Label Encoding

In [None]:
# Pick up the object-dtype columns of the reduced training frame
train_obj = train_omitted_df.select_dtypes(include=['object'])
train_obj.head(n=3)

In [None]:
# Pick up the object-dtype columns of the reduced test frame
test_obj = test_omitted_df.select_dtypes(include=['object'])
test_obj.head(n=3)

In [None]:
# Store the number of unique values of each object column in train_uni
# (nunique() counts the distinct values column by column)
train_uni = train_obj.nunique(axis=0)
train_uni

In [None]:
# Same count for the test frame: number of unique values of each object column
test_uni = test_obj.nunique(axis=0)
test_uni

In [None]:
# Store in train_cols the names of the columns that have exactly two unique values.
# Indexing with a boolean condition keeps only the entries where it is True.
train_cols = train_uni.loc[train_uni.eq(2)].index
train_cols

In [None]:
# Label-encode the columns listed in train_cols (two unique values each) as 0/1.

# Work on a copy: a plain assignment would only alias train_omitted_df, and the
# loop below would then silently overwrite that frame as well.
train_labelencode_df = train_omitted_df.copy()

for col in train_cols:  # take the column names out of train_cols one at a time
    le = LabelEncoder()  # a fresh encoder per column
    # fit_transform() learns the column's unique values and applies the encoding
    train_labelencode_df[col] = le.fit_transform(train_labelencode_df[col])


In [None]:
# Confirm that the train_cols columns of the training frame have been encoded
train_labelencode_df.loc[:, train_cols].head(n=2)

In [None]:
# Label-encode the train_cols columns of the test frame as 0/1, using the same
# category -> integer mapping as the training data.

# Work on a copy: a plain assignment would only alias test_omitted_df, and the
# loop below would then silently overwrite that frame as well.
test_labelencode_df = test_omitted_df.copy()

for col in train_cols:  # take the column names out of train_cols one at a time
    le = LabelEncoder()  # a fresh encoder per column
    # Fit on the raw training column (train_df still holds the original values)
    # rather than on the test column: an encoder fitted on the test data alone can
    # assign different integers to the same category.
    le.fit(train_df[col])
    # transform() raises ValueError if the test column has a category unseen in training
    test_labelencode_df[col] = le.transform(test_labelencode_df[col])


In [None]:
# Confirm that the train_cols columns of the test frame have been encoded
test_labelencode_df.loc[:, train_cols].head(n=2)

In [None]:
# Keep only the numeric columns of both encoded frames, then print the two shapes
numonly_train = train_labelencode_df.select_dtypes(include=['number'])
numonly_test = test_labelencode_df.select_dtypes(include=['number'])
print(numonly_train.shape, numonly_test.shape, sep='\n')

In [None]:
# Show the dtype of every column kept in the numeric-only training frame
numonly_train.dtypes

In [None]:
# Show the dtype of every column kept in the numeric-only test frame
numonly_test.dtypes

In [None]:
# Preview the first rows of the numeric-only training frame
numonly_train.head(n=5)

In [None]:
# Preview the first rows of the numeric-only test frame
numonly_test.head(n=5)

In [None]:
# Split the numeric training frame into the response variable t ("SalePrice")
# and the explanatory variables x (every other column).
# NOTE(review): "Id" is not removed here, so it stays among the explanatory
# variables - confirm that this is intended.
t_train_test = numonly_train.loc[:, "SalePrice"]
x_train_test = numonly_train.drop(columns="SalePrice")
print(t_train_test.shape, x_train_test.shape, sep='\n')

In [None]:
# Hold out 30% of the labelled rows for evaluation; random_state fixes the shuffle
# so that the split is reproducible. (train_test_split is imported in the import
# cell at the top of the notebook.)
x_train, x_test, t_train, t_test = train_test_split(
    x_train_test,
    t_train_test,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Define the model.
# Every estimator tried so far is listed once; change MODEL_NAME to switch between
# them instead of commenting lines in and out. The estimator classes are imported
# in the import cell at the top of the notebook.
MODEL_NAME = 'linear'
candidate_models = {
    'linear': LinearRegression(),
    'ridge': Ridge(alpha=0.5),
    'lasso': Lasso(alpha=0.5),
    'elasticnet': ElasticNet(alpha=0.5),
    'pls': PLSRegression(n_components=7),
}
model = candidate_models[MODEL_NAME]

# Fit on the training split (fit() returns the estimator, which the cell displays)
model.fit(x_train, t_train)

In [None]:
# Check the model: score on the training split and on the held-out split
print('train score :', model.score(x_train, t_train))
print('test score :', model.score(x_test, t_test))

# Scores recorded from earlier runs. They are kept as comments: a bare string
# literal at the end of the cell would be echoed as the cell's output.
#
# LinearRegression:
#   train score : 0.8155574879687211
#   test score : 0.7351058165534113
# Ridge:
#   train score : 0.8155573141223752
#   test score : 0.735088744970327
# Lasso:
#   train score : 0.8155574853945742
#   test score : 0.7351045156467266
# ElasticNet:
#   train score : 0.810834612882133
#   test score : 0.7254993642937322
# PLS:
#   train score : 0.8154163702878845
#   test score : 0.7351558212264049

In [None]:
# Prediction on the competition test set.
# Select the test columns by the training feature columns, so the model sees the
# same features in the same order as during fit; a feature missing from the test
# frame raises a KeyError here instead of a less specific error inside predict().
t_pred = model.predict(numonly_test[x_train_test.columns])

In [None]:
# Look at the first 20 predictions
t_pred[0:20]

In [None]:
# Submit
# Copy the template: a plain assignment would only alias sample_submission, and
# writing the SalePrice column would then modify that frame too.
sub = sample_submission.copy()
# np.ravel() flattens the predictions in case the estimator returned a 2-D array;
# int() truncates each value to a whole number, as before.
sub['SalePrice'] = [int(price) for price in np.ravel(t_pred)]
sub.to_csv("submission.csv", index=False)

In [None]:
# Parameter w: the coefficients stored on the fitted model
model.coef_

In [None]:
# Parameter graph: one bar per explanatory variable, bar height = fitted coefficient
fig, ax = plt.subplots(figsize=(14, 5))
# np.ravel() also accepts a coef_ that is stored as a 2-D array with one row/column
ax.bar(x_train_test.columns, np.ravel(model.coef_))
ax.set(title='Model coefficients', xlabel='Explanatory variable', ylabel='Coefficient')
# Rotate the column names so that they do not overlap along the x axis
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout();

In [None]:
# Bias b: the intercept stored on the fitted model
model.intercept_