In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Load the training split, then show its schema, the number of fully
# duplicated rows and the first few records.
train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(train_csv_path)

df.info()
duplicate_count = df.duplicated().sum()
print(f'duplicate rows: {duplicate_count}')
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Share of missing entries per column: number of nulls divided by number of rows.
null_mask = df.isnull()
missing_values = null_mask.sum() / null_mask.count()
missing_values.sort_values(ascending=False).head(20)

PoolQC          0.995205
MiscFeature     0.963014
Alley           0.937671
Fence           0.807534
MasVnrType      0.597260
FireplaceQu     0.472603
LotFrontage     0.177397
GarageYrBlt     0.055479
GarageCond      0.055479
GarageType      0.055479
GarageFinish    0.055479
GarageQual      0.055479
BsmtFinType2    0.026027
BsmtExposure    0.026027
BsmtQual        0.025342
BsmtCond        0.025342
BsmtFinType1    0.025342
MasVnrArea      0.005479
Electrical      0.000685
Id              0.000000
dtype: float64

In [None]:
# Names of every column that has at least one missing value in the training data.
has_missing = missing_values.gt(0)
dropped = missing_values.loc[has_missing].index

In [None]:
def preprocess(df, drop_cols=None, categories=None):
    """Drop unwanted columns and integer-encode the text columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and nothing is returned.
    drop_cols : iterable of str, optional
        Columns to remove. Defaults to the notebook-level ``dropped`` index.
        Names that are not present in ``df`` are skipped, so re-running the
        function on an already processed frame does not raise.
    categories : dict, optional
        Mapping ``column -> sorted list of category values``. For a column
        that is not yet in the mapping, the vocabulary is learned from ``df``
        and stored in the mapping; for a column that is already there, the
        stored vocabulary is used. Passing the same dict for the training and
        the test frame keeps the integer codes consistent between them, e.g.::

            vocab = {}
            preprocess(df, categories=vocab)
            preprocess(df_test, categories=vocab)

    Notes
    -----
    A code is the position of the value in the sorted vocabulary. Missing
    values and values that are not in the vocabulary are encoded as ``-1``.
    """
    if drop_cols is None:
        drop_cols = dropped
    if categories is None:
        # No shared vocabulary supplied: learn one from this frame only.
        categories = {}

    df.drop(columns=drop_cols, errors='ignore', inplace=True)

    for col in df.columns:
        values = df[col]
        # Cover both classic object columns and pandas' dedicated string dtype.
        is_text = (
            pd.api.types.is_object_dtype(values)
            or pd.api.types.is_string_dtype(values)
        )
        if not is_text:
            continue

        if col not in categories:
            categories[col] = sorted(values.dropna().unique())

        # Encoding against a fixed vocabulary (rather than whatever values this
        # frame happens to contain) is what keeps codes comparable across frames.
        codes = pd.Categorical(values, categories=categories[col]).codes
        df[col] = codes.astype('int64')

In [None]:
# Clean the training frame in place: removes the columns listed in `dropped`
# and replaces text columns with integer codes.
preprocess(df)

In [None]:
# The target is the sale price; the features are every remaining column
# except the row identifier.
y = df['SalePrice']
X = df.drop(columns=['Id', 'SalePrice'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)
print(
    f'X_train shape: {X_train.shape}\n'
    f'y_train shape: {y_train.shape}\n'
    f'X_test shape:  {X_test.shape}\n'
    f'y_test shape:  {y_test.shape}'
)

X_train shape: (1168, 60)
y_train shape: (1168,)
X_test shape:  (292, 60)
y_test shape:  (292,)


## Linear Regression Model

In [None]:
# Fit an ordinary least-squares model on the training split and score it
# on the held-out split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

holdout_score = model.score(X_test, y_test)
holdout_mae = mean_absolute_error(y_test, y_pred)
holdout_mse = mean_squared_error(y_test, y_pred)

print(f'Model Score: {holdout_score}')
print(f'Mean Absolute Error: {holdout_mae}')
print(f'Mean Squared Error:  {holdout_mse}')

Model Score: 0.8428724220021332
Mean Absolute Error: 21489.393733093995
Mean Squared Error:  1205219779.3888767


## Output

In [None]:
# Load the competition test split (no SalePrice column) and show its schema.
test_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [None]:
# Apply the same cleaning to the test frame, in place.
# NOTE(review): this call derives the text-to-integer encoding from df_test
# alone, so the codes are not guaranteed to match the ones the model was
# trained on when the two frames contain different sets of values.
preprocess(df_test)

In [None]:
# Missing values that remain in the test frame after preprocessing, worst columns first.
test_null_counts = df_test.isna().sum()
test_null_counts.sort_values(ascending=False).head(20)

BsmtFullBath    2
BsmtHalfBath    2
TotalBsmtSF     1
BsmtUnfSF       1
BsmtFinSF2      1
BsmtFinSF1      1
GarageCars      1
GarageArea      1
HalfBath        0
KitchenAbvGr    0
BedroomAbvGr    0
Id              0
FullBath        0
TotRmsAbvGrd    0
GrLivArea       0
LowQualFinSF    0
KitchenQual     0
Functional      0
1stFlrSF        0
Fireplaces      0
dtype: int64

In [None]:
# Learn the fill values from the training features only, then apply them to
# the test features, so the test set is transformed with statistics from the
# same data the model was fitted on.
imputer = SimpleImputer()
imputer.fit(X_train)

# Select the model's feature columns by name, in training order; this also
# leaves out `Id` and fails loudly if an expected column is missing.
test_features = df_test[X_train.columns]

# Keep a DataFrame so the model receives the same feature names it was fitted with.
X_new = pd.DataFrame(
    imputer.transform(test_features),
    columns=test_features.columns,
    index=test_features.index,
)

In [None]:
# Predict sale prices for the competition test set and sanity-check the row count.
y_pred = model.predict(X_new)
prediction_count = len(y_pred)

print(f'number of rows: {prediction_count}')
y_pred

number of rows: 1459




array([113369.5052003 , 161708.07748291, 168286.76898218, ...,
       164626.49966942, 101074.22375213, 247366.15251729])

In [None]:
# Use the provided sample submission as the template for the output file.
sample_submission_path = '/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         1459 non-null   int64  
 1   SalePrice  1459 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 22.9 KB


In [None]:
# Replace the placeholder prices with the model's predictions (matched by row position).
submission = submission.assign(SalePrice=y_pred)

In [None]:
# Write the submission to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)