In [None]:
# Install the Kaggle command-line client into the kernel's environment; the
# download cell below calls it as `kaggle competitions download`.
# NOTE(review): the version is not pinned, so a re-run may pick up a newer release.
%pip install -q kaggle

In [None]:
# Create the directory that the following cell copies kaggle.json into.
# -p keeps the command quiet if the directory already exists, so the cell can
# be re-run safely.
!mkdir -p ~/.kaggle

In [None]:
# Copy the Kaggle API token into ~/.kaggle/.
# Expects kaggle.json to be present in the current working directory.
!cp kaggle.json ~/.kaggle/
# The token is a credential: make it readable and writable by the current user
# only. The Kaggle CLI also warns when other users can read this file.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the archive of the house-prices-advanced-regression-techniques
# competition with the Kaggle CLI.
# Requires the API token (kaggle.json) to be in place under ~/.kaggle/.
# NOTE(review): the archive presumably lands in the current working directory;
# the unzip cell expects it under /content -- confirm when running outside Colab.
! kaggle competitions download -c house-prices-advanced-regression-techniques

In [None]:
# Unpack the competition archive into /content, which is where the loading
# cell reads train.csv from.
#   -o  overwrite existing files without asking, so re-running the notebook
#       neither waits on nor skips at the "replace file?" prompt
#   -d  name the extraction directory explicitly instead of relying on the
#       kernel's current working directory
!unzip -o /content/house-prices-advanced-regression-techniques.zip -d /content

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# Data Exploration

In [None]:

# Load train.csv into a DataFrame; the 'Id' column becomes the row index
# rather than a regular column.
data = pd.read_csv('/content/train.csv', index_col='Id')

In [None]:

# Report the size of the raw frame: first the shape tuple as-is, then the same
# two numbers (rows, columns) in sentence form.
print('data frame shape: ', data.shape)
print("the data frame contains %2d rows, and %d columns (attributes)" % data.shape)

In [None]:

# Summary statistics of the raw frame, shown as the cell's output
# (by default pandas summarises the numeric columns).
data.describe()

In [None]:

# Preview the first rows of the raw frame (pandas' default is five rows).
data.head()

# Data Cleaning

In [None]:

# List every column together with its dtype; to_string() renders the whole
# Series as text instead of the shortened default repr.
print(data.dtypes.to_string())

In [None]:

# Keep only the numeric columns; non-numeric attributes are left out of this
# analysis rather than encoded.
# Selecting with include='number' states the intent directly and does not rely
# on every non-numeric column happening to be stored with the `object` dtype.
num_data = data.select_dtypes(include='number')

In [None]:

# Show the dtypes of the columns that survived the numeric selection.
print(num_data.dtypes.to_string())

In [None]:

# Count the missing entries in each numeric column, then list only the columns
# that have at least one.
cols_with_nans = num_data.isna().sum()
print("number of NaN values for the training data frame :")
print(cols_with_nans.loc[cols_with_nans.gt(0)])

In [None]:

# Replace each missing value with the mean of its own column; num_data itself
# is left untouched and the result is stored as clean_data.
# NOTE(review): the means are computed over all rows of num_data, i.e. before
# the train/test split further down, so rows that later end up in the test set
# contribute to the imputation values. Consider deriving the fill values from
# the training rows only.
clean_data = num_data.fillna(num_data.mean())

In [None]:

# Repeat the missing-value count on the imputed frame; an empty listing means
# no NaNs are left.
cols_with_nans = clean_data.isna().sum()
print("number of NaN values for the training data frame :")
print(cols_with_nans.loc[cols_with_nans.gt(0)])

In [None]:

# Report the size of the cleaned (numeric, imputed) frame: the shape tuple
# first, then rows and columns in sentence form.
print('the shape of the data: ', clean_data.shape)
print('the data frame contains %d rows, and %d columns (attributes)' % clean_data.shape)

# Exploratory Data Analysis

In [None]:
def plot_reg(x_var, y_var, DataFrame):
    """Draw a seaborn regression plot of ``y_var`` against ``x_var`` and show it.

    Parameters
    ----------
    x_var
        Variable for the x-axis, forwarded to ``sns.regplot`` as ``x``
        (the notebook passes column names of ``DataFrame``).
    y_var
        Variable for the y-axis, forwarded to ``sns.regplot`` as ``y``.
    DataFrame
        Data source forwarded to ``sns.regplot`` as ``data``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    sns.regplot(data=DataFrame, x=x_var, y=y_var)
    plt.show()
In [None]:

# Collect the column labels of the cleaned frame as a plain Python list and
# print them, e.g. to pick variables for the plots below.
att_names = list(clean_data.columns)
print(att_names)

In [None]:

# Regression plot of SalePrice against OverallQual.
plot_reg(x_var='OverallQual', y_var='SalePrice', DataFrame=clean_data)

In [None]:

# Regression plot of SalePrice against YearBuilt.
plot_reg(x_var='YearBuilt', y_var='SalePrice', DataFrame=clean_data)

In [None]:

# Regression plot of SalePrice against LotFrontage.
plot_reg(x_var='LotFrontage', y_var='SalePrice', DataFrame=clean_data)

# Data Splitting

In [None]:

columns = clean_data.columns
features_names = columns[columns != 'SalePrice']
features = clean_data[features_names]
target = clean_data['SalePrice']

In [None]:

# Hold out 20% of the rows (test_size=0.2) for evaluation; random_state=42
# fixes the shuffle so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Data Preprocessing

In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to both the training and test rows.
# NOTE(review): X_train_s / X_test_s are not referenced by any later cell shown
# in this notebook, and `scaler` is rebound to a MinMaxScaler in the next cell.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

In [None]:

# Rescale every feature with a MinMaxScaler targeting the (0, 1) range. The
# scaler is fitted on the training rows only and then applied unchanged to the
# test rows; these rescaled arrays are what the regression model is trained
# and evaluated on.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
rescaledX_train = scaler.transform(X_train)
rescaledX_test = scaler.transform(X_test)

# Linear Regression Model

In [None]:

# Linear regression estimator with scikit-learn's default settings.
lr_model = LinearRegression()

In [None]:

# Fit the model on the min-max-rescaled training features and the training
# sale prices.
lr_model.fit(rescaledX_train, Y_train)

In [None]:

# Predict sale prices for the held-out rows, using the test features rescaled
# with the scaler fitted on the training rows.
lr_predictions = lr_model.predict(rescaledX_test)

# Scores and Results

In [None]:

# Evaluate the predictions on the held-out set.
# NOTE(review): for scikit-learn regressors `.score` is documented as R^2, so
# LinearRegression_SCR presumably duplicates LinearRegression_R2 -- confirm.
LinearRegression_SCR = lr_model.score(rescaledX_test, Y_test)
LinearRegression_R2 = r2_score(Y_test, lr_predictions)
LinearRegression_MAE = mean_absolute_error(Y_test, lr_predictions)
LinearRegression_MSE = mean_squared_error(Y_test, lr_predictions)
# RMSE is the square root of the MSE computed on the line above.
LinearRegression_RMSE = np.sqrt(LinearRegression_MSE)

In [None]:

# Gather the evaluation metrics into a two-column table (metric name, value)
# and display it as the cell's output.
metric_labels = ['Score', 'MAE', 'MSE', 'RMSE', 'R^2']
metric_values = [
    LinearRegression_SCR,
    LinearRegression_MAE,
    LinearRegression_MSE,
    LinearRegression_RMSE,
    LinearRegression_R2,
]
Report = pd.DataFrame({'Metric': metric_labels, 'Value': metric_values})
Report

In [None]:

# Scatter the predicted prices against the actual prices of the held-out rows.
# alpha=0.5 makes the markers semi-transparent.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(Y_test, lr_predictions, alpha=0.5)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title(" Actual Prices vs Predicted Prices")
plt.show()