# importing modules

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

plt.style.use("fivethirtyeight")

# importing data

In [2]:
data = pd.read_csv("house_pred.csv")
test = pd.read_csv("test.csv")
test.drop("Id",axis = 1,inplace = True)
data.drop("Id",axis = 1,inplace=True)
data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


# data preprocessing

In [3]:
x = data.iloc[:,:-1]
y = data.iloc[:,-1].values
cleaned_data = x
test_clean_data = test

### cleaning data

#### if 75 percent of the rows of column are null drop it

In [4]:
def drop_columns_with_all_null(dataset):
    null_threshold = len(dataset) * 0.75
    cleaned_dataset = dataset.dropna(axis=1, thresh=null_threshold)
    return cleaned_dataset

In [5]:
cleaned_data = drop_columns_with_all_null(data)
cleaned_data

test_clean_data = drop_columns_with_all_null(test_clean_data)

### finding if any null values exist in that column

In [6]:
def print_null_counts(dataset):
    for column in dataset.columns:
        null_count = dataset[column].isnull().sum()
        print(f"Column '{column}': {null_count} null values")

def number_of_columns_with_atleast_1_null_values(dataset):
    columns_with_null_values = dataset.columns[dataset.isnull().any()].tolist()
    return columns_with_null_values


In [None]:
print_null_counts(cleaned_data)
print(f"number of columns with null values:{number_of_columns_with_atleast_1_null_values(cleaned_data)}")

Column 'MSSubClass': 0 null values
Column 'MSZoning': 0 null values
Column 'LotFrontage': 259 null values
Column 'LotArea': 0 null values
Column 'Street': 0 null values
Column 'LotShape': 0 null values
Column 'LandContour': 0 null values
Column 'Utilities': 0 null values
Column 'LotConfig': 0 null values
Column 'LandSlope': 0 null values
Column 'Neighborhood': 0 null values
Column 'Condition1': 0 null values
Column 'Condition2': 0 null values
Column 'BldgType': 0 null values
Column 'HouseStyle': 0 null values
Column 'OverallQual': 0 null values
Column 'OverallCond': 0 null values
Column 'YearBuilt': 0 null values
Column 'YearRemodAdd': 0 null values
Column 'RoofStyle': 0 null values
Column 'RoofMatl': 0 null values
Column 'Exterior1st': 0 null values
Column 'Exterior2nd': 0 null values
Column 'MasVnrArea': 8 null values
Column 'ExterQual': 0 null values
Column 'ExterCond': 0 null values
Column 'Foundation': 0 null values
Column 'BsmtQual': 37 null values
Column 'BsmtCond': 37 null valu

### replacing null values by mode of that column

In [8]:
def replace_null_with_mode(dataset):
    fixed_dataset = dataset.copy()

    for column in dataset.columns:
        mode_value = dataset[column].mode()[0]
        fixed_dataset[column].fillna(mode_value, inplace=True)

    return fixed_dataset



In [9]:
cleaned_data = replace_null_with_mode(cleaned_data)
test_clean_data = replace_null_with_mode(test_clean_data)
cleaned_data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,112,0,0,0,0,4,2010,WD,Normal,142125


In [10]:
print(f"number of columns with null values:{number_of_columns_with_atleast_1_null_values(cleaned_data)}")

number of columns with null values:[]


### encoding the categorical data

In [11]:
def encode_categorical_columns(dataset):
    categorical_columns = dataset.select_dtypes(include=['object']).columns
    encoded_dataset = pd.get_dummies(dataset, columns=categorical_columns,dtype=int)
    return encoded_dataset

def remove_categorical_columns(dataset):
    non_categorical_columns = dataset.select_dtypes(exclude=['object']).columns
    new_dataset = dataset[non_categorical_columns]
    return new_dataset


In [12]:
#creating a dataset that has all columns exceot the ones with categorical data
non_categorical_columns = remove_categorical_columns(cleaned_data)
non_categorical_columns
non_categorical_columns.drop("SalePrice",axis = 1,inplace = True)

non_categorical_columns_test = remove_categorical_columns(test_clean_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_categorical_columns.drop("SalePrice",axis = 1,inplace = True)


In [13]:
cleaned_data = encode_categorical_columns(cleaned_data)
cleaned_data

test_clean_data = encode_categorical_columns(test_clean_data)

test_clean_data


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,PavedDrive_Y,SaleType_COD,SaleType_CWD,SaleType_New,SaleType_WD,SaleCondition_Abnorml,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,20,78.0,10140,5,6,1975,1975,0.0,788,0,...,1,0,0,0,1,0,0,0,1,0
1,20,90.0,14684,7,7,1990,1991,234.0,485,177,...,1,0,0,0,1,0,0,0,1,0
2,20,60.0,8900,4,4,1966,1966,0.0,1056,0,...,1,0,0,0,1,0,0,0,1,0
3,20,70.0,9135,6,5,2003,2003,120.0,340,0,...,1,0,0,0,1,0,0,0,1,0
4,20,70.0,7763,5,7,1962,1980,0.0,504,108,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,1,0,0,0,1,0,0,0,1,0
252,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,1,0,0,0,1,0,0,0,1,0
253,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,1,0,0,0,1,0,0,0,1,0
254,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,1,0,0,0,1,0,0,0,1,0


# calculating collinearity / variance inflation factor in the data

In [14]:
def calculate_vif(dataset, vif_threshold=3):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = dataset.columns

    try:
        vif_values = [variance_inflation_factor(dataset.values, i) for i in range(dataset.shape[1])]
        vif_values = np.where(np.isinf(vif_values), vif_threshold, vif_values)
        vif_data["VIF"] = vif_values
    except Exception as e:
        print(f"Error calculating VIF: {e}")
        vif_data["VIF"] = vif_threshold

    return vif_data

In [15]:
vif_data = calculate_vif(non_categorical_columns)
vif_data

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,Variable,VIF
0,MSSubClass,4.625454
1,LotFrontage,15.736155
2,LotArea,2.629592
3,OverallQual,66.617715
4,OverallCond,41.835246
5,YearBuilt,19960.417762
6,YearRemodAdd,22352.188625
7,MasVnrArea,1.843036
8,BsmtFinSF1,3.0
9,BsmtFinSF2,3.0


### removing the columns with vif > 3

In [16]:
def remove_high_vif_columns(dataset, vif_data, threshold=3):
    high_vif_columns = vif_data[vif_data['VIF'] > threshold]['Variable'].tolist()
    new_dataset = dataset.drop(columns=high_vif_columns, axis=1)
    return new_dataset


In [17]:
final_data = remove_high_vif_columns(cleaned_data,vif_data)
final_test = remove_high_vif_columns(test_clean_data,vif_data)

In [18]:
final_data

Unnamed: 0,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,8450,196.0,706,0,150,856,856,854,0,1710,...,0,0,0,1,0,0,0,0,1,0
1,9600,0.0,978,0,284,1262,1262,0,0,1262,...,0,0,0,1,0,0,0,0,1,0
2,11250,162.0,486,0,434,920,920,866,0,1786,...,0,0,0,1,0,0,0,0,1,0
3,9550,0.0,216,0,540,756,961,756,0,1717,...,0,0,0,1,1,0,0,0,0,0
4,14260,350.0,655,0,490,1145,1145,1053,0,2198,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,0.0,0,0,953,953,953,694,0,1647,...,0,0,0,1,0,0,0,0,1,0
1456,13175,119.0,790,163,589,1542,2073,0,0,2073,...,0,0,0,1,0,0,0,0,1,0
1457,9042,0.0,275,0,877,1152,1188,1152,0,2340,...,0,0,0,1,0,0,0,0,1,0
1458,9717,0.0,49,1029,0,1078,1078,0,0,1078,...,0,0,0,1,0,0,0,0,1,0


# building the function to find the coefficents

In [19]:
#imports
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

### training the model

In [20]:
model = LinearRegression()

In [21]:
def select_common_columns_from_b(a, b):
    common_columns = set(a.columns) & set(b.columns)
    
    result_dataset = b[list(common_columns)].copy()
    
    return result_dataset


final_data = select_common_columns_from_b(final_test,final_data)
final_test = select_common_columns_from_b(final_data,final_test)
x = final_data
y = y

model.fit(x,y)

### model on test data

In [22]:
y_test_pred = model.predict(final_test)

In [27]:
y_test_pred.reshape(len(y_test_pred),1)

array([[146566.68269379],
       [258835.63057941],
       [146958.97316295],
       [222258.35754644],
       [135864.91715165],
       [308594.05016918],
       [209388.16955054],
       [171168.53235136],
       [ 79689.4271304 ],
       [148088.14126107],
       [134491.0892123 ],
       [110637.73490576],
       [106858.90791435],
       [232582.7524102 ],
       [ 78186.01739367],
       [ 97363.67161486],
       [ 90809.70395042],
       [123901.54743893],
       [165709.61030655],
       [167305.98524125],
       [197503.66716557],
       [147190.19787245],
       [235550.02988823],
       [144062.95754045],
       [359689.05524005],
       [153541.30184565],
       [190000.00000346],
       [118274.19446399],
       [ 82648.39668221],
       [132549.46556829],
       [146913.61152883],
       [177567.55480577],
       [173318.22862034],
       [204507.28040649],
       [126799.01626113],
       [245533.80295341],
       [207244.91992083],
       [242842.28274166],
       [1672