In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
data = pd.read_csv("HousePrices.csv")
data

Unnamed: 0,Price,SqFt,Bed,Bath,Offers
0,114300,1790,2,2,2.0
1,114200,2030,4,2,3.0
2,114800,1740,3,2,1.0
3,94700,1980,3,2,
4,119800,2130,3,3,3.0
...,...,...,...,...,...
123,119700,1900,3,3,3.0
124,147900,2160,4,3,3.0
125,113500,2070,2,2,
126,149900,2020,3,3,1.0


In [3]:
data.describe()

Unnamed: 0,Price,SqFt,Bed,Bath,Offers
count,128.0,128.0,128.0,128.0,111.0
mean,130427.34375,2000.9375,3.023438,2.445312,2.414414
std,26868.770371,211.572431,0.725951,0.514492,0.995156
min,69100.0,1450.0,2.0,2.0,1.0
25%,111325.0,1880.0,3.0,2.0,2.0
50%,125950.0,2000.0,3.0,2.0,2.0
75%,148250.0,2140.0,3.0,3.0,3.0
max,211200.0,2590.0,5.0,4.0,5.0


In [4]:
data.isnull().sum()

Price      0
SqFt       0
Bed        0
Bath       0
Offers    17
dtype: int64

## Listwise Deletion

In [5]:
data1 = data.dropna()

In [6]:
ols = sm.OLS(data1["Price"], sm.add_constant(data1.iloc[:,1:]))
lm = ols.fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.683
Model:                            OLS   Adj. R-squared:                  0.671
Method:                 Least Squares   F-statistic:                     57.09
Date:                Sat, 05 Nov 2022   Prob (F-statistic):           1.34e-25
Time:                        15:37:39   Log-Likelihood:                -1220.2
No. Observations:                 111   AIC:                             2450.
Df Residuals:                     106   BIC:                             2464.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -9814.6271   1.31e+04     -0.749      0.4

## Mean Imputation

In [7]:
data2 = pd.read_csv("HousePrices.csv")
data2['Offers'] = data2['Offers'].fillna(data2['Offers'].mean())
data2.isnull().sum()

Price     0
SqFt      0
Bed       0
Bath      0
Offers    0
dtype: int64

In [8]:
data.iloc[8,:]

Price     119200.0
SqFt        2110.0
Bed            4.0
Bath           2.0
Offers         NaN
Name: 8, dtype: float64

In [9]:
data2.iloc[8,:]

Price     119200.000000
SqFt        2110.000000
Bed            4.000000
Bath           2.000000
Offers         2.414414
Name: 8, dtype: float64

In [10]:
ols1 = sm.OLS(data2["Price"], sm.add_constant(data2.iloc[:,1:]))
lm1 = ols1.fit()
print(lm1.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.636
Model:                            OLS   Adj. R-squared:                  0.625
Method:                 Least Squares   F-statistic:                     53.84
Date:                Sat, 05 Nov 2022   Prob (F-statistic):           3.76e-26
Time:                        15:37:39   Log-Likelihood:                -1421.8
No. Observations:                 128   AIC:                             2854.
Df Residuals:                     123   BIC:                             2868.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.175e+04   1.39e+04     -0.844      0.4

## Regression Based Imputation

In [11]:
data_test = data[data['Offers'].isnull()==True]
data_train = data[data['Offers'].isnull()==False]

olsr = sm.OLS(data_train[["Offers"]], sm.add_constant(data_train[["SqFt","Bed", "Bath"]]))
lmr = olsr.fit()

data_test["Offers"]= lmr.predict(sm.add_constant(data_test[["SqFt","Bed", "Bath"]]))
data3 = pd.concat([data_train,data_test])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test["Offers"]= lmr.predict(sm.add_constant(data_test[["SqFt","Bed", "Bath"]]))


In [12]:
ols2 = sm.OLS(data3["Price"], sm.add_constant(data3.iloc[:,1:]))
lm2 = ols2.fit()
print(lm2.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.628
Model:                            OLS   Adj. R-squared:                  0.616
Method:                 Least Squares   F-statistic:                     51.97
Date:                Sat, 05 Nov 2022   Prob (F-statistic):           1.47e-25
Time:                        15:37:39   Log-Likelihood:                -1423.2
No. Observations:                 128   AIC:                             2856.
Df Residuals:                     123   BIC:                             2871.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.563e+04   1.41e+04     -1.107      0.2

## Stochastic Regression Based Imputation

In [13]:
data_test1 = data[data['Offers'].isnull()==True]
data_train1 = data[data['Offers'].isnull()==False]

olss = sm.OLS(data_train1[["Offers"]], sm.add_constant(data_train1[["SqFt","Bed", "Bath"]]))
lms = olss.fit()

mean = lms.predict(sm.add_constant(data_test1[["SqFt","Bed", "Bath"]]))
std = lms.resid.std()
rand_pred = np.random.normal(size = data_test1.shape[0], loc = mean, scale = std)
data_test1["Offers"] = rand_pred
data4 = pd.concat([data_train1,data_test1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test1["Offers"] = rand_pred


In [14]:
data_test1

Unnamed: 0,Price,SqFt,Bed,Bath,Offers
3,94700,1980,3,2,1.799358
8,119200,2110,4,2,1.923565
12,102600,1910,3,2,1.322989
21,113800,2000,3,2,-0.388935
33,139600,2280,5,3,3.351091
34,117800,2000,2,2,4.009231
47,90300,2050,3,2,2.407172
77,176500,2280,4,3,4.113852
89,97800,2010,2,2,3.390598
100,103200,2010,3,2,1.72219


In [15]:
ols3 = sm.OLS(data4["Price"], sm.add_constant(data4.iloc[:,1:]))
lm3 = ols3.fit()
print(lm3.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.585
Model:                            OLS   Adj. R-squared:                  0.571
Method:                 Least Squares   F-statistic:                     43.28
Date:                Sat, 05 Nov 2022   Prob (F-statistic):           1.27e-22
Time:                        15:37:39   Log-Likelihood:                -1430.3
No. Observations:                 128   AIC:                             2871.
Df Residuals:                     123   BIC:                             2885.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.751e+04    1.5e+04     -1.169      0.2

## Multiple Imputation

In [16]:
data_test2 = data[data['Offers'].isnull()==True]
data_train2 = data[data['Offers'].isnull()==False]

olsm = sm.OLS(data_train2[["Offers"]], sm.add_constant(data_train2[["SqFt","Bed", "Bath"]]))
lmm = olsm.fit()

mean = lmm.predict(sm.add_constant(data_test2[["SqFt","Bed", "Bath"]]))
std = lmm.resid.std()

model = []
coef = np.zeros((5,7))
for i in range(0,6):
    rand_pred = np.random.normal(size = data_test2.shape[0], loc = mean, scale = std)
    data_test2["Offers"] = rand_pred
    data4 = pd.concat([data_train2,data_test2])
    lm = sm.OLS(data4["Price"], sm.add_constant(data4.iloc[:,1:])).fit()
    model.append(lm)
    coef[:,i] = lm.params
    
coef.mean(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test2["Offers"] = rand_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test2["Offers"] = rand_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test2["Offers"] = rand_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

array([-13227.49755753,     46.67119577,   7566.72291637,  13945.35203317,
       -10392.07653573])