In [1]:
import numpy as np
import pandas as pd
import missingno as msn
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("/home/rockbot/Documents/Data_Structure_py/conAI/USA_Housing.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
dtypes: float64(6)
memory usage: 234.5 KB


In [3]:
df.head(10)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5
5,80175.75416,4.988408,6.104512,4.04,26748.42842,1068138.0
6,64698.46343,6.025336,8.14776,3.41,60828.24909,1502056.0
7,78394.33928,6.98978,6.620478,2.42,36516.35897,1573937.0
8,59927.66081,5.362126,6.393121,2.3,29387.396,798869.5
9,81885.92718,4.423672,8.167688,6.1,40149.96575,1545155.0


In [4]:
X = df.iloc[:,:5]
Y = df.iloc[:,5]

In [5]:
StdSc = StandardScaler()
X_scaled = StdSc.fit_transform(X)
print(X_scaled)

[[ 1.02865969 -0.29692705  0.02127433  0.08806222 -1.31759867]
 [ 1.00080775  0.02590164 -0.25550611 -0.72230146  0.40399945]
 [-0.68462915 -0.11230283  1.5162435   0.93084045  0.07240989]
 ...
 [-0.48723454  1.28447022 -2.17026949 -1.50025059 -0.29193658]
 [-0.05459152 -0.44669439  0.14154061  1.18205319  0.65111608]
 [-0.28831272  0.01521477 -0.19434166  0.07185495  1.04162464]]


In [6]:
X_new = np.insert(X_scaled, 0, 1, axis=1)
print(X_new)

[[ 1.          1.02865969 -0.29692705  0.02127433  0.08806222 -1.31759867]
 [ 1.          1.00080775  0.02590164 -0.25550611 -0.72230146  0.40399945]
 [ 1.         -0.68462915 -0.11230283  1.5162435   0.93084045  0.07240989]
 ...
 [ 1.         -0.48723454  1.28447022 -2.17026949 -1.50025059 -0.29193658]
 [ 1.         -0.05459152 -0.44669439  0.14154061  1.18205319  0.65111608]
 [ 1.         -0.28831272  0.01521477 -0.19434166  0.07185495  1.04162464]]


In [7]:
size = len(df.index)
fold_count = 5
chunk_size = size//5
fold_lst = []
total_chunk = list(df.index)
for x in range(fold_count):
    test_lst = np.arange(x*chunk_size,(x+1)*chunk_size)
    train_lst = np.arange(0,x*chunk_size)
    train_lst = np.concatenate((train_lst, np.arange((x+1)*chunk_size, size)))
    fold_lst.append((test_lst,train_lst))
len(fold_lst)

5

In [8]:
def solveBeta(X, Y):
    A = X.T.dot(X)
    B = np.linalg.inv(A)
    C = B.dot(X.T)
    beta = C.dot(Y)
    return beta

In [9]:
def r2_error(Y_predict, Y_test):
    err = Y_predict-Y_test
    sse = np.sum(err**2)
    Y_mean = np.mean(Y_test)
    tot_var = np.sum((Y_test-Y_mean)**2)
    r2e = 1-sse/tot_var
    return r2e

In [10]:
def predict(X, beta):
    Y_predict = X.dot(beta)
    return Y_predict

In [11]:
score_lst = []
beta_lst = []
for test_idx, train_idx in fold_lst:
    X_train,X_test,Y_train,Y_test = X_new[train_idx],X_new[test_idx],Y[train_idx],Y[test_idx]
    beta = solveBeta(X_train,Y_train)
    beta_lst.append(beta)
    Y_predict = predict(X_test, beta)
    R2err = r2_error(Y_predict, Y_test)
    score_lst.append(R2err)
print(score_lst)

[0.9175899480765107, 0.920301549640113, 0.9152429915320014, 0.9208503836977656, 0.9138111758717495]


In [12]:
max_score = max(score_lst)
max_idx = score_lst.index(max_score)
best_beta = beta_lst[max_idx]
print(best_beta)

[1.23144707e+06 2.29921558e+05 1.64523054e+05 1.19737507e+05
 1.12425659e+03 1.51317802e+05]


In [13]:
X_train,X_test,Y_train,Y_test = train_test_split(X_new,Y,test_size=0.3,random_state = 41)
Y_predict = predict(X_test, best_beta)
R2err = r2_error(Y_predict, Y_test)
print(R2err)

0.9194842237268729
