In [1]:
import numpy as np #import biblioteki do obliczeń na macierzach (i tensorach)
import pandas as pd #import biblioteki do przetwarzania ramek z danymi
import matplotlib.pyplot as plt

In [2]:
def data_processing(data,names_of_x,name_of_y,number_of_parts=5,seed=123):
    data = data.sample(data.shape[0],random_state=seed) #losowa zmiana kolejności wierszy
    dataX = np.array(data.loc[:,names_of_x]) #wybranie odpowiednich kolumn
    dataX = np.c_[np.ones(dataX.shape[0]),dataX] #dodanie kolumny jedynek
    dataY = np.array(data.loc[:,name_of_y]) #wektor wartości
    
    #listy z danymi i wartościami cen
    trainX = [0]*number_of_parts
    trainY = [0]*number_of_parts
    cvX = [0]*number_of_parts
    cvY = [0]*number_of_parts
    for i in range(number_of_parts):
        trainX[i] = np.r_[dataX[0:int(data.shape[0]*i/number_of_parts),:],
                      dataX[int(data.shape[0]*(i+1)/number_of_parts):data.shape[0],:]]
        trainY[i] = np.r_[dataY[0:int(data.shape[0]*i/number_of_parts)],
                      dataY[int(data.shape[0]*(i+1)/number_of_parts):data.shape[0]]]
        cvX[i] = dataX[int(data.shape[0]*i/number_of_parts):int(data.shape[0]*(i+1)/number_of_parts),:]
        cvY[i] = dataY[int(data.shape[0]*i/number_of_parts):int(data.shape[0]*(i+1)/number_of_parts)]
    return(trainX, trainY, cvX, cvY)

def square_mean(X, Y, model):
    return np.mean((X @ model.transpose() - Y) **2) ** (1/2)

def linear_regression(trainX, trainY, cvX, cvY):
    model = np.linalg.inv(trainX.transpose() @ trainX) @ trainX.transpose() @ trainY
    return square_mean(cvX,cvY,model)

def training(trainX, trainY, cvX, cvY, number_of_parts=5):
    error = 0
    for i in range(len(trainX)):
        error += linear_regression(trainX[i], trainY[i], cvX[i], cvY[i])
    return error/number_of_parts

def mean_error(data,names_of_x,name_of_y,number_of_parts=5,seed=123,number_of_repeats=100):
    error = 0
    for i in range(number_of_repeats):
        (trainX, trainY, cvX, cvY) = data_processing(train,names_of_x,name_of_y,number_of_parts,seed+i)
        error += training(trainX, trainY, cvX, cvY)
    return error/number_of_repeats

In [3]:
#train = pd.read_csv('train.csv')
#mean_error(train,('LotArea'),'SalePrice')
def training2(trainX, trainY, cvX, cvY, x_names, number_of_parts=5):
    error = 0
    for i in range(len(trainX)):
        error += linear_regression(trainX[i][:,x_names], trainY[i], cvX[i][:,x_names], cvY[i])
    return error/number_of_parts

def find_features(data,names_of_x,name_of_y,number_of_parts=5,seed=123,number_of_repeats=100):
    if type(names_of_x)==str: m = 1
    else: m = len(names_of_x)
    errors = [0]*2**m
    for i in range(number_of_repeats):
        (trainX, trainY, cvX, cvY) = data_processing(train,names_of_x,name_of_y,number_of_parts,seed+i)
        for j in range(1,2**m):
            x_indices = [] #which names we use
            true_false_list = ('{0:0'+str(m)+'b}').format(j) #the bollean string which names we should add to x_indices
            for k in range(m):
                if true_false_list[k] == '1': x_indices.append(k)
            errors[j] += training2(trainX, trainY, cvX, cvY, x_indices, number_of_parts)/number_of_repeats
    return errors

In [5]:
def CV_division(dataX,dataY,number_of_parts=5):
    trainX = [0]*number_of_parts
    trainY = [0]*number_of_parts
    cvX = [0]*number_of_parts
    cvY = [0]*number_of_parts
    for i in range(number_of_parts):
        trainX[i] = np.r_[dataX[0:int(data.shape[0]*i/number_of_parts),:],
                      dataX[int(data.shape[0]*(i+1)/number_of_parts):data.shape[0],:]]
        trainY[i] = np.r_[dataY[0:int(data.shape[0]*i/number_of_parts)],
                      dataY[int(data.shape[0]*(i+1)/number_of_parts):data.shape[0]]]
        cvX[i] = dataX[int(data.shape[0]*i/number_of_parts):int(data.shape[0]*(i+1)/number_of_parts),:]
        cvY[i] = dataY[int(data.shape[0]*i/number_of_parts):int(data.shape[0]*(i+1)/number_of_parts)]
    return(trainX, trainY, cvX, cvY)

def training3(trainX, trainY, cvX, cvY, column_to_throw, number_of_parts=5):
    error = 0
    for i in range(len(trainX)):
        error += linear_regression(np.delete(trainX[i],column_to_throw,1), trainY[i], np.delete(cvX[i],column_to_throw,1), cvY[i])
    return error/number_of_parts

def SBS(dataX,dataY,depth,number_of_parts=5):
    m = dataX.shape[1]
    (trainX, trainY, cvX, cvY) = CV_division(dataX,dataY,number_of_parts)
    error = 0
    for i in range(m-depth):
        cur_error = np.inf
        smallest_error = np.inf
        column_to_throw = 0
        for j in range(m-i):
            cur_error = training3(trainX, trainY, cvX, cvY, j)
            if cur_error<smallest_error:
                smallest_error = cur_error
                column_to_throw = j
        for j in range(number_of_parts):
            trainX[i] = np.delete(trainX[i],column_to_throw,1)
            cvX = np.delete(cvX[i],column_to_throw,1)
        error = smallest_error
    return(error)

In [6]:
train = pd.read_csv('train.csv')
errors = find_features(train,('LotArea','YearBuilt','GrLivArea','OverallQual','TotalBsmtSF'),'SalePrice')
pd.read_csv('train.csv').sort_values(by='SalePrice',ascending=False).reset_index(drop=True).columns
errors

[0,
 52891.001562095444,
 56275.450579223514,
 48290.429637995716,
 77849.91762816977,
 48747.5321111652,
 55967.27768341839,
 42977.34910715674,
 136815.42328590088,
 51747.88679175552,
 55923.08341716125,
 47959.25649752419,
 75734.41933906451,
 46922.19893994372,
 55706.106397074676,
 42189.5406177284,
 79270.14059498113,
 48555.35032731712,
 56141.15998489189,
 42585.6693225605,
 67570.08884419451,
 48115.36138744172,
 46822.122279740615,
 40837.14531846107,
 77069.54600452716,
 46688.18024092561,
 55869.53615871734,
 41772.69228697753,
 65030.81113718823,
 46110.10900623327,
 46289.14603044514,
 39968.121718407296]

### Przydzielone zadania
- Wojtek Kretowicz - implementation using scikit learn
- Karol Pysiak - remove Nas (must have), one-hot-encoding for strings (colud have)
- Tomek Makowski - class based on the function here