In [1]:
import cupy as np
import cudf as pd
from cuml.preprocessing import StandardScaler
from cuml.model_selection import train_test_split
from cuml.linear_model import LinearRegression
from cuml.metrics.regression import mean_squared_error
from cuml.preprocessing import OneHotEncoder

In [2]:
# Read the competition's train and test CSV files into cuDF DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [3]:
# Preview the first rows of the raw training frame.
df_train.head()

In [4]:
# Summarise the training frame's columns (dtypes and non-null counts).
df_train.info()

In [5]:
# Hand-picked predictor columns used for both the train and test frames.
# The order is significant only in that it fixes the column order downstream.
selected_features = [
    'LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'BsmtFinType1', 'BsmtFinSF1', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'BsmtFullBath', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageArea',
]

In [6]:
# Pull the target column out first, then narrow the training frame to the
# selected predictors. Both right-hand sides are evaluated against the
# original frame before either name is rebound.
y, df_train = df_train['SalePrice'], df_train[selected_features]

In [7]:
# Keep the test-set Ids aside, then narrow the test frame to the same
# predictors as the training frame. Both right-hand sides are evaluated
# against the original frame before either name is rebound.
test_ids, df_test = df_test['Id'], df_test[selected_features]

In [8]:
# Missing values in the training data: per-column null count ('Total') and
# null fraction ('Percent'), each sorted in descending order.
train_nulls = df_train.isnull()
total = train_nulls.sum().sort_values(ascending=False)
percent = (train_nulls.sum() / train_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1).rename(
    columns={0: 'Total', 1: 'Percent'}
)
missing_data.head(20)

In [9]:
# Missing values in the test data: per-column null count ('Total') and
# null fraction ('Percent'), each sorted in descending order.
test_nulls = df_test.isnull()
total = test_nulls.sum().sort_values(ascending=False)
percent = (test_nulls.sum() / test_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1).rename(
    columns={0: 'Total', 1: 'Percent'}
)
missing_data.head(10)

In [10]:
# Impute missing values in both train and test data.
#
# `Series.mode()` returns a Series rather than a scalar, and `fillna` aligns
# a Series argument on the index, so passing the mode Series directly can
# only fill rows whose index happens to match the mode's index. Take the
# first modal value as a scalar instead.
#
# Fill values are derived from the training data only and reused for the
# test data, so both frames are imputed with the same statistics.
fill_values = {
    column: df_train[column].mode().iloc[0]
    for column in df_train.columns
}

# Assign the filled frames back instead of calling `fillna(inplace=True)` on
# a column selection, which is not guaranteed to write through to the
# parent frame. Columns without nulls are left unchanged by `fillna`.
df_train = df_train.fillna(fill_values)
df_test = df_test.fillna(fill_values)

In [16]:
# One-hot encode every column of the training frame into a dense matrix.
# handle_unknown='ignore' is set so categories not seen during fit do not
# raise when the encoder is later applied to other data.
ohc = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohc.fit(df_train)
x = ohc.transform(df_train)

In [17]:
# Hold out a validation split; the fixed random_state keeps it reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    np.array(x),
    np.array(y),
    random_state=0,
)

In [18]:
# One LinearRegression per solver algorithm, identical in every other setting.
lr_eig, lr_svd, lr_qr, lr_svdqr, lr_svdj = (
    LinearRegression(fit_intercept=True, normalize=False, algorithm=solver)
    for solver in ("eig", "svd", "qr", "svd-qr", "svd-jacobi")
)

In [19]:
# Fit every solver variant on the same training split.
for regressor in (lr_eig, lr_svd, lr_qr, lr_svdqr, lr_svdj):
    regressor.fit(x_train, y_train)

In [20]:
# Validation-set predictions from each fitted solver variant.
preds_eig, preds_svd, preds_qr, preds_svdqr, preds_svdj = (
    fitted.predict(x_val)
    for fitted in (lr_eig, lr_svd, lr_qr, lr_svdqr, lr_svdj)
)

In [21]:
# Validation mean squared error per solver. Each label is paired with the
# predictions produced by that solver's own model, so the 'svd' row reports
# preds_svd and not another solver's output.
y_val_f64 = y_val.astype('float64')  # cast once; reused for every comparison
for label, preds in (
    ('eig: ', preds_eig),
    ('svd: ', preds_svd),
    ('qr: ', preds_qr),
    ('svdqr: ', preds_svdqr),
    ('svdjacobi: ', preds_svdj),
):
    print(label, mean_squared_error(preds, y_val_f64))

#### Due to the very high number of features, the svd-jacobi algorithm is forced.

In [22]:
# Show the training frame's column labels (for comparison with the test frame).
df_train.columns

In [23]:
# Show the test frame's column labels (for comparison with the training frame).
df_test.columns

In [24]:
# Encode the test frame with the already-fitted encoder (transform only, no refit).
x_test = ohc.transform(df_test)

In [25]:
# Final model: default LinearRegression settings, fitted on the full encoded
# training matrix and target (no validation hold-out).
model = LinearRegression().fit(x, y)

In [26]:
# Predict the target for the encoded test features with the final model.
predictions = model.predict(x_test)

In [27]:
# Place the test Ids next to the predictions and label the prediction
# column (default name 0) as 'SalePrice'.
sub = (
    pd.concat([test_ids, pd.DataFrame(predictions)], axis=1)
    .rename({0: 'SalePrice'}, axis=1)
)
sub.head()

In [28]:
# Write the submission frame to disk without the index column.
sub.to_csv('submission.csv', index=False)