## Importing the packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading the dataset

In [2]:
train_dataset = pd.read_csv("./data/train.csv")
test_dataset = pd.read_csv("./data/test.csv")

## Dealing with NaN values

In [3]:
train_dataset.fillna(0,inplace=True)
test_dataset.fillna(0,inplace=True)

In [4]:
X_dataset = train_dataset.iloc[:,1:-1]
y_dataset = train_dataset.iloc[:,-1]

## One-hot encoding of categorical values

In [5]:
X_dataset = pd.get_dummies(X_dataset)

In [6]:
X = X_dataset.values
y = y_dataset.values

## Splitting into train and test sets

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=1)

## Fitting the model

In [8]:
# Model fitting
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Predicting the results

In [9]:
y_pred = regressor.predict(X_test)

In [10]:
print(regressor.score(X_train,y_train))

0.934829005117317


In [11]:
X_train.shape

(1168, 304)

In [12]:
X_dataset.shape

(1460, 304)

## Prediction on new data

In [13]:
X_new_dataset = pd.get_dummies(test_dataset.iloc[:,1:])

In [14]:
X_new_dataset.shape

(1459, 292)

In [15]:
# Get missing columns in the training test
missing_cols = set( X_dataset.columns ) - set( X_new_dataset.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_new_dataset[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
X_new_dataset = X_new_dataset[X_dataset.columns]

In [16]:
X_new_dataset.shape

(1459, 304)

In [17]:
X_test_set = X_new_dataset.values

In [18]:
y_test_predict = regressor.predict(X_test_set)

## Saving the output to a csv file

In [19]:
output = pd.DataFrame({"Id":test_dataset["Id"],
                     "SalePrice":y_test_predict})

In [20]:
output.head()

Unnamed: 0,Id,SalePrice
0,1461,122068.359367
1,1462,160974.196934
2,1463,187884.053401
3,1464,197994.861984
4,1465,208843.318294


In [21]:
output.to_csv("output.csv",index=False)