In [18]:
# python version: python3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the train and test CSV files into pandas DataFrames.
# NOTE(review): the head() previews in this notebook show an "Unnamed: 0" column
# holding 0, 1, ... — presumably a saved row index read back as a feature; confirm
# whether it should be kept or consumed with index_col=0.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

## DATA EXPLORATION & PREPARATION

In [None]:
# Summarise the training data: how many columns there are of each dtype, and
# which columns contain missing values.
print("let's see what type of objects we have in our train data")
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# dtypes.value_counts() reports the same per-dtype column counts.
print(train.dtypes.value_counts())
print("###################################################################")
print("Let's check if there are any null or nan values in our train data")
# isnull() and isna() are aliases in pandas, so the two listings below are
# expected to be identical; the last line prints their element-wise comparison.
print(train.columns[train.isnull().any()])
print(train.columns[train.isna().any()])
print(train.columns[train.isnull().any()] == train.columns[train.isna().any()])

### part a

In [3]:
# Features: every column except the target. Target: the SalePrice column on its own.
train_x_a = train.drop("SalePrice", axis=1)
train_y = train.loc[:, "SalePrice"]

In [4]:
train_x_a.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,0,2,1,3,1,8,0,2,548,2
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,1,2,0,3,1,6,1,2,460,5


In [5]:
train_y.head(2)

0    208500
1    181500
Name: SalePrice, dtype: int64

### part b

In [6]:
# Indexing train.columns with a boolean mask yields an Index(...), not a list, so
# turn the labels of every column that has at least one missing value into strings.
nan_columns = list(map(str, train.columns[train.isnull().any()]))
# info() lists the dtype of each of those columns — they need to be numeric for imputation.
train.loc[:, nan_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
LotFrontage    827 non-null float64
MasVnrArea     994 non-null float64
dtypes: float64(2)
memory usage: 15.7 KB


In [7]:
# Every column listed in nan_columns is numeric, so median imputation applies.
# Start again from `train` (not train_x_a) so this part does not depend on part a.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22
# (sklearn.impute.SimpleImputer replaces it); this cell needs the older API.
train_x_b = train.drop("SalePrice", axis=1)
imputer = Imputer(strategy="median").fit(train_x_b[nan_columns])
# Overwrite the incomplete columns with their median-filled values.
train_x_b[nan_columns] = imputer.transform(train_x_b[nan_columns])

In [8]:
print(nan_columns)

['LotFrontage', 'MasVnrArea']


In [9]:
train_x_b.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,0,2,1,3,1,8,0,2,548,2
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,1,2,0,3,1,6,1,2,460,5


In [10]:
# now, no nan values left in train_x_b
train_x_b.columns[train_x_b.isna().any()]

Index([], dtype='object')

### part c

In [13]:
# Collect the distinct dtypes that occur across the columns of the train data.
print(list(set(train.dtypes)))

[dtype('float64'), dtype('int64'), dtype('O')]


In [12]:
# Keep only the names of the columns whose dtype is not numeric.
nonnumeric_colnames = list(train_x_b.select_dtypes(exclude=["number"]).columns)
nonnumeric_colnames

['Street']

In [14]:
# Only one column ('Street') is non-numeric; encode its labels as integers.
# The column is taken from train_x_b so this step follows on from part b and
# no longer reaches back to train_x_a (imputation only touched numeric columns,
# so the string values are the same in both frames).
labelEncoder = LabelEncoder()
nonnumeric_cat = train_x_b[nonnumeric_colnames]
# fit_transform is given a flat 1-D array, hence ravel() on the single-column frame.
nonnumeric_cat_encoded = labelEncoder.fit_transform(nonnumeric_cat.values.ravel())
labelEncoder.classes_

array(['Grvl', 'Pave'], dtype=object)

In [15]:
# labelEncoder.classes_ holds the distinct category labels found in the encoded
# column (values such as 'Grvl'/'Pave', not column names of the train data).
# They are used later as the column names of the one-hot encoded frame.
categorical_columns = labelEncoder.classes_
print(categorical_columns)

['Grvl' 'Pave']


### part d

In [16]:
# One-hot encode the integer-coded categorical column and swap it into the
# feature frame in place of the original string column.
oneHotEncoder = OneHotEncoder()
# The codes are 1-D; reshape(-1, 1) turns them into a single-column 2-D array.
nonnumeric_cat_1hot = oneHotEncoder.fit_transform(nonnumeric_cat_encoded.reshape(-1, 1))
# Densify the encoder output and label each indicator column with its category,
# reusing train_x_b's index so the join below lines up row for row.
nonnumeric_cat_1hot_encoded = pd.DataFrame(data=nonnumeric_cat_1hot.toarray(),
                                           columns=categorical_columns,
                                           index=train_x_b.index)
# Drop by explicit column names (previously the DataFrame itself was passed as
# `columns`, which only worked because iterating a DataFrame yields its labels),
# and call join as a regular method instead of through the class.
train_x_d = train_x_b.drop(columns=nonnumeric_colnames).join(nonnumeric_cat_1hot_encoded)

In [17]:
train_x_d.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Grvl,Pave
0,60,65.0,8450,7,5,196.0,706,150,856,856,...,1,3,1,8,0,2,548,2,0.0,1.0
1,20,80.0,9600,6,8,0.0,978,284,1262,1262,...,0,3,1,6,1,2,460,5,0.0,1.0


### part e

In [19]:
# Standardise every column of train_x_d with a one-step pipeline, then wrap the
# resulting array back into a DataFrame carrying the same column and row labels.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])
train_x_e = pd.DataFrame(num_pipeline.fit_transform(train_x_d),
                         index=train_x_d.index.tolist(),
                         columns=train_x_d.columns)


In [20]:
train_x_e.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Grvl,Pave
0,0.073872,-0.235351,-0.196474,0.6329,-0.529618,0.464035,0.589782,-0.939672,-0.486827,-0.802481,...,1.230454,0.178216,-0.208547,0.935889,-0.939129,0.316364,0.357524,-1.601265,-0.063372,0.063372
1,-0.873204,0.475965,-0.095659,-0.090414,2.177118,-0.576236,1.204925,-0.638291,0.475863,0.280104,...,-0.773542,0.178216,-0.208547,-0.307817,0.600426,0.316364,-0.063938,-0.485919,-0.063372,0.063372


## LINEAR REGRESSION TO PREDICT HOUSE PRICES

### part f

In [26]:
# Fit a linear regression on the scaled features, then report the square root of
# the mean squared error of its predictions on the same training rows.
lr_model = LinearRegression().fit(train_x_e, train_y)
lin_rmse = np.sqrt(mean_squared_error(y_true=train_y, y_pred=lr_model.predict(train_x_e)))
lin_rmse

32249.62077347292

### part g

In [None]:
# TODO: the right-hand side is missing, so this cell is a SyntaxError as written.
# NOTE(review): the name suggests an MSE score on the training data — confirm the intended computation.
train_mse_score =
print(train_mse_score)

In [None]:
# TODO: the right-hand side is missing, so this cell is a SyntaxError as written.
# NOTE(review): presumably an average over several MSE values — confirm which ones.
average_mse = 
print(average_mse)

### part h

In [None]:
# Summarise the test data: how many columns there are of each dtype, and which
# columns contain missing values.
print("let's see what type of objects we have in our test data")
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# dtypes.value_counts() reports the same per-dtype column counts.
print(test.dtypes.value_counts())
print("###################################################################")
# Let's check if there are any null or nan values in our test data.
# isnull() and isna() are aliases in pandas, so the two listings below are
# expected to be identical; the last line prints their element-wise comparison.
print(test.columns[test.isnull().any()])
print(test.columns[test.isna().any()])
print(test.columns[test.isnull().any()] == test.columns[test.isna().any()])

In [None]:
# TODO: both right-hand sides are missing, so this cell is a SyntaxError as written.
# NOTE(review): presumably mirrors part a (features = test without "SalePrice",
# target = test["SalePrice"]) — confirm against the assignment.
test_x = 
test_y = 

In [None]:
test_x.head(2)

In [None]:
test_y.head(2)

### part i

In [None]:
# TODO: the right-hand side is missing, so this cell is a SyntaxError as written.
# NOTE(review): presumably lr_model's predictions for the test features; the test
# frame would first need the same preparation as the training features — confirm.
predicted_values = 

In [None]:
print(predicted_values[10:13])

In [None]:
# TODO: the right-hand side is missing, so this cell is a SyntaxError as written.
# NOTE(review): presumably the MSE between test_y and predicted_values — confirm.
test_mse_score =
print(test_mse_score)

## CLASSIFICATION MODEL TO PREDICT HOUSE PRICE CATEGORY

### part j

# TODO: both right-hand sides are missing, so this cell is a SyntaxError as written.
# NOTE(review): assigning here rebinds train_y (used by part f to fit lr_model) and
# test_y (set in part h); re-running the part f cell afterwards would fit on the new train_y.
train_y = 
test_y =

In [None]:
train_y.head(2)

In [None]:
test_y.head(2)

### part k

In [None]:
# TODO: the right-hand side is missing, so this cell is a SyntaxError as written.
# NOTE(review): the import cell visible at the top brings in no classifier yet.
model = 

### part l

In [None]:
# TODO: all five right-hand sides are missing, so this cell is a SyntaxError as written.
# NOTE(review): a variable named confusion_matrix would shadow
# sklearn.metrics.confusion_matrix if that function is imported under its own
# name — consider a different variable name.
accuracy = 
precision = 
recall = 
f1 = 
confusion_matrix =

In [None]:
print(confusion_matrix)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 

### part m

In [None]:
# TODO: the right-hand side is missing, so this cell is a SyntaxError as written.
# NOTE(review): this rebinds predicted_values, the name already assigned in part i.
predicted_values = 

In [None]:
print(predicted_values[20:23])

In [None]:
# TODO: all five right-hand sides are missing, so this cell is a SyntaxError as written.
# NOTE(review): these rebind the metric names assigned in part l; a variable named
# confusion_matrix would also shadow sklearn.metrics.confusion_matrix if that
# function is imported under its own name.
accuracy = 
precision = 
recall = 
f1 = 
confusion_matrix =

In [None]:
print(confusion_matrix)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 