In [187]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_rows = 100

## Feature engineering

In [188]:
# Creating the datasets
house_data_train = pd.read_csv("train.csv")
house_data_test = pd.read_csv("test.csv")
# Target prices:
y = house_data_train['SalePrice']
house_data_train = house_data_train.drop(columns=['SalePrice'])
house_data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [189]:
# Values and meanings from the first column, MSSubClass
MSSubClass_dict = { 
        20:	'1-STORY 1946 & NEWER ALL STYLES', 
        30:	'1-STORY 1945 & OLDER', 
        40:	'1-STORY W/FINISHED ATTIC ALL AGES',
        45:	'1-1/2 STORY - UNFINISHED ALL AGES', 
        50:	'1-1/2 STORY FINISHED ALL AGES', 
        60:	'2-STORY 1946 & NEWER', 
        70:	'2-STORY 1945 & OLDER', 
        75:	'2-1/2 STORY ALL AGES', 
        80:	'SPLIT OR MULTI-LEVEL', 
        85:	'SPLIT FOYER', 
        90:	'DUPLEX - ALL STYLES AND AGES', 
       120:	'1-STORY PUD (Planned Unit Development) - 1946 & NEWER', 
       150:	'1-1/2 STORY PUD - ALL AGES', 
       160:	'2-STORY PUD - 1946 & NEWER', 
       180:	'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER', 
       190:	'2 FAMILY CONVERSION - ALL STYLES AND AGES'}

# Replacing numbers with column names
house_data_train['MSSubClass'].replace(MSSubClass_dict, inplace=True)
house_data_test['MSSubClass'].replace(MSSubClass_dict, inplace=True)

In [190]:
categorical_columns = list(house_data_train.dtypes[house_data_train.dtypes == np.object].index)
numerical_columns = list(house_data_train.dtypes[house_data_train.dtypes != np.object].index)

In [191]:
# Numerical and categorical dataframes
numerical_train = house_data_train[numerical_columns]
numerical_test = house_data_test[numerical_columns]
categorical_train = house_data_train[categorical_columns]
categorical_test = house_data_test[categorical_columns]

In [192]:
# Meanings of the Condition1 and Condition2 column values
cond = {
    'Artery':	'Adjacent to arterial street',
    'Feedr':	'Adjacent to feeder street',	
    'Norm':	'Normal',	
    'RRNn':	'Within 200 of North-South Railroad',
    'RRAn':	'Adjacent to North-South Railroad',
    'PosN':	'Near positive off-site feature--park, greenbelt, etc.',
    'PosA':	'Adjacent to postive off-site feature',
    'RRNe':	'Within 200 of East-West Railroad',
    'RRAe':	'Adjacent to East-West Railroad'
}
# Getting the full name of the conditions
categorical_train['Condition1'].replace(cond, inplace=True)
categorical_train['Condition2'].replace(cond, inplace=True)
categorical_test['Condition1'].replace(cond, inplace=True)
categorical_test['Condition2'].replace(cond, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [193]:
def sum_1hots(df, column1, column2):
    '''This function is for getting only one hot encoder for repeated 
    conditions in some columns in the dataset '''
    c1 = pd.get_dummies(df[column1])
    c2 = pd.get_dummies(df[column2])
    # Sum both one hot encoders. Exploring this data, I found some 2's, that makes 
    # no sense, so, I replaced them with 1's
    c1c2 = c1.add(c2, fill_value=0).replace(2,1)
    df = df.drop(columns=[column1, column2])
    return df, c1c2

In [194]:
categorical_train, cond_col_train = sum_1hots(categorical_train, 'Condition1', 'Condition2')
categorical_train, ext_col_train = sum_1hots(categorical_train, 'Exterior1st', 'Exterior2nd')
categorical_test, cond_col_test = sum_1hots(categorical_test, 'Condition1', 'Condition2')
categorical_test, ext_col_test = sum_1hots(categorical_test, 'Exterior1st', 'Exterior2nd')

### Handling zero values

In [195]:
categorical_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 40 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSSubClass     1460 non-null   object
 1   MSZoning       1460 non-null   object
 2   Street         1460 non-null   object
 3   Alley          91 non-null     object
 4   LotShape       1460 non-null   object
 5   LandContour    1460 non-null   object
 6   Utilities      1460 non-null   object
 7   LotConfig      1460 non-null   object
 8   LandSlope      1460 non-null   object
 9   Neighborhood   1460 non-null   object
 10  BldgType       1460 non-null   object
 11  HouseStyle     1460 non-null   object
 12  RoofStyle      1460 non-null   object
 13  RoofMatl       1460 non-null   object
 14  MasVnrType     1452 non-null   object
 15  ExterQual      1460 non-null   object
 16  ExterCond      1460 non-null   object
 17  Foundation     1460 non-null   object
 18  BsmtQual       1423 non-null

In [196]:
# Replacing with a string the nan values. While exploring what kind of nan values we have in the data, 
# is ok to fill with a Not Aplicable condition. (no basement, no pool, etc)
categorical_train.fillna('Not Aplicable', inplace=True)
categorical_test.fillna('Not Aplicable', inplace=True)

In [197]:
# With the numerical columns, we only have one that is special, the 
# year when the garage was built, we fill with the average year
avg_garage_year = numerical_train['GarageYrBlt'].mean()
numerical_train['GarageYrBlt'].fillna(avg_garage_year, inplace=True)
numerical_test['GarageYrBlt'].fillna(avg_garage_year, inplace=True)
# The other values can be 0
numerical_train.fillna(0, inplace=True)
numerical_test.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


### Creating and appending the One Hot Encoders for each categorical column

In [198]:
def drop_concat(df, one_hot, column):
    '''This function accepts a dataframe and a column name. It will get the 
    one hot encoder for categorical column, then, drop it from the original
    dataframe and cancatenate the one hot encoder'''
    df = df.drop(columns=[column])
    df = pd.concat([df, one_hot], axis=1)
    return df

In [199]:
def complete_categorical(df1, df2):
    '''Sometimes there are values not present in the test and train set, and
    in the one hot encoder this result in different number of columns. In 
    order to have control over it, this function completes those columns with 0'''
    cdf1 = df1.columns
    cdf2 = df2.columns
    for not_in_df2 in cdf1.difference(cdf2):
        df2[not_in_df2] = 0
    for not_in_df1 in cdf2.difference(cdf1):
        df1[not_in_df1] = 0
    df1 = df1.reindex(sorted(df1.columns), axis=1)
    df2 = df2.reindex(sorted(df2.columns), axis=1)
    return df1, df2

In [200]:
categorical_train, bsmt_col_train = sum_1hots(categorical_train, 'BsmtFinType1', 'BsmtFinType2')
categorical_test, bsmt_col_test = sum_1hots(categorical_test, 'BsmtFinType1', 'BsmtFinType2')


In [201]:
# Using the custom functions to get completes categorical train and test set
for column in categorical_test.columns:
    OH_train = pd.get_dummies(categorical_train[column])
    OH_test = pd.get_dummies(categorical_test[column])
    OH_train, OH_test = complete_categorical(OH_train, OH_test)
    categorical_train = drop_concat(categorical_train, OH_train, column)
    categorical_test = drop_concat(categorical_test, OH_test, column)

In [202]:
# Appending the sum of one hot encoders created previously and completing missing columns
hot_sums_train = [bsmt_col_train, cond_col_train, ext_col_train]
hot_sums_test = [bsmt_col_test, cond_col_test, ext_col_test]
for i in range(len(hot_sums_train)):
    complete_hot_sums_train, complete_hot_sums_test = complete_categorical(hot_sums_train[i], hot_sums_test[i])
    categorical_train = pd.concat([categorical_train, complete_hot_sums_train], axis=1)
    categorical_test = pd.concat([categorical_test, complete_hot_sums_test], axis=1)

In [206]:
len(categorical_train.columns)

262

In [207]:
len(categorical_test.columns)

262

### Normalizing the numerical data

In [208]:
from sklearn import preprocessing
#Train set
xnumerical_train = numerical_train.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
xnumerical_train_scaled = min_max_scaler.fit_transform(xnumerical_train)

# Test set
xnumerical_test = numerical_test.values #returns a numpy array
xnumerical_test_scaled = min_max_scaler.fit_transform(xnumerical_test)
# Filling NaN values with 0
xnumerical_test_scaled[np.isnan(xnumerical_test_scaled)] = 0

### Getting the whole dataset

In [209]:
processed_train = np.append(xnumerical_train_scaled, categorical_train.values, axis=1)
processed_test = np.append(xnumerical_test_scaled, categorical_test.values, axis=1)

### Correlations 

In [210]:
# norm_train = pd.DataFrame(data = processed_train, columns=categorical_train.columns.append(numerical_train.columns))
# norm_train = pd.concat([norm_train, y], axis=1)
# norm_train

In [211]:
# corr_matrix = norm_train.corr()
# (corr_matrix['SalePrice']
# .sort_values(ascending=False)
# .where(((corr_matrix['SalePrice'] < -0.05) | (corr_matrix['SalePrice'] > 0.05)), other=np.nan)
# .dropna()
# )

## Machine Learning Models

In [212]:
from sklearn import svm

regr = svm.SVR()
regr.fit(processed_train, y.values)
predictions_train = regr.predict(processed_train)
predictions_svm = regr.predict(processed_test)
predictions_svm

array([162963.56851397, 163006.75946828, 163086.07895499, ...,
       162994.69052088, 163010.38607215, 163087.70421863])

In [213]:
np.mean(predictions_train / y.values)

1.0586732478812362

In [214]:
import csv
j = 1461
with open('predictions.csv', 'w') as p:
    writer = csv.writer(p)
    writer.writerow(['Id','SalePrice'])
    for pred in predictions:
        writer.writerow([j,pred])
        j += 1



NameError: name 'predictions' is not defined

In [None]:
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(max_iter=5000, learning_rate='adaptive')
sgd.fit(processed_train, y.values)
predictions_sgd_train = sgd.predict(processed_train)
# predictions_sgd = sgd.predict(processed_test)
# predictions_sgd

In [None]:
np.max(predictions_sgd_train / y.values)

In [None]:
import csv
j = 1461
with open('predictions2.csv', 'w') as p:
    writer = csv.writer(p)
    writer.writerow(['Id','SalePrice'])
    for pred in predictions_sgd:
        writer.writerow([j,pred])
        j += 1

In [None]:
categorical_columns = list(house_data_train.dtypes[house_data_train.dtypes == np.object])