In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

try:
    from sklearn.preprocessing import Imputer
except ImportError:
    # Imputer was removed in scikit-learn 0.22. SimpleImputer accepts the same
    # missing_values/strategy keywords, so alias it to keep the old name usable.
    Imputer = SimpleImputer

***
## __Initial Read-in and Dataset Observation__
***

In [None]:
# Display options so that every attribute of this wide dataset is shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Read the dataset in.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a configurable data directory.
df = pd.read_csv("D:\\School\\Summer_2019\\HousingPriceCompetition\\house-prices-advanced-regression-techniques\\train.csv")

# Shape of the design matrix showing that there are 81 attributes (not yet accounting for the fact that one is id) and 1460 observations
print(df.shape)

# Observation indicates several qualitative variables which must be OneHotEncoded in order to be properly studied.
# Uncomment to display the data type of each attribute:
# df.dtypes
# df.dtypes[[0][0]]  # This code allows for extraction of each individual data type.

# Display the top values of the dataset. Only the last expression of a cell is
# rendered, so this has to come last; placed mid-cell its result was discarded.
df.head()

In [None]:
# Keep only the non-object attributes; these are the ones corr() is run on.
dfVis = df.select_dtypes(exclude=['object'])

# Large canvas so the annotated coefficients in each heatmap square stay legible.
plt.figure(figsize=(24, 18))
sns.heatmap(dfVis.corr(), cmap='coolwarm', annot=True)

In [None]:
# Variables most strongly correlated with SalePrice (|correlation| >= 0.5):
# Pos: OverallQual, YearBuilt, YearRemodAdd,TotalBsmtSF, 1stFlrSF, GrLivArea, FullBath, TotRmsAbvGrd, GarageCars, GarageArea

# Variables with correlation between 0.4 and 0.5:
# MasVnrArea, Fireplaces, GarageYrBlt

In [None]:
# Select the attributes whose correlation with the response exceeds 0.25.
# The response is taken to be the LAST column of the correlation matrix
# (NOTE(review): presumably SalePrice -- confirm the column order of dfVis).

# Compute the correlation matrix once instead of once per loop iteration.
corrMatrix = dfVis.corr()

# Positional column slice: avoids integer-key lookup on a label-indexed Series,
# which recent pandas versions deprecate.
responseCorr = corrMatrix.iloc[:, -1]

# Object array whose slots start out as None; a slot stays None when its
# attribute is rejected, so the None entries can be filtered out afterwards.
contVars = np.empty(len(list(dfVis)), dtype=object)
for i in range(len(responseCorr)):
    if responseCorr.iloc[i] > 0.25:
        contVars[i] = corrMatrix.columns[i]

# Correlation of the first attribute with the response.
corrMatrix.iloc[0, -1]

In [None]:
# Discard the slots that were never filled (still None), then narrow dfVis
# down to the attributes that remain.
keepMask = np.array([name is not None for name in contVars], dtype=bool)
contVars = contVars[keepMask]
dfVis = dfVis.loc[:, contVars]

***
## __Data Preprocessing__
***

In [None]:
# Look at the first five rows of df again before preprocessing starts.
df.head(n=5)

In [None]:
# Every column that is not int64/float64 is treated as a qualitative attribute.
Qualitative_df = df.select_dtypes(
    exclude=['int64', 'float64'],
)

In [None]:
# Rebuild df column-wise from the retained dfVis attributes followed by the
# qualitative attributes.
df = pd.concat(
    objs=[dfVis, Qualitative_df],
    axis='columns',
)

In [None]:
# Frequency of each distinct level of the qualitative attribute 'Street'.
df.loc[:, 'Street'].value_counts()

In [None]:
# For each qualitative attribute, keep only the rows whose value in that column
# occurs more than once. The filter is applied column by column, so each later
# column is checked against the already-reduced frame.
for factor in Qualitative_df.columns:
    occursMoreThanOnce = df[factor].duplicated(keep=False)
    df = df.loc[occursMoreThanOnce]

In [None]:
# (rows, columns) of df after the single-occurrence rows were filtered out.
df.shape

In [None]:
# Drop Utilities because it has no variability.
# NOTE(review): the earlier comment also mentioned dropping the ID column, but
# only Utilities is dropped here -- confirm ID was already removed upstream.
df = df.drop(columns=['Utilities'])

In [None]:
# Response vector first, then the design matrix (everything except the response).
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])

In [None]:
# One-hot encode the qualitative variables automatically, storing the indicator
# columns as float64. Some variables may still need manual removal later.
X = pd.get_dummies(
    data=X,
    dtype=np.float64,
)

In [None]:
# Preview the first five rows of the encoded design matrix.
X.head(n=5)

In [None]:
# The regression model cannot be fit on NaN values, so every missing entry is
# replaced by the mean of its column. To count the missing values first:
# X.isna().sum()
# y.isna().sum()

# SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed in
# scikit-learn 0.22; the keyword arguments used here are unchanged.
# NOTE(review): the means are fitted on the full data set before the train/test
# split, so test rows influence the imputed values -- consider fitting on the
# training portion only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit_transform returns a NumPy array, so X is no longer a DataFrame after this.
X = imputer.fit_transform(X)

In [None]:
# Hold out 25% of the observations for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=55,
    test_size=0.25,
)

In [None]:
# Peek at the first three response values.
y.head(3)

***
## __Statistical Modeling__
***

In [None]:
# Expand the selected attributes into all polynomial terms up to degree 2.
# NOTE(review): contVars presumably still contains the response column (its
# self-correlation passes the selection threshold) -- confirm it should be
# part of the expansion.
# NOTE(review): df may still hold NaN values at this point; verify the
# transformer accepts them.
poly = PolynomialFeatures(degree=2)
poly.fit_transform(df.loc[:, contVars])