In [14]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

#sklearn imports
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor



%matplotlib inline 

# Raise the displayed-column limit to 50 so that more of a wide frame is shown
pd.set_option('display.max_columns',50)
# Allow output lines of up to 100 characters
pd.set_option('display.width',100)

In [15]:
# Read the training and test sets from the working directory
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [16]:
# Tag every row with its origin so the two frames can be told apart
# once they are concatenated into a single frame.
df_train['DataType'] = 'Train'
df_test['DataType'] = 'Test'

# 'Id' is dropped from both working frames.  Rebinding the result (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-removed column.
df_train = df_train.drop(columns='Id', errors='ignore')
df_test = df_test.drop(columns='Id', errors='ignore')

In [17]:
# Stack the training rows on top of the test rows in one combined frame
df_master = pd.concat(objs=[df_train, df_test], axis=0)

In [18]:
# Display the column labels of the training frame
df_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
       'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorch

In [19]:
# Per-variable summary table (variable name, non-null count, dtype)
dfCheatSheet = pd.read_csv(filepath_or_buffer='DataCheatSheet.csv')

In [20]:
# Raw variable descriptions; the first row of the file is the header
dfDesciptOnly = pd.read_csv(filepath_or_buffer='data_descriptions_only.csv', header=0)

In [21]:
# Preview the first rows of the cheat sheet
dfCheatSheet.head()

Unnamed: 0,Variable,Count,Type
0,MSSubClass,1460,int64
1,MSZoning,1460,object
2,LotFrontage,1201,float64
3,LotArea,1460,int64
4,Street,1460,object


In [22]:
# Split the raw "<Variable>:<description text>" strings into two columns.
# Approach taken from:
# https://stackoverflow.com/questions/14745022/how-to-split-a-column-into-two-columns
#
# n=1 splits on the FIRST colon only, so a description that itself contains a
# colon stays intact, and expand=True returns a two-column frame directly
# instead of relying on unpacking the `.str` accessor, which current pandas
# no longer supports.
description_parts = dfDesciptOnly['Description'].str.split(':', n=1, expand=True)
dfDesciptOnly['Variable'] = description_parts[0]
dfDesciptOnly['Description'] = description_parts[1]

In [23]:
# Preview the descriptions after splitting out the 'Variable' column
dfDesciptOnly.head()

Unnamed: 0,Description,Variable
0,Identifies the type of dwelling involved in t...,MSSubClass
1,Identifies the general zoning classification ...,MSZoning
2,Linear feet of street connected to property,LotFrontage
3,Lot size in square feet,LotArea
4,Type of road access to property,Street


In [24]:
# Preview the cheat sheet again before merging it with the descriptions
dfCheatSheet.head()

Unnamed: 0,Variable,Count,Type
0,MSSubClass,1460,int64
1,MSZoning,1460,object
2,LotFrontage,1201,float64
3,LotArea,1460,int64
4,Street,1460,object


In [25]:
# Attach each variable's description to its cheat-sheet row (inner join on 'Variable')
dfCheatFinal = dfCheatSheet.merge(dfDesciptOnly, how='inner', on='Variable')

In [26]:
dfCheatFinal.head()
# TODO: long descriptions are cut off with "..." in the output; find a way to show the
# full text or wrap it (pandas' 'display.max_colwidth' option looks relevant - confirm).

Unnamed: 0,Variable,Count,Type,Description
0,MSSubClass,1460,int64,Identifies the type of dwelling involved in t...
1,MSZoning,1460,object,Identifies the general zoning classification ...
2,LotFrontage,1201,float64,Linear feet of street connected to property
3,LotArea,1460,int64,Lot size in square feet
4,Street,1460,object,Type of road access to property


In [None]:
# Columns that hold numeric measurements (areas, counts, years, prices).
# 'BsmtFinType2' and 'KitchenQual' were removed from this list: in the Ames
# housing data they are categorical rating codes stored as strings, so they do
# not belong in a list of numeric variables.
numeric_variables = [
    # lot and construction
    'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    # basement areas
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    # living areas
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    # room and bathroom counts
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    # garage
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    # outdoor areas
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea',
    # sale information and target
    'MiscVal', 'MoSold', 'YrSold', 'SalePrice',
]

# size of land 'LotArea'
# yearBuilt
# yearRemodAdd
# Square Feet 1stFlrSF, 2ndFlrSF
# bedrooms BedroomAbvGr
# full baths BsmtFullBath FullBath
# half baths BsmtHalfBath HalfBath
# totalrooms TotRmsAbvGrd (does not include bathrooms)

In [None]:
# Feature engineering on the combined frame
df_master = df_master.assign(
    # bathroom totals: basement count plus the regular count
    TotFullBath=df_master['BsmtFullBath'].add(df_master['FullBath']),
    TotHalfBath=df_master['BsmtHalfBath'].add(df_master['HalfBath']),
    # years elapsed, measured against the fixed reference year 2011
    YrSinceRemod=df_master['YearRemodAdd'].rsub(2011),
    HouseAge=df_master['YearBuilt'].rsub(2011),
    # floor area: first floor plus second floor
    TotSF=df_master['1stFlrSF'].add(df_master['2ndFlrSF']),
)

In [None]:
# Compact column set used for the first models; 'SalePrice' is listed last
basic_variables = [
    'LotArea',
    'HouseAge',
    'YrSinceRemod',
    'TotSF',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
    'SalePrice',
]

In [None]:
# Split the engineered master frame back into its train and test parts
df_train = df_master.loc[df_master['DataType'].eq('Train')]
df_test = df_master.loc[df_master['DataType'].eq('Test')]

In [None]:
# Pairwise scatter plots of the basic variables in the training data
sns.pairplot(data=df_train[basic_variables])

In [None]:
# Correlation between bedroom count and sale price
df_train.loc[:, ['BedroomAbvGr', 'SalePrice']].corr()

In [None]:
# Correlation between total room count and sale price
df_train.loc[:, ['TotRmsAbvGrd', 'SalePrice']].corr()

In [None]:
# Restrict both frames to the basic variable set
df_basic = df_train.loc[:, basic_variables]
df_basic_test = df_test.loc[:, basic_variables]

In [None]:
# Modelling inputs: predictors and target for training, predictors only for test
x_train = df_basic.drop(columns='SalePrice')
y_train = df_basic['SalePrice']

x_test = df_basic_test.drop(columns='SalePrice')

In [None]:
#Machine Learning
# Alias for the training target; the cells visible here keep using y_train directly
y = y_train

In [None]:
# SalePrice is a continuous target, so a regression tree is required; a
# classifier would treat every distinct price as its own class label.
# random_state pins the tie-breaking between equally good splits so that
# re-running the notebook reproduces the same tree.
clf_tree = DecisionTreeRegressor(random_state=0)

In [None]:
# Fit the tree model on the training predictors and target
clf_tree.fit(X=x_train, y=y_train)

In [None]:
# Tree predictions for the test predictors
pred_tree = clf_tree.predict(X=x_test)

In [None]:
# Show the tree model's predictions
print(pred_tree)

In [None]:
# Linear-regression model fitted on the same predictors, then applied to the test set
clf_reg = linear_model.LinearRegression().fit(x_train, y_train)
pred_reg = clf_reg.predict(x_test)

In [None]:
# Show the linear-regression predictions
print(pred_reg)

In [None]:
# Re-read the raw test file; its 'Id' column was dropped from the working frames earlier.
# NOTE(review): this rebinds df_test to the raw data, replacing the
# feature-engineered test frame built earlier in the notebook.
df_test = pd.read_csv("test.csv")

In [None]:
test_ids = df_test['Id']

# Submission layout: one row per test Id paired with its tree prediction.
# (A linear-regression submission would pair the same ids with pred_reg.)
sub_tree1 = dict(Id=test_ids, SalePrice=pred_tree)
sub_tree = pd.DataFrame(data=sub_tree1)

In [None]:
# Write the tree submission to disk without the index column
sub_tree.to_csv(path_or_buf="Tree.csv", index=False)

In [None]:
#https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6