# Housing Data Analysis

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [24]:
housing = pd.read_csv("AmesHousing.tsv", delimiter = "\t")
housing.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [8]:
def transform_features(train):
    # Handling Missing Values
    ## Numerical columns with less than 5% missing data are filled with the group average
    numeric_cols = list(train.select_dtypes(include=['float64', 'int']).columns)
    count_nulls = train[numeric_cols].isnull().sum()
    five_percent = round(len(train)*0.05)
    col_mv_fivep = count_nulls[(count_nulls >= 1) & (count_nulls <= five_percent)].index
    ### setting missing values to group average
    train[col_mv_fivep] = train[col_mv_fivep].fillna(train[col_mv_fivep].mean())
    
    # Creating new useful features
    train["years_until_remod"] = train['Year Remod/Add'] - train['Year Built']
    
    # Dropping features
    train.drop([], axis = 1)
    return train

def select_features():
    return train["Gr Liv Area", "SalePrice"]

def train_and_test(data):
    # Dividing data into train and test
    train = data[:1460]
    test= data[1460:]

    # Fitting linear model
    df_sf = select_features()
    numeric_cols = list(df_sf.select_dtypes(include=['float64', 'int']).columns)
    lr = LinearRegression()
    lr.fit(df_sf[numeric_cols], df_sf["SalePrice"] )
    
    # Calculating RMSE
    predictions = lr.predict(test[numeric_cols])
    mse = mean_squared_error(test["SalePrice"], predictions)
    rmse = np.sqrt(mse)
    
    return rmse
    
    

In [35]:
train = housing[:1460]

numeric_cols = list(housing.select_dtypes(include=['float64', 'int']).columns)

count_nulls = train[numeric_cols].isnull().sum()

five_percent = round(len(train)*0.05)
col_mv_fivep = count_nulls[(count_nulls >= 1) & (count_nulls <= five_percent)].index
    
col_mv_fivep

Index(['Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF',
       'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath'],
      dtype='object')

In [38]:
train[col_mv_fivep] = train.loc[:,col_mv_fivep][train[col_mv_fivep].notnull()].mean()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



ValueError: Must have equal len keys and value when setting with an iterable

In [47]:
train.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [48]:
train["years_until_remod"] = train['Year Remod/Add'] - train['Year Built']
train["years_until_remod"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



0        0
1        0
2        0
3        0
4        1
5        0
6        0
7        0
8        1
9        0
10       1
11      15
12       0
13       0
14       0
15       0
16      17
17       0
18       0
19      10
20       0
21       0
22       0
23       0
24      37
25       0
26       0
27       0
28       0
29       0
        ..
1430     0
1431     0
1432    11
1433     1
1434     1
1435     0
1436     0
1437     0
1438     1
1439     1
1440     0
1441     0
1442     1
1443     1
1444     5
1445     0
1446     0
1447     0
1448     0
1449    26
1450     0
1451     0
1452     0
1453     0
1454     0
1455     0
1456     0
1457     1
1458     1
1459     1
Name: years_until_remod, Length: 1460, dtype: int64