In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats import boxcox

In [49]:
# Split Data into numerical data and object data

def Split_DF(df):
    numeric_features = df.dtypes[data.dtypes != "object"].index
    object_features = df.dtypes[data.dtypes == "object"].index
    return df[numeric_features], df[object_features]


In [50]:
#skew => boxcox tranformation ( for numeric data)
def Handle_Skew(df):
    skewed_feats = df.apply(lambda x : skew(x)).sort_values(ascending  = True)
    high_skew = skewed_feats[abs(skewed_feats) > 0.5]
    skewed_features = high_skew.index
    for feat in skewed_features:
        pre = np.array(df[feat])
        pre = np.abs(pre + 1)
        tran_, _ = boxcox(pre)
        df[feat] = tran_
    return df 

In [51]:
# One hot encoding for object data

def one_hot_encoding(df):
    for col in df.columns:
        dummy_columns = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df, dummy_columns], axis=1)
        df.drop(columns=[col], inplace=True)
    col = df.columns
    df[col] = df[col].astype(int)
    return df


In [55]:
def Processing_Data(df):
    # ----- Handle Y ---------#
    objective = df['SalePrice'] 
    log_objective = np.log(objective) # log transform 

    #---------Handle X (feature) boxcox transform and onehotencoding------------#
    numeric_data, object_data = Split_DF(df)
    numeric_data = Handle_Skew(numeric_data)
    object_data = one_hot_encoding(object_data)
    data = pd.concat([numeric_data, object_data], axis = 1)

    #------Fill N/A = 0---------#
    data = data.fillna(0)

    # --------- Drop and add new feature -------#
    data = data.drop(['Id', 'SalePrice'], axis=1)
    data['TotalSF'] = (data['TotalBsmtSF'] 
                       + data['1stFlrSF'] 
                       + data['2ndFlrSF'])
    data['YrBltAndRemod'] = data['YearBuilt'] +data['YearRemodAdd']
    data['Total_sqr_footage'] = (data['BsmtFinSF1'] 
                                 + data['BsmtFinSF2'] 
                                 + data['1stFlrSF'] 
                                 + data['2ndFlrSF']
                                )
    data['Total_Bathrooms'] = (data['FullBath'] 
                               + (0.5 * data['HalfBath']) 
                               + data['BsmtFullBath'] 
                               + (0.5 * data['BsmtHalfBath'])
                              )
    data['Total_porch_sf'] = (data['OpenPorchSF'] 
                              + data['3SsnPorch'] 
                              + data['EnclosedPorch'] 
                              + data['ScreenPorch'] 
                              + data['WoodDeckSF']
                             )
    data['haspool'] = data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
    data['has2ndfloor'] = data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
    data['hasgarage'] = data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    data['hasbsmt'] = data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    data['hasfireplace'] = data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0) 
    return data , log_objective

In [56]:
data = pd.read_csv('../raw data/RawData.csv')

In [57]:
X, y= Processing_Data(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feat] = tran_


In [58]:
# Split Train - Test ( 80 - 20)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_, y_,test_size = .2, random_state = 0)

In [59]:
X_train.to_csv(r"C:\Users\longv\Prediction-House-Price\Data\handled data\X_train.csv", index = False)
X_test.to_csv(r"C:\Users\longv\Prediction-House-Price\Data\handled data\X_test.csv", index = False)
y_train.to_csv(r"C:\Users\longv\Prediction-House-Price\Data\handled data\y_train.csv", index = False)
y_test.to_csv(r"C:\Users\longv\Prediction-House-Price\Data\handled data\y_test.csv", index = False)
