In [81]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder




In [82]:
# Global variable area
# Shared accumulator for the MAE of each encoding approach. The approach
# functions below append one score each, in call order, and the main cell
# prints the list, reports its minimum and then clears it.
model_performance_list = []


In [83]:
# There are three ways we can handle categorical variables in data science:
# 1. Drop the categorical columns
# 2. Use ordinal encoding
# 3. Use one-hot encoding

def checkNull(dataset):
    """Print whether any column of ``dataset`` contains a missing value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The verdict is only written to stdout.
    """
    # One boolean per column: True when that column has at least one null.
    column_has_null = dataset.isnull().any()
    found_null = any(bool(entry) for entry in column_has_null)

    if not found_null:
        print("Data set is clear and ready to work with")
        return
    print("This data set contains null values")

def score_dataset(X_train,X_valid,y_train,y_valid):
    """Train a small random forest and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to evaluate it.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid``.
    """
    # Fixed seed and a fixed forest size of 10 trees keep the scores of the
    # different encoding approaches comparable with one another.
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    predictions = forest.fit(X_train, y_train).predict(X_valid)
    return mean_absolute_error(y_valid, predictions)

def drop_cata_variables(X_train,x_valid):
    """Approach 1: score the model with every categorical column removed.

    Prints the validation MAE and appends it to the module-level
    ``model_performance_list``.

    Parameters
    ----------
    X_train, x_valid : pandas.DataFrame
        Training and validation features. Columns of dtype ``object`` are
        excluded from the frames that get scored; the inputs themselves are
        left untouched.

    Notes
    -----
    The targets are not parameters: ``y_train`` and ``y_valid`` are read from
    module scope, so they must exist before this function is called.
    """
    X_train_removed_cata = X_train.select_dtypes(exclude='object')
    X_valid_removed_cata = x_valid.select_dtypes(exclude='object')

    print("MAE from Approach 1 (Drop categorical variables):")
    # Score once and reuse the value: the model is seeded, so a second call
    # would only retrain the forest to produce the same number.
    mae = score_dataset(X_train_removed_cata, X_valid_removed_cata, y_train, y_valid)
    print(mae)
    model_performance_list.append(mae)
    
def ordinal_style(X_train,x_valid,cata_cols):
    """Approach 2: score the model with categorical columns ordinal-encoded.

    Prints the validation MAE and appends it to the module-level
    ``model_performance_list``.

    Parameters
    ----------
    X_train, x_valid : pandas.DataFrame
        Training and validation features. Both are copied before encoding, so
        the caller's frames are not modified.
    cata_cols : list
        Names of the categorical columns to encode.

    Notes
    -----
    The targets are not parameters: ``y_train`` and ``y_valid`` are read from
    module scope, so they must exist before this function is called.
    NOTE(review): the encoder is created with default settings, so a category
    that appears only in ``x_valid`` is expected to make ``transform`` fail;
    confirm the chosen columns share their categories across both splits.
    """
    x_train_new = X_train.copy()
    x_valid_new = x_valid.copy()

    ordinal_obj = OrdinalEncoder()

    # Learn the category -> integer mapping from the training split only, then
    # apply that same mapping to the validation split.
    x_train_new[cata_cols] = ordinal_obj.fit_transform(x_train_new[cata_cols])
    x_valid_new[cata_cols] = ordinal_obj.transform(x_valid_new[cata_cols])

    print("MAE from Approach 2 (Ordinal Encoding):")
    # Score once and reuse the value: the model is seeded, so a second call
    # would only retrain the forest to produce the same number.
    mae = score_dataset(x_train_new, x_valid_new, y_train, y_valid)
    print(mae)
    model_performance_list.append(mae)
    

def one_hot_encoding_style(X_train,x_valid,cata_cols):
    """Approach 3: score the model with categorical columns one-hot encoded.

    Prints the validation MAE and appends it to the module-level
    ``model_performance_list``.

    Parameters
    ----------
    X_train, x_valid : pandas.DataFrame
        Training and validation features; neither is modified.
    cata_cols : list
        Names of the categorical columns to replace by indicator columns.

    Notes
    -----
    The targets are not parameters: ``y_train`` and ``y_valid`` are read from
    module scope, so they must exist before this function is called.
    """
    # scikit-learn renamed the dense-output switch from `sparse` to
    # `sparse_output` and later dropped the old keyword. Try the new spelling
    # first and fall back to the old one on releases that do not know it.
    try:
        oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # The encoder returns plain arrays, so re-attach the original row index;
    # otherwise the concat below would align on a fresh 0..n-1 index.
    oh_cols_train = pd.DataFrame(oh_encoder.fit_transform(X_train[cata_cols]),
                                 index=X_train.index)
    oh_cols_valid = pd.DataFrame(oh_encoder.transform(x_valid[cata_cols]),
                                 index=x_valid.index)

    # Remove categorical columns (they are replaced by the one-hot columns)
    num_x_train = X_train.drop(cata_cols, axis=1)
    num_x_valid = x_valid.drop(cata_cols, axis=1)

    # Add the newly generated indicator columns next to the remaining ones
    final_x_train = pd.concat([num_x_train, oh_cols_train], axis=1)
    final_x_valid = pd.concat([num_x_valid, oh_cols_valid], axis=1)

    # The indicator columns are labelled 0, 1, 2, ... while the original ones
    # have string names; recent scikit-learn releases reject frames with mixed
    # label types, so make every label a string. Labels do not affect the fit.
    final_x_train.columns = final_x_train.columns.astype(str)
    final_x_valid.columns = final_x_valid.columns.astype(str)

    print("MAE from Approach 3 (One-Hot Encoding):")
    # Score once and reuse the value: the model is seeded, so a second call
    # would only retrain the forest to produce the same number.
    mae = score_dataset(final_x_train, final_x_valid, y_train, y_valid)
    print(mae)
    model_performance_list.append(mae)

In [84]:
if __name__ == "__main__":
    # Start from an empty score list so that entries left behind by an
    # earlier, interrupted run of this cell cannot leak into the comparison.
    model_performance_list.clear()

    data = pd.read_csv('melb_data.csv')

    # The `Price` column is the target; every other column is a candidate feature.
    y = data.Price
    X = data.drop(['Price'],axis = 1)

    # y_train and y_valid stay at module scope on purpose: the approach
    # functions read them as globals when scoring.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)

    cols_with_missing = [col for col in X_train if X_train[col].isnull().any()]

    # Handle missing data the simplest way: drop, from both splits, every
    # column that has a missing value in the training split.
    x_train_removed = X_train.drop(cols_with_missing,axis = 1)
    x_valid_removed = X_valid.drop(cols_with_missing,axis = 1)

    # 'Cardinality' means the number of unique values in a column.
    # Select categorical columns with relatively low cardinality
    # (convenient but arbitrary threshold of fewer than 10 unique values).
    low_cardinal_cols = [col for col in x_train_removed.columns if x_train_removed[col].nunique() < 10 and x_train_removed[col].dtype == 'object']

    # Select the numerical columns
    numerical_cols = [col for col in x_train_removed.columns if x_train_removed[col].dtype in ['int64','float64']]

    # Keep selected columns only
    my_cols = low_cardinal_cols + numerical_cols

    # Reassign X_train and X_valid to the reduced feature sets
    X_train = x_train_removed[my_cols].copy()
    X_valid = x_valid_removed[my_cols].copy()

    cata_cols_list = [col for col in X_train if X_train[col].dtype == 'object']

    # Run the three approaches; each prints its MAE and appends it to
    # model_performance_list.
    drop_cata_variables(X_train,X_valid)
    ordinal_style(X_train,X_valid,cata_cols_list)
    one_hot_encoding_style(X_train,X_valid,cata_cols_list)

    print(model_performance_list)
    # Lower MAE is better, so the minimum is the best-performing approach.
    print("The most efficient result is :  ",min(model_performance_list))
    model_performance_list.clear()

MAE from Approach 1 (Drop categorical variables):
183550.22137772635
MAE from Approach 2 (Ordinal Encoding):
175062.2967599411
MAE from Approach 3 (One-Hot Encoding):
176703.63810751104
[183550.22137772635, 175062.2967599411, 176703.63810751104]
The most efficient result is :   175062.2967599411
