In [32]:
# To access preprocessy module. Required in .ipynb files
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [33]:
import pandas as pd
import numpy as np
import matplotlib

import mlxtend
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
import seaborn as sns
from sklearn import preprocessing

import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

from preprocessy.scaling import Scaler
from preprocessy.resampling import Split
import time

from sklearn.datasets import load_iris, load_boston, load_breast_cancer, load_diabetes
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error,classification_report, r2_score
from sklearn.model_selection import train_test_split

from preprocessy.handlenullvalues import NullValuesHandler
from preprocessy.resampling import Split

np.random.seed(101)

In [34]:
melb_data = pd.read_csv('../datasets/handling_null_values/melb_data.csv')
melb_data_copy1 = melb_data
melb_data_copy2 = melb_data

dtf_1 = pd.DataFrame(columns = ['Accuracy', 'Time'])
dtf_2 = pd.DataFrame(columns = ['Accuracy', 'Time'])
dtf_3 = pd.DataFrame(columns = ['Accuracy', 'Time'])

melb_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [35]:
# Consider Price as Target property and others as Predictors 
melb_target = melb_data.Price

melb_predictors = melb_data.drop(['Price'], axis=1)
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])
col_names = list(melb_numeric_predictors.columns)
col_names

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Propertycount']

## MinMaxScaler
-  smaller standard deviations through the process

In [36]:
imputed_df = melb_data_copy2.select_dtypes(exclude=['object']).fillna(melb_data_copy2.select_dtypes(exclude=['object']).mean())

mm_scaler = preprocessing.MinMaxScaler()
df_mm = mm_scaler.fit_transform(imputed_df.drop(['Price'], axis =1))
df_mm = pd.DataFrame(df_mm, columns=col_names)
X_train, X_test, y_train, y_test = train_test_split(df_mm, imputed_df['Price'], test_size=0.3, random_state=69)

print(X_train[:2])
print(X_test[:2])
print(y_train[:5])
print(y_test[:5])

          Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  \
10872  0.333333  0.565489  0.024565       0.2     0.250  0.2  0.000924   
6603   0.111111  0.106029  0.185261       0.1     0.125  0.1  0.000000   

       BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
10872      0.003404   0.987835   0.403194    0.152420       0.234241  
6603       0.003414   0.965937   0.418658    0.519479       0.193028  
          Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  \
10059  0.222222  0.218295  0.020471      0.15     0.125  0.1  0.000744   
6844   0.222222  0.037422  0.054248      0.15     0.250  0.1  0.000000   

       BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
10059      0.003414   0.935139   0.504457    0.354880       0.304378  
6844       0.003414   0.935139   0.501214    0.493166       0.305453  
10872     478000.0
6603      555000.0
162       743000.0
7072     1651000.0
7420      455000.0
Name: Price, dtype: float64
10059

In [37]:
params = {"df": melb_data_copy1.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "MinMaxScaler",  "fill_missing":  "mean"}
target_col = "Price"

NullValuesHandler().execute(params)
params["X"] = params.pop("df")
Split().train_test_split(params)
# this params has X, test_size, train_df, test_df, type
# To get: target_col
params["train_df"] = params.pop('train')
params["test_df"] = params.pop('test')
params["columns"] = list(params["train_df"].columns)
params["target_col"] = target_col
Scaler().execute(params)

# print(params.keys())
y_train = params["train_df"][["Price"]]
X_train = params["train_df"].drop(target_col,axis =1)
X_test = params["test_df"].drop(target_col,axis =1)
y_test = params["test_df"][["Price"]]

print(X_train[:2])
print(X_test[:2])
print(y_train[:5])
print(y_test[:5])

         Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  \
4074  0.428571  0.565489  0.024565       0.2     0.250  0.2  0.000924   
4075  0.142857  0.106029  0.185261       0.1     0.125  0.1  0.000000   

      BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
4074      0.003404   0.951872   0.403194    0.113268       0.234241  
4075      0.003414   0.855615   0.418658    0.516228       0.193028  
      Rooms  Distance  Postcode  Bedroom2  Bathroom    Car  Landsize  \
0  0.222222  0.221987  0.020471       0.3  0.166667  0.125  0.014828   
1  0.222222  0.038055  0.054248       0.3  0.333333  0.125  0.000000   

   BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
0      0.132955   0.935139   0.549173    0.354880       0.299798  
1      0.132955   0.935139   0.545502    0.493166       0.300880  
          Price
4074   478000.0
4075   555000.0
4076   743000.0
4077  1651000.0
4078   455000.0
       Price
0   518000.0
1  1110000.0
2  1225000.0
3   

In [38]:
def preprocessy_score_dataset(params, target_col):

    # params = {"df": melb_data_copy1.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "MinMaxScaler",  "fill_missing":  "mean"}

    start = time.time()
    NullValuesHandler().execute(params)
    params["X"] = params.pop("df")
    Split().train_test_split(params)
    # this params has X, test_size, train_df, test_df, type
    # To get: target_col
    params["train_df"] = params.pop('train')
    params["test_df"] = params.pop('test')
    params["columns"] = list(params["train_df"].columns)
    params["target_col"] = target_col
    Scaler().execute(params)

    # print(params.keys())
    y_train = params["train_df"][["Price"]]
    X_train = params["train_df"].drop(target_col,axis =1)
    X_test = params["test_df"].drop(target_col,axis =1)
    y_test = params["test_df"][["Price"]]
    print(y_test.head(5))
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preprocessy_preds = model.predict(X_test)

    end=time.time()
    preprocessy_time = np.round(end - start,4)
    
    preprocessy_accuracy = np.round(r2_score(y_test, preprocessy_preds),4)
    print(preprocessy_preds[:5])
    return preprocessy_accuracy, preprocessy_time

In [39]:
acc, t = preprocessy_score_dataset(params = {"df": melb_data_copy1.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "MinMaxScaler",  "fill_missing":  "mean"}, target_col = "Price")

dtf_1.loc['Preprocessy'] = [acc, t]

       Price
0   518000.0
1  1110000.0
2  1225000.0
3   730000.0
4   650000.0
[ 729615.   1410490.   2715465.    735468.    904945.17]


In [40]:
dtf_1

Unnamed: 0,Accuracy,Time
Preprocessy,-0.2739,4.8181
