# Scaling Target: Meter Reading

To do:
- Need to store the scaling parameters such that you can scale back later

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import time

In [2]:
df = pd.read_pickle('mergedData.pkl')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,site_id,timestamp,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,set
0,1,0,2016-01-01 00:00:00,0,0,0.0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train
1,2,0,2016-01-01 00:00:00,1,0,0.0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train
2,3,0,2016-01-01 00:00:00,2,0,0.0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train
3,4,0,2016-01-01 00:00:00,3,0,0.0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train
4,5,0,2016-01-01 00:00:00,4,0,0.0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train


# Normalizing for every building and meter: Main Functions

In [4]:
# Timing function decorator
def timeit(func):
    def wrapper(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        print('{} took {:.2f} milliseconds'.format(func.__name__,(end-start)*1000))

        return result
    
    return wrapper

In [5]:
# Scaling function per dataset grouped on building and meter
def normalize_target(df_b_m, scaler):
    """
    Input:
    ------
    df_b_m: pd.dataframe
        original data grouped by 'building' and 'meter'
    scaler: sklearn.preprocessing.func()
        any of the sklearn scalers
    
    Output:
    -------
    pd.Series
    scaled values of 'meter_reading'
    """
    
    scaled_mat = scaler.fit_transform(df_b_m.loc[:,'meter_reading'].values.reshape(-1, 1))
    scaled_col = scaled_mat.reshape((len(scaled_mat),))
    
    return pd.Series(data = scaled_col, index = df_b_m.index)

In [6]:
# Adding new column to data
@timeit
def get_normalized_data(data, scaler):
    result = data.groupby(['meter', 'building_id']).apply(normalize_target, scaler=scaler)
    result = result.reset_index()
    result.columns = ['meter', 'building_id', 'index', 'meter_reading_scaled']
    result = result.set_index('index')
    data.loc[:,'meter_reading_scaled'] = result.loc[:,'meter_reading_scaled'].copy()
    
    return data

#### Explanation of scaling function

In [7]:
# Follow along with this code
df_b_m = df[(df['meter'] == 0) & (df['building_id'] == 12)]
df_b_m.head()

Unnamed: 0.1,Unnamed: 0,site_id,timestamp,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,set
12,13,0,2016-01-01 00:00:00,12,0,0.0,Lodging/residential,37100,1999.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train
115,116,0,2016-01-01 01:00:00,12,0,0.0,Lodging/residential,37100,1999.0,,24.4,,21.1,-1.0,1020.2,70.0,1.5,train
218,219,0,2016-01-01 02:00:00,12,0,0.0,Lodging/residential,37100,1999.0,,22.8,2.0,21.1,0.0,1020.2,0.0,0.0,train
321,322,0,2016-01-01 03:00:00,12,0,0.0,Lodging/residential,37100,1999.0,,21.1,2.0,20.6,0.0,1020.1,0.0,0.0,train
424,425,0,2016-01-01 04:00:00,12,0,0.0,Lodging/residential,37100,1999.0,,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,train


In [8]:
scaled_mat = MinMaxScaler().fit_transform(df_b_m['meter_reading'].values.reshape(-1, 1))
scaled_mat

array([[0.        ],
       [0.        ],
       [0.        ],
       ...,
       [0.71990311],
       [0.72552118],
       [0.71187931]])

In [9]:
scaled_col = scaled_mat.reshape((len(scaled_mat),))
pd.Series(data = scaled_col, index = df_b_m.index).head()

12     0.0
115    0.0
218    0.0
321    0.0
424    0.0
dtype: float64

#### Testrun with smaller dataset

In [10]:
test = df.iloc[146446-10000:146446+10000]
print(test['meter'].unique())
print(test['building_id'].unique())
test.head()

[0 1]
[ 12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  30
  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  49
  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67
  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103
 104   0   1   2   3   4   5   6   7   8   9  10  11  48  29]


Unnamed: 0.1,Unnamed: 0,site_id,timestamp,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,set
136446,136447,0,2016-02-25 08:00:00,12,0,0.0,Lodging/residential,37100,1999.0,,13.9,2.0,7.8,0.0,1016.2,280.0,6.2,train
136447,136448,0,2016-02-25 08:00:00,13,0,0.0,Education,99380,2000.0,,13.9,2.0,7.8,0.0,1016.2,280.0,6.2,train
136448,136449,0,2016-02-25 08:00:00,14,0,0.0,Education,86250,2013.0,,13.9,2.0,7.8,0.0,1016.2,280.0,6.2,train
136449,136450,0,2016-02-25 08:00:00,15,0,0.0,Office,83957,1974.0,,13.9,2.0,7.8,0.0,1016.2,280.0,6.2,train
136450,136451,0,2016-02-25 08:00:00,16,0,0.0,Education,54644,1996.0,,13.9,2.0,7.8,0.0,1016.2,280.0,6.2,train


In [11]:
# Applying scaling to small test dataset
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(10, 90))

result = test.groupby(['meter', 'building_id']).apply(normalize_target, scaler=scaler)
result = result.reset_index()
result.columns = ['meter', 'building_id', 'index', 'meter_reading_scaled']
result = result.set_index('index')
result.head()

Unnamed: 0_level_0,meter,building_id,meter_reading_scaled
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
136537,0,0,0.0
136640,0,0,0.0
136743,0,0,0.0
136846,0,0,0.0
136949,0,0,0.0


In [12]:
# Add new column to small test dataset
test.loc[:,'meter_reading_scaled'] = result.loc[:,'meter_reading_scaled'].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


### Apply to small dataset

In [13]:
n = len(df)
frac = 1

data = df.iloc[:int(n*frac), :]
print(data.shape)
data.head()

(20216100, 18)


Unnamed: 0.1,Unnamed: 0,site_id,timestamp,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,set
0,1,0,2016-01-01 00:00:00,0,0,0.0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train
1,2,0,2016-01-01 00:00:00,1,0,0.0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train
2,3,0,2016-01-01 00:00:00,2,0,0.0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train
3,4,0,2016-01-01 00:00:00,3,0,0.0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train
4,5,0,2016-01-01 00:00:00,4,0,0.0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train


In [17]:
scaler = MinMaxScaler() #with_centering=True, with_scaling=True, quantile_range=(10, 90)
scaled_data = get_normalized_data(data, scaler)
print('for {} rows'.format(len(data)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


get_normalized_data took 72113.61 milliseconds
for 20216100 rows


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [18]:
print('total rows are {}'.format(len(df)))

total rows are 20216100


In [19]:
print('time will be multiplied by approx. {:2f}'.format((len(df)/len(data))))

time will be multiplied by approx. 1.000000


In [21]:
scaled_data.head()

Unnamed: 0.1,Unnamed: 0,site_id,timestamp,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,set,meter_reading_scaled
0,1,0,2016-01-01 00:00:00,0,0,0.0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train,0.0
1,2,0,2016-01-01 00:00:00,1,0,0.0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train,0.0
2,3,0,2016-01-01 00:00:00,2,0,0.0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train,0.0
3,4,0,2016-01-01 00:00:00,3,0,0.0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train,0.0
4,5,0,2016-01-01 00:00:00,4,0,0.0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,train,0.0


In [20]:
if frac == 1:
    scaled_data.to_pickle('Scaled_Data.pkl')
else:
    scaled_data.to_pickle('Scaled_Data_'+str(frac)+'.pkl')