In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data preparation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## ignore useless warnings
import warnings
warnings.filterwarnings(action='ignore')
pd.options.display.max_seq_items=8000
pd.options.display.max_rows = 8000

## Import the dataset

In [None]:
# Loading the train set
train = pd.read_csv('/content/drive/MyDrive/energy_dataset/train.csv')
building = pd.read_csv('/content/drive/MyDrive/energy_dataset/building_metadata.csv')
weather_train = pd.read_csv('/content/drive/MyDrive/energy_dataset/weather_train.csv')


In [None]:
# Loading test set
test_data = pd.read_csv('/content/drive/MyDrive/energy_dataset/test.csv') 
weather_test_data = pd.read_csv('/content/drive/MyDrive/energy_dataset/weather_test.csv') 

In [None]:
# Shapes of the dataframes
print('Shape of train data:',train.shape)
print('Shape of weather train data:', weather_train.shape)
print('Shape of building meta data:', building.shape)

Shape of train data: (20216100, 4)
Shape of weather train data: (139773, 9)
Shape of building meta data: (1449, 6)


In [None]:
#https://www.kaggle.com/kernels/scriptcontent/3684066/download
# WE MAY USE THIS FUNCTION TO REDUCE MEMORY USEAGE

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    #iterating over every column and finding the type of the column
    for col in df.columns:
      if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
      col_type = df[col].dtype
        
        #If the columns is not object
      if col_type != object:
          #Get the minimum and maximum value
            c_min = df[col].min()
            c_max = df[col].max()
            #If the type is int
            if str(col_type)[:3] == 'int':
              #If the min max values lies with thin the range of int8 type then assign the type as int8
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
              #If the min max values lies with thin the range of int16 type then assign the type as int16 
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
              #If the min max values lies with thin the range of int32 type then assign the type as int32
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
              #If the min max values lies with thin the range of int64 type then assign the type as int64
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
              #If the min max values lies with thin the range of float16 type then assign the type as float16
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
              #If the min max values lies with thin the range of float32 type then assign the type as float32
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
      else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [None]:
# Memory reduce
train = reduce_mem_usage(train)
building = reduce_mem_usage(building)
weather_train=reduce_mem_usage(weather_train)

test = reduce_mem_usage(test)
weather_test = reduce_mem_usage(weather_test)

Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.84 MB
Decreased by 71.8%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.9%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 2.59 MB
Decreased by 73.1%


In [None]:
test_data = reduce_mem_usage(test_data, 'Test Data')
weather_test_data = reduce_mem_usage(weather_test_data, 'Weather Test Data')

## Combining the datasets

In [None]:
#We will combine the all the data for train set in a single dataframe and test set in another dataframe
# Merging train set
train_df = train.merge(building, on='building_id', how='left')
train_df = train_df.merge(weather_train, on=['site_id', 'timestamp'], how='left')

In [None]:
# Merging test set
test_df = test_data.merge(building, on='building_id', how='left')
test_df = test_df.merge(weather_test_data, on=['site_id', 'timestamp'], how='left')

## Correct the unit

In [None]:
# the electric meter readings for site 0 were not properly converted to units of kWh and are in kBTU (over 900k rows)
# Multiply by 0.2931 to get to model inputs into kWh like the other sites, and 3.4118 to get back to kBTU for scoring.
# DO NOT RUN THIS CELL TWICE!
train_df.loc[(train_df['site_id'] == 0) & (train_df['meter'] == 0), 'meter_reading'] = train_df[(train_df['site_id'] == 0) & (train_df['meter'] == 0)]['meter_reading'] * 0.2931

## Break the datatime into day, month

In [None]:
# We will break the timestamp into hour of the day, day of week, month and the year.
# Maybe the timestamp column should be drop during training stage.

def break_datetime(df):
  df['timestamp']= pd.to_datetime(df['timestamp'])
  df['hour']= np.uint8(df['timestamp'].dt.hour)
  df['dayofweek']= np.uint8(df['timestamp'].dt.dayofweek)
  df['month']= np.uint8(df['timestamp'].dt.month)
  df['dayofyear']= np.uint16(df['timestamp'].dt.dayofyear)
  df['day']= np.uint16(df['timestamp'].dt.day) #day of month
  df['year']= np.uint16(df['timestamp'].dt.year)
  return df

In [None]:
train_df = break_datetime(train_df)

In [None]:
test_df = break_datetime(test_df)

## **Pipeline**


In [None]:
test = test.drop(["row_id"],axis = 1)

In [None]:
# ###  PIPELINE CODE####

from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer


# # transformer for categorical features - primary_use, season, Building_id, meter, site_id
categorical_features = ['building_id', 'meter']

categorical_transformer = imbPipeline(
   [        
     ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
   ]
)

# # transformer for ordinal categorical features 
categorical_features_lab = [  'hour', 'dayofweek', 'dayofyear',
                       'isDayTime']          

categorical_transformer_lab = imbPipeline(
   [        
        #('imputer_cat', SimpleImputer(strategy = 'most_frequent')),
        ('label', OrdinalEncoder())
   ]
)    

# # transformer for numerical features - Timestamp not included so will be dropped
numeric_features = [ 'square_feet', 'air_temperature'] 
     
numeric_transformer = imbPipeline(
   [
       # ('imputer_num', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())   # standard scaling of the numerical features
   ]
)

# # combine them in a single ColumnTransformer
preprocessor = ColumnTransformer(
   [
       ('categoricals', categorical_transformer, categorical_features),
       ('labels', categorical_transformer_lab, categorical_features_lab),
       ('numericals', numeric_transformer, numeric_features)], 
       remainder = 'drop'         # Ensures that the features not specifically named are dropped
)

In [None]:
train_fit = imbPipeline(
    [
     ('preprocessing',preprocessor)
    ]
)
train_fit.fit(train_df)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categoricals',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['building_id', 'meter',
                                                   'site_id', 'primary_use',
                                                   'season']),
                                                 ('labels',
                                                  Pipeline(steps=[('label',
                                                                   OrdinalEncoder())]),
                                                  ['cloud_coverage',
                                                   'wind_direction', 'hour',
                                                   'dayofweek', 'month',
                                                   'day

In [None]:
test_fit = imbPipeline(
    [
     ('preprocessing',preprocessor)
    ]
)
test_fit.fit(test)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categoricals',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['building_id', 'meter']),
                                                 ('labels',
                                                  Pipeline(steps=[('label',
                                                                   OrdinalEncoder())]),
                                                  ['hour', 'dayofweek',
                                                   'dayofyear', 'isDayTime']),
                                                 ('numericals',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                

## **Preprocessing for Supervised Learning Models**


In [None]:
#Alternative for filling data- unsure of the features used and the use of mean- discuss in meeting
def nan_fillers(df):
   air_temp_df=df.groupby(['site_id', 'day', 'month'])['air_temperature'].transform('mean')
   df['air_temperature'].fillna(air_temp_df, inplace=True)

   dew_temp_df=df.groupby(['site_id', 'day', 'month'])['dew_temperature'].transform('mean')
   df['dew_temperature'].fillna(dew_temp_df, inplace=True)

   #cloud_df=df.groupby(['site_id', 'day', 'month'])['cloud_coverage'].transform('median')
   #df['cloud_coverage'].fillna(cloud_df, inplace=True)

   #sea_level_df=df.groupby(['site_id', 'day', 'month'])['sea_level_pressure'].transform('median')
   #df['sea_level_pressure'].fillna(sea_level_df, inplace=True)

   #precip_df=df.groupby(['site_id', 'day', 'month'])['precip_depth_1_hr'].transform('median')
   #
   #df['precip_depth_1_hr'].fillna(precip_df, inplace=True)

   wind_dir_df=df.groupby(['site_id', 'day', 'month'])['wind_direction'].transform('mean')
   df['wind_direction'].fillna(wind_dir_df, inplace=True)

   wind_speed_df=df.groupby(['site_id', 'day', 'month'])['wind_speed'].transform('mean')
   df['wind_speed'].fillna(wind_speed_df, inplace=True)


   return df

train_df = nan_fillers(train_df) # this does not deal with the three features listed below
train_df['cloud_coverage'].fillna(train_df['cloud_coverage'].median(), inplace=True)
train_df['precip_depth_1_hr'].fillna(train_df['precip_depth_1_hr'].median(), inplace=True)
train_df['sea_level_pressure'].fillna(train_df['sea_level_pressure'].median(), inplace=True)
train_df.isnull().sum()

building_id           0
meter                 0
timestamp             0
meter_reading         0
site_id               0
primary_use           0
square_feet           0
air_temperature       0
cloud_coverage        0
dew_temperature       0
precip_depth_1_hr     0
sea_level_pressure    0
wind_direction        0
wind_speed            0
hour                  0
dayofweek             0
month                 0
dayofyear             0
day                   0
meter_reading_log     0
season                0
isDayTime             0
city                  0
country               0
dtype: int64

In [None]:
test_df = nan_fillers(test_df) # this does not deal with the three features listed below
test_df['cloud_coverage'].fillna(test_df['cloud_coverage'].median(), inplace=True)
test_df['precip_depth_1_hr'].fillna(test_df['precip_depth_1_hr'].median(), inplace=True)
test_df['sea_level_pressure'].fillna(test_df['sea_level_pressure'].median(), inplace=True)
test.isnull().sum()

In [None]:
# Encode categorical data
categorical_features = ['primary_use', 'season']
encoder = preprocessing.LabelEncoder()

for i in categorical_features:
  train[i] = encoder.fit_transform(train[i])
  test[i] = encoder.fit_transform(test[i])

In [None]:
# Remove timestamp
train = train.drop(['timestamp'], axis = 1)
test = test.drop(['timestamp'], axis = 1)

In [None]:
# Reduce memory usage (again)
reduced_train_data = reduce_mem_usage(train_df, 'Train Data')
reduced_test_data = reduce_mem_usage(test_df, 'Test Data')

In [None]:
# Remove least important features generated by feature selection
train_data = reduced_train_data[['building_id','square_feet','meter','air_temperature','dayofyear','hour','isDayTime','dayofweek', 'meter_reading']]
test = reduced_test_data[['building_id','square_feet','meter','air_temperature','dayofyear','hour','isDayTime','dayofweek']]

In [None]:
from sklearn.model_selection  import train_test_split
from sklearn.pipeline import Pipeline

x_train, x_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, random_state= 45)

##KNN Implementation

###KNN Hyperparameter

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

test_accuracy=[]
n_estimator = range(10,50,10)
clock = 0
for n in n_estimator:
  clock = clock+1
  KNN_test = Pipeline(
      [('preprocessing',preprocessor),
       ('classifier', KNeighborsClassifier(n_neighbors= n))
       ]   
  )
  KNN_test.fit(x_train, y_train)
  print(clock)
  y_hat = KNN_test.predict(x_val)
  test_accuracy.append(accuracy_score(y_val,y_hat))
index_n = test_accuracy.index(max(test_accuracy))
optimum_n = n_estimator[index_n]
optimum_n
#optimum_n = 10

1


10

###KNN Regression Model

In [None]:
from sklearn.neighbors import KNeighborsRegressor
KNN_clf = Pipeline(
    [
     ('classifier', KNeighborsRegressor(n_neighbors=10))
    ]
)
KNN_clf.fit(x_train, y_train)


Pipeline(steps=[('classifier', KNeighborsRegressor(n_neighbors=10))])

In [None]:
knnPred = []

In [None]:
steps = np.arange(0,len(test),10000)


In [None]:
for i in steps:
  test_samp = test[i:i+10000]
  knnPred.append(KNN_clf.predict(test_samp))

In [None]:
knnPredArray = np.concatenate((knnPred),axis=0)

In [None]:
KNN_preddf = pd.DataFrame(knnPredArray, columns = ["meter_reading"])
KNN_preddf.insert(0, "row_id",test["index"])

In [None]:
KNN_preddf = KNN_preddf.astype({'row_id':'int32'})
KNN_preddf = KNN_preddf.astype({'meter_reading':'float32'})

In [None]:
KNN_preddf

Unnamed: 0,row_id,meter_reading
0,0,3.750000
1,1,3.750000
2,2,3.750000
3,3,3.750000
4,4,3.750000
...,...,...
41697595,41697595,2.421875
41697596,41697596,2.421875
41697597,41697597,2.421875
41697598,41697598,2.421875


Kaggle Score =  3.416

##Decision Tree Classifier

###DTree Hyperparameter

In [None]:
from sklearn.tree import DecisionTreeClassifier

test_accuracy=[]
n_estimator = range(80,150,10)

for n in n_estimator:
  tree_clf = Pipeline(
      [('preprocessing',preprocessor),
       ('classifier', DecisionTreeClassifier(max_depth=n, random_state=10))
       ]   
  )
  tree_clf.fit(x_train, y_train)
  y_hat = tree_clf.predict(x_val)
  test_accuracy.append(accuracy_score(y_val,y_hat))
index_n = test_accuracy.index(max(test_accuracy))
optimum_n = n_estimator[index_n]
optimum_n
#max_depth = 14 random_state = 10 optimal is 110 but this is not reasonable within the software limitations

110

###Decision Regression Model

In [None]:
from sklearn.tree import DecisionTreeRegressor
#start_time = time.time()
tree_clf = Pipeline(
    [
     ('classifier', DecisionTreeRegressor(max_depth=14, random_state=10))
    ]
)

tree_clf.fit(x_train, y_train)
#print(" %s seconds " % (time.time() - start_time))
#Eager Learner

Pipeline(steps=[('classifier',
                 DecisionTreeRegressor(max_depth=14, random_state=10))])

###prediction

In [None]:
tree_pred = tree_clf.predict(test)

In [None]:
arrayToList = tree_pred.tolist()

In [None]:
tree_pred_df = pd.DataFrame()  

In [None]:
tree_pred_df['meter_reading'] = arrayToList
tree_pred_df.insert(0, "row_id",test["index"])

In [None]:
tree_pred_df = tree_pred_df.astype({'row_id':'int32'})
tree_pred_df = tree_pred_df.astype({'meter_reading':'float32'})

Kaggle Score = 3.021