In [1]:
import pandas as pd 
import numpy as np 


# Load the five CSV files from ./data into DataFrames with pandas' default dtype inference.
df_build_meta = pd.read_csv('./data/building_metadata.csv')
df_test = pd.read_csv('./data/test.csv')
df_train = pd.read_csv('./data/train.csv')
df_weather_test = pd.read_csv('./data/weather_test.csv')
df_weather_train = pd.read_csv('./data/weather_train.csv')

# Column list for train.csv:
# - building_id - foreign key used when merging with the building metadata
# - meter - meter type code; author's note: { 0 : chilled water, 1 : electricity, 2 : gas, 3 : hot water }
#   NOTE(review): this mapping looks suspect (the data description may list 0 = electricity,
#   1 = chilled water, 2 = steam, 3 = hot water) - confirm against the dataset documentation
# - timestamp - time the measurement was taken (author was not certain)
# - meter_reading - target
# The data was collected under real-world conditions, so measurement errors may be present and can affect the model.
# Column list for building_metadata.csv:
# - site_id
# - building_id - foreign key for merging with train.csv
# - primary_use - main determinant of consumption in the building (which activities use the most energy)
# - square_feet - floor area of the building
# - year_built - year the building was constructed
# - floor_count - number of floors in the building (contains NaN)
# Column list for weather_train.csv / weather_test.csv:
# - site_id
# - timestamp - if this column marks when the measurement was taken, the time here is shifted by one year
# - air_temperature - air temperature (degrees Celsius)
# - cloud_coverage - cloud cover in oktas: 0-8
# - dew_temperature - dew point temperature (author was unsure)
# - precip_depth_1h - precipitation depth, measured hourly
# - sea_level_pressure - pressure in hPa
# - wind_direction - wind direction
# - wind_speed - wind speed in m/s
# Column list for test.csv:
# - row_id - row identifier used for the summary/submission
# - building_id - foreign key
# - meter - author's note says "target"; NOTE(review): presumably the meter type code, as in train.csv - confirm
# - timestamp - time

In [6]:
# Reduce memory usage using kernel from https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
def _smallest_int_dtype(mn, mx):
    """Return the narrowest NumPy integer dtype whose range strictly contains [mn, mx], or None."""
    if mn >= 0:
        for dtype in (np.uint8, np.uint16, np.uint32):
            if mx < np.iinfo(dtype).max:
                return dtype
        return np.uint64
    for dtype in (np.int8, np.int16, np.int32, np.int64):
        bounds = np.iinfo(dtype)
        if mn > bounds.min and mx < bounds.max:
            return dtype
    return None


def reduce_mem_usage(df):
    """Downcast every numeric column of ``df`` to a smaller dtype where possible.

    The frame is modified IN PLACE (columns are reassigned on ``df`` itself) and
    the same object is returned, so the caller's original variable sees the
    downcast columns as well.

    For each numeric column:

    * Missing values are replaced by the sentinel ``min - 1`` (integer dtypes
      cannot hold NaN) and the column name is recorded in the returned list.
    * If every value is a whole number the column is cast to the narrowest
      signed/unsigned integer dtype that holds its range, sentinel included.
    * Otherwise the column is cast to ``float32``.

    Non-numeric columns (strings, datetimes, categoricals, ...) are left
    untouched. Progress and before/after memory figures are printed to stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Mutated in place.

    Returns
    -------
    (pandas.DataFrame, list)
        ``df`` itself and the names of the columns that contained non-finite
        values (NaN or +/-inf).
    """
    start_mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    for col in df.columns:
        # Only numeric data can be downcast; np.isfinite and the arithmetic below
        # are not defined for strings, datetimes or categoricals.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",df[col].dtype)

        mx = df[col].max()
        mn = df[col].min()
        print("min for this col: ",mn)
        print("max for this col: ",mx)

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(df[col]).all():
            NAlist.append(col)
            if not pd.isna(mn):  # an all-NaN column has no minimum to derive a sentinel from
                # The sentinel becomes the new minimum; tracking it keeps the dtype
                # choice below from picking an unsigned type that would wrap it.
                mn = mn - 1
                # Plain reassignment: fillna(inplace=True) on df[col] is chained
                # assignment and does not reliably write back to df.
                df[col] = df[col].fillna(mn)

        # test if column can be converted to an integer
        values = df[col]
        if pd.api.types.is_integer_dtype(values) or pd.api.types.is_bool_dtype(values):
            IsInt = True
        elif np.isfinite(values).all():
            # Sum ABSOLUTE fractional parts so that e.g. +0.5 and -0.5 cannot
            # cancel out and make a float column look integral.
            frac = (values - values.astype(np.int64)).abs().sum()
            IsInt = bool(frac < 0.01)
        else:
            # Remaining NaN/inf cannot be represented as integers.
            IsInt = False

        if IsInt:
            # Make Integer/unsigned Integer datatypes
            target = _smallest_int_dtype(mn, mx)
            if target is not None:
                df[col] = df[col].astype(target)
        else:
            # Make float datatypes 32 bit
            df[col] = df[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",df[col].dtype)
        print("******************************")
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

# reduce_mem_usage reassigns columns on the frame it receives and returns that same
# object, so after this call `train` and `df_train` refer to one downcast DataFrame.
# The captured output below already shows downcast dtypes in "dtype before", which
# suggests this cell was re-run on an already-reduced frame.
train, NAlist = reduce_mem_usage(df_train) # Author's note: this dataframe has no NaN values (so NAlist should be empty) - TODO confirm

Memory usage of properties dataframe is : 289.1937065124512  MB
******************************
Column:  building_id
dtype before:  uint16
min for this col:  0
max for this col:  1448
dtype after:  uint16
******************************
******************************
Column:  meter
dtype before:  uint8
min for this col:  0
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  meter_reading
dtype before:  float32
min for this col:  0.0
max for this col:  21904700.0
dtype after:  float32
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  289.1937065124512  MB
This is  100.0 % of the initial size


In [None]:
# Merge data
# Memory problem - work in progress
# Step 1: attach the building metadata to every meter reading via building_id.
# The metadata is listed above as carrying site_id, which the weather join needs.
# (Merging `train` with itself here would be a many-to-many self-join on
# building_id and multiply the row count, which is a likely cause of the memory problem.)
train_data = train.merge(df_build_meta, on='building_id', how='left')
# Step 2: continue from the step-1 result (not from `train`, which has no site_id)
# and attach the weather row for the same site and timestamp.
train_data = train_data.merge(df_weather_train, on=['site_id', 'timestamp'], how='left')