In [None]:
import pandas as pd
import datetime
import numpy as np
import joblib as jl
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller,kpss
import os
import warnings
warnings.filterwarnings('ignore')



""" FUNCTION TO FEATURE ENGINEER DATA COMING FROM SOURCE-2 """

def src2(x):
    """Load the Source-2 weather CSV and reshape it to one row per timestamp.

    Parameters
    ----------
    x : str, path object or file-like
        Anything ``pandas.read_csv`` accepts. The data must provide the
        columns ``sensor``, ``value`` and ``datetime``; only rows whose
        ``sensor`` is ``'TC'``, ``'PRES'`` or ``'HUM'`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per ``datetime`` reported by all three sensors (inner
        joins), with the columns, in this order:

        * ``temp``, ``pres``, ``hum`` - the ``value`` of the TC / PRES / HUM rows
        * ``datetime`` - the raw timestamp string cut to its first 19 characters
        * ``date`` - characters 0-9 of the raw timestamp string
        * ``hour`` - characters 11-12 of the raw timestamp string
        * ``datete`` - ``datetime`` parsed with ``pandas.to_datetime``
    """
    raw = pd.read_csv(x)

    def _sensor_values(code, label):
        # Readings of a single sensor, with the generic 'value' column
        # renamed to the quantity it actually holds.
        picked = raw.loc[raw['sensor'] == code, ['value', 'datetime']]
        return picked.rename(columns = {'value': label})

    source2 = pd.merge(
        _sensor_values('TC', 'temp'),
        pd.merge(_sensor_values('PRES', 'pres'), _sensor_values('HUM', 'hum'),
                 on = 'datetime', how = 'inner'),
        on = 'datetime', how = 'inner')

    # Whole-column string slicing instead of writing cell by cell through
    # source2['datetime'][i] = ...: that chained assignment is not guaranteed
    # to reach the frame (it is a no-op under pandas copy-on-write).
    stamps = source2['datetime']
    source2['date'] = stamps.str[0:10]
    source2['hour'] = stamps.str[11:13]
    source2['datetime'] = stamps.str[0:19]

    # Parsed one value at a time so that every string is interpreted on its
    # own, independent of the format of the other rows.
    source2['datete'] = [pd.to_datetime(z) for z in source2['datetime']]

    return source2[['temp', 'pres', 'hum', 'datetime', 'date', 'hour', 'datete']]




""" FUNCTION TO PRE-PROCESS THE FEATURE ENGINEERED DATA COMING FROM SOURCE-2 """

def prep_src2(y):
    """Fill missing hours in the Source-2 frame and average consecutive rows.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame with the columns ``temp``, ``pres``, ``hum``, ``datetime``
        (timestamp string), ``date``, ``hour`` and ``datete`` (parsed
        timestamp), as returned by ``src2``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by the parsed ``datetime``, sorted, with the columns
        ``src2_temp``, ``src2_pres`` and ``src2_hum``.

    Notes
    -----
    1. Every date whose row count differs from 24 is inspected. For each
       hour that occurs somewhere in ``y`` but not on that date, a row is
       added whose values are the mean of the readings stamped exactly one
       day earlier and one day later. If only one of the two neighbours
       exists its value is used; if neither exists the value is NaN.
    2. Rows are sorted by ``datetime`` and each pair of consecutive rows is
       averaged; the average carries the (re-formatted) stamp of the later
       row. The earliest row is kept unaveraged and stamped
       ``<its date> 00:00``.
    """
    value_cols = ['temp', 'pres', 'hum']
    one_day = datetime.timedelta(days = 1)

    def _neighbour_mean(frame, column, stamps):
        # Mean of `column` over the rows stamped with one of `stamps`;
        # NaN when no such row exists.
        hits = frame.loc[frame['datete'].isin(stamps), column]
        return float(hits.mean())

    rows_per_day = y.groupby('date').size()
    incomplete_days = rows_per_day.index[rows_per_day != 24]
    hours_seen = list(y['hour'].dropna().unique())

    for day in incomplete_days:
        day_rows = y[y['date'] == day]
        hours_present = set(day_rows['hour'])
        # The first stamp of the day supplies the date part and everything
        # after the hour field for the rows being created.
        template = day_rows['datetime'].iloc[0]

        for hour in hours_seen:
            if hour in hours_present:
                continue
            stamp = template[0:10] + ' ' + hour + template[13:]
            when = pd.to_datetime(stamp)
            around = [when - one_day, when + one_day]

            filler = {'datetime': stamp, 'date': day, 'datete': when, 'hour': hour}
            for col in value_cols:
                filler[col] = _neighbour_mean(y, col, around)

            # Appended immediately (not batched) so that rows created here
            # can serve as neighbours for rows created later in the loop.
            y = pd.concat([y, pd.DataFrame([filler])], ignore_index = True)

    y = y.sort_values(by = ['datetime']).reset_index(drop = True)

    # Average of every row with its successor, labelled with the successor.
    later = y.iloc[1:]
    source2_fin = pd.DataFrame({
        col: (y[col].to_numpy()[:-1] + y[col].to_numpy()[1:]) / 2
        for col in value_cols
    })
    # Keeps 'YYYY-MM-DD HH' and appends whatever follows the minutes field.
    # NOTE(review): for 19-character stamps this gives 'HH:SS', i.e. an
    # hour-top label only when the seconds are '00' - confirm the intent.
    source2_fin['datetime'] = [s[0:13] + s[16:] for s in later['datetime']]
    source2_fin['date'] = later['date'].to_numpy()

    # The earliest row has no predecessor to be averaged with; it is carried
    # over unchanged and stamped at midnight of its date.
    opening = y[value_cols + ['datetime', 'date']].iloc[0:1].copy()
    opening['datetime'] = opening['date'] + ' 00:00'
    source2_fin = pd.concat([source2_fin, opening], ignore_index = True)

    source2_fin['datetime'] = [pd.to_datetime(s) for s in source2_fin['datetime']]
    source2_fin = source2_fin.sort_values(by = ['datetime']).set_index('datetime')
    return source2_fin[value_cols].rename(
        columns = {'temp': 'src2_temp', 'pres': 'src2_pres', 'hum': 'src2_hum'})




""" FUNCTION TO FEATURE ENGINEER THE DATA COMING FROM IOT SENSOR """

def iott(a):
    """Load the IoT sensor CSV and reshape it to one row per timestamp.

    Parameters
    ----------
    a : str, path object or file-like
        Anything ``pandas.read_csv`` accepts; the data must provide the
        columns ``sensor``, ``value`` and ``datetime``.

    Returns
    -------
    pandas.DataFrame
        Columns ``temp``, ``datetime``, ``pres``, ``hum``, ``date``, ``hour``.
        Only timestamps present for all of the 'TC', 'PRES' and 'HUM'
        sensors are kept. ``date`` and ``hour`` are characters 0-9 and 11-12
        of the raw timestamp; ``datetime`` is the first 14 characters of the
        raw timestamp followed by ``'00'``.
    """
    readings = pd.read_csv(a)

    def pick(sensor_code, label):
        # Rows of one sensor, reduced to (<label>, datetime).
        subset = readings[readings['sensor'] == sensor_code][['value', 'datetime']]
        return subset.rename(columns = {'value': label})

    pres_and_hum = pd.merge(pick('PRES', 'pres'), pick('HUM', 'hum'),
                            on = 'datetime', how = 'inner')
    iot_fin = pd.merge(pick('TC', 'temp'), pres_and_hum,
                       on = 'datetime', how = 'inner')

    raw_stamps = list(iot_fin['datetime'])
    iot_fin['date'] = [stamp[0:10] for stamp in raw_stamps]
    iot_fin['hour'] = [stamp[11:13] for stamp in raw_stamps]
    iot_fin['datetime'] = [stamp[0:14] + '00' for stamp in raw_stamps]

    return iot_fin





""" FUNCTION TO PRE-PROCESS THE FEATURE ENGINEERED DATA COMING FROM IOT SENSOR"""

def prep_iot(b):
    """Put the IoT frame on a complete hourly grid and interpolate the gaps.

    Parameters
    ----------
    b : pandas.DataFrame
        Frame with the columns ``temp``, ``pres``, ``hum``, ``datetime``
        (timestamp string), ``date`` and ``hour``, as returned by ``iott``.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by the parsed ``datetime``, sorted by the timestamp string,
        with the columns ``iot_temp``, ``iot_pres`` and ``iot_hum``.

    Notes
    -----
    1. A date with more than 24 rows is discarded entirely and replaced by a
       single empty row stamped ``<date> 00:00``.
    2. For every date with fewer than 24 rows, an empty row is added for
       each hour that occurs somewhere in the data but not on that date.
    3. Whitespace-only strings are treated as missing, the value columns are
       converted to numbers, and missing values are filled with
       ``DataFrame.interpolate(method='spline', order=2)`` (pandas delegates
       spline interpolation to SciPy).
    """
    value_cols = ['temp', 'pres', 'hum']

    def _empty_row(stamp, day, hour = None):
        # Placeholder row: NaN (not '') keeps the value columns numeric.
        row = {'datetime': stamp, 'date': day,
               'temp': np.nan, 'pres': np.nan, 'hum': np.nan}
        if hour is not None:
            row['hour'] = hour
        return pd.DataFrame([row])

    # Step 1: days with too many readings are reduced to one placeholder so
    # that step 2 rebuilds the whole day as gaps.
    rows_per_day = b.groupby('date').size()
    for day in rows_per_day.index[rows_per_day > 24]:
        b = b[b['date'] != day]
        b = pd.concat([b, _empty_row(day + ' 00:00', day, '00')],
                      ignore_index = True)

    # Step 2: add a placeholder for every hour a day is lacking.
    rows_per_day = b.groupby('date').size()
    hours_seen = list(b['hour'].dropna().unique())
    for day in rows_per_day.index[rows_per_day < 24]:
        day_rows = b[b['date'] == day]
        hours_present = set(day_rows['hour'])
        # The first stamp of the day supplies the date part and everything
        # after the hour field for the rows being created.
        template = day_rows['datetime'].iloc[0]

        for hour in hours_seen:
            if hour in hours_present:
                continue
            stamp = template[0:10] + ' ' + hour + template[13:]
            b = pd.concat([b, _empty_row(stamp, day)], ignore_index = True)

    b = b.sort_values(by = ['datetime']).reset_index(drop = True)

    # Step 3: blank strings count as missing; interpolate needs numeric data.
    b = b.replace(r'^\s*$', np.nan, regex=True)
    b = b[value_cols + ['datetime']].copy()
    for col in value_cols:
        b[col] = pd.to_numeric(b[col])
    b['datetime'] = [pd.to_datetime(s) for s in b['datetime']]
    b = b.set_index('datetime')

    iot_fin = b.interpolate(method='spline', order=2)
    iot_fin.columns = ['iot_temp','iot_pres','iot_hum']
    return iot_fin





""" FUNCTION TO MERGE THE SOURCE2 AND IOT SENSOR DATA """

def merge_iotsrc(q,w):
    """Return ``q`` with the columns of ``w`` attached by index.

    This is ``DataFrame.join`` with its default left join: every row of
    ``q`` is kept, and rows without a matching index label in ``w`` get NaN
    in the columns coming from ``w``.
    """
    combined = q.join(w, how = 'left')
    return combined



""" FUNCTION TO EXTRACT THE REQUIRED DATAPOINTS THAT WOULD AID IN RECONSTRUCTING THE ORIGINAL DATASET FROM DIFFERENCED DATASET """

def convert_req(fin_df):
    """Return the first two rows of ``fin_df`` with its index as a column.

    The index is moved into the first column and the seven resulting columns
    are renamed to ``datetime`` plus the six ``*_d`` names used for the
    differenced data. ``fin_df`` must therefore have exactly six columns.
    """
    seed_names = [
        'datetime',
        'src2_temp_d', 'src2_pres_d', 'src2_hum_d',
        'iot_temp_d', 'iot_pres_d', 'iot_hum_d',
    ]
    first_two = fin_df.reset_index().head(2)
    first_two.columns = seed_names
    return first_two



""" FUNCTION TO DOUBLE DIFFERENCE THE DATA TO REMOVE NON-STATIONARITY """

def diffrnc(req_dff):
    """Return the second-order difference of every column of ``req_dff``.

    Each column except one named ``datetime`` is differenced twice and
    stored under ``<column>_d``. Rows containing NaN (which includes the
    first two rows, undefined after two differences) are dropped.
    """
    twice_differenced = {
        name + '_d': req_dff[name].diff().diff()
        for name in req_dff.columns
        if name != 'datetime'
    }
    return pd.DataFrame(twice_differenced).dropna()



""" FUNCTION TO FIT THE MODEL ON THE DOUBLE DIFFERENCED DATA AND SAVE IT IN THE GIVEN DIRECTORY """

def fitting(df_diff, lags = 24, model_path = 'my_predictor.sav'):
    """Fit a VAR model on ``df_diff`` and save the fitted results with joblib.

    Parameters
    ----------
    df_diff : pandas.DataFrame
        The data handed to ``statsmodels.tsa.api.VAR``.
    lags : int, optional
        Lag order passed to ``VAR.fit``. Defaults to 24, the value that was
        previously hard-coded.
    model_path : str, optional
        File the fitted results are dumped to. Defaults to
        ``'my_predictor.sav'`` (relative to the current working directory),
        the file name that was previously hard-coded.

    Returns
    -------
    The fitted results object returned by ``VAR.fit``, so callers can
    inspect or reuse the model without loading it back from disk.
    """
    model_fin_fitted = VAR(df_diff).fit(lags)
    jl.dump(model_fin_fitted, model_path)
    return model_fin_fitted
    
    
    
if __name__ == "__main__":

    # Inputs are read from, and all outputs written to, one folder chosen by
    # the user; every file name below is relative to it.
    directory = input('Enter the folder path where your train data is located - This is also the folder where the model and supplementary data will be stored: \n')
    os.chdir(directory)
    print('\n','directory successfully changed!')

    # Source-2 weather data joined with the IoT sensor data on the timestamp
    # index (arguments are evaluated left to right: Source-2 first).
    fin_df = merge_iotsrc(
        prep_src2(src2('plot2-Source 2 Weather.csv')),
        prep_iot(iott('plot2-IoT Sensor Data.csv')),
    )

    # The first two rows are stored separately; double differencing drops them.
    convert_req(fin_df).to_csv('final_two.csv',index = False)

    df_diff = diffrnc(fin_df)
    del fin_df

    # Written with the timestamp as an ordinary column, then re-indexed by
    # it for model fitting.
    flat_diff = df_diff.reset_index(drop = False)
    flat_diff.to_csv('double_differenced.csv', index = False)
    df_diff = flat_diff.set_index('datetime')
    del flat_diff

    fitting(df_diff)
    del df_diff

    print('\n', 'MODEL HAS BEEN TRAINED AND SAVED INTO THE PROVIDED DIRECTORY! PLEASE FEEL FREE TO RUN THE Predictor NOTEBOOK :)' )

Enter the folder path where your train data is located - This is also the folder where the model and supplementary data will be stored: 
C:\Users\sidew\Desktop\Fasal\air\Assignemnt Submission

 directory successfully changed!


In [2]:
# Separate notebook cell: binds the module-level name fin_df to the integer 2.
fin_df = 2