In [1]:
!pip install dvc dvc[gdrive]

Collecting dvc
  Downloading dvc-2.7.2-py3-none-any.whl (666 kB)
[K     |████████████████████████████████| 666 kB 6.6 MB/s 
[?25hCollecting psutil>=5.8.0
  Downloading psutil-5.8.0-cp37-cp37m-manylinux2010_x86_64.whl (296 kB)
[K     |████████████████████████████████| 296 kB 60.7 MB/s 
[?25hCollecting dictdiffer>=0.8.1
  Downloading dictdiffer-0.9.0-py2.py3-none-any.whl (16 kB)
Collecting aiohttp-retry==2.4.5
  Downloading aiohttp_retry-2.4.5-py3-none-any.whl (7.5 kB)
Collecting pathspec<0.9.0,>=0.6.0
  Downloading pathspec-0.8.1-py2.py3-none-any.whl (28 kB)
Collecting grandalf==0.6
  Downloading grandalf-0.6-py3-none-any.whl (31 kB)
Collecting rich>=10.0.0
  Downloading rich-10.9.0-py3-none-any.whl (211 kB)
[K     |████████████████████████████████| 211 kB 34.3 MB/s 
[?25hCollecting distro>=1.3.0
  Downloading distro-1.6.0-py2.py3-none-any.whl (19 kB)
Collecting gitpython>3
  Downloading GitPython-3.1.18-py3-none-any.whl (170 kB)
[K     |████████████████████████████████| 170 kB 5

In [19]:
# Data Version Control
import os
import dvc.api
from io import StringIO,BytesIO
# Other
import warnings
warnings.filterwarnings('ignore')
# Your dependencies ... 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

%matplotlib inline

In [2]:
# Data Version Control
import os
import dvc.api
from io import StringIO,BytesIO

# Your dependencies ... 
import pandas as pd

In [3]:
def get_dvc_file(file_path: str, github_tag='v1.1_Gdrive_remote',
                 pat_path='.github_pat', repo='Explore-AI/EXPLORE.Utilities.Epl.Core.git'):
  """Fetch a DVC-tracked file and return its content as an in-memory stream.

  The GitHub personal access token (PAT) is read from the first line of
  ``pat_path`` and used to build an authenticated HTTPS URL for ``repo``.
  The file is then read through ``dvc.api.read`` at revision ``github_tag``
  and wrapped in a file-like object.

  Args:
      file_path (str): Path to the data file. Should be relative to the
                       base of the Git repository.
      github_tag (str): The GitHub tag associated with the version
                        of the data being used.
      pat_path (str): Path to a file containing your GitHub personal
                      access token (PAT) on its first line.
      repo (str): The GitHub repo associated with DVC where the data is
                  being tracked.

  Returns:
      io.StringIO when ``file_path`` ends in ``.csv`` (matched
      case-insensitively; the file is read in text mode), otherwise
      io.BytesIO (the file is read in binary mode).

  Raises:
      FileNotFoundError: If ``pat_path`` does not exist. Any other I/O
          error while reading the token file propagates unchanged so the
          real cause is not mislabelled.
  """
  # Retrieve GitHub secrets file
  try:
    with open(pat_path, "r") as f:
      # Strip surrounding whitespace: readline() keeps the trailing newline,
      # which would otherwise be embedded in the repository URL below.
      pat_secret = f.readline().strip()
  except FileNotFoundError as err:
    raise FileNotFoundError(
        f"The given path to your GitHub PAT file: '{pat_path}', doesn't exist"
    ) from err

  # Choose reading method based-on input file type
  file_extension = file_path.split('.')[-1].lower()

  if file_extension in ['csv']:
    read_mode = 'r'
    reader = StringIO
  else:
    read_mode = 'rb'
    reader = BytesIO

  # NOTE(review): the token is embedded in the URL, so it may surface in
  # tracebacks or logs raised by the fetch below -- avoid sharing those.
  file_content = dvc.api.read(
      path=file_path,
      repo=f'https://{pat_secret}@github.com/{repo}',
      rev=github_tag,
      mode=read_mode
  )
  return reader(file_content)

In [9]:
# Load the filtered raw data from the DVC-tracked CSV at tag v1.3_raw_filter_data
filtered_raw_stream = get_dvc_file(
    'enrich/water/synthetic_data/validation/data/filtered_raw.csv',
    'v1.3_raw_filter_data',
)
raw_filtered = pd.read_csv(filtered_raw_stream)

In [10]:
# Count missing values per column
raw_filtered.isna().sum()

timestamp         0
tag_name          0
value        130167
dtype: int64

In [11]:
def fill_empty(df, column):
    """
    Replace null values in ``df[column]`` with that column's mean.

    The mean is taken over the whole column (no grouping), and ``df`` is
    modified in place.

    Args:
        df (pandas.DataFrame): Frame to modify.
        column (str): Name of the column whose nulls are replaced.

    Returns:
        str: A message reporting how many null entries the column held
        before filling.
    """
    missing = df[column].isnull().sum()
    # Assign the filled Series back to the frame. Calling
    # fillna(inplace=True) on the df[column] selection is chained in-place
    # mutation, which does not update df under pandas Copy-on-Write.
    df[column] = df[column].fillna(df[column].mean())

    return 'Number of entries filled: '+str(missing)

In [12]:
# Impute the nulls in 'value'; raw_filtered is modified by this call
fill_empty(df=raw_filtered, column='value')

'Number of entries filled: 130167'

In [13]:
# Re-count missing values per column to confirm none remain
raw_filtered.isna().sum()

timestamp    0
tag_name     0
value        0
dtype: int64

In [14]:
# Preview the first five rows
raw_filtered.head(n=5)

Unnamed: 0,timestamp,tag_name,value
0,2021-05-31T23:45:00.000Z,DLM:14336-FLOW,30.410847
1,2021-06-01T00:00:00.000Z,DLM:14336-FLOW,29.162193
2,2021-06-01T00:15:00.000Z,DLM:14336-FLOW,28.67884
3,2021-06-01T00:30:00.000Z,DLM:14336-FLOW,27.416758
4,2021-06-01T00:45:00.000Z,DLM:14336-FLOW,26.906553


In [15]:
# Index the frame by timestamp. The guard makes re-running this cell a no-op
# instead of a KeyError once 'timestamp' has already become the index.
if 'timestamp' in raw_filtered.columns:
    raw_filtered = raw_filtered.set_index('timestamp')

In [16]:
# Preview the first five rows, now indexed by timestamp
raw_filtered.head(n=5)

Unnamed: 0_level_0,tag_name,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-05-31T23:45:00.000Z,DLM:14336-FLOW,30.410847
2021-06-01T00:00:00.000Z,DLM:14336-FLOW,29.162193
2021-06-01T00:15:00.000Z,DLM:14336-FLOW,28.67884
2021-06-01T00:30:00.000Z,DLM:14336-FLOW,27.416758
2021-06-01T00:45:00.000Z,DLM:14336-FLOW,26.906553


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
raw_filtered.describe()

Unnamed: 0,value
count,2525889.0
mean,12.20961
std,19.70281
min,-142.5432
25%,0.3
50%,5.729991
75%,17.35883
max,282.1402
