# SOEN 6611 - Step 5 - Implementation of Measurement Process

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the measurement dataset; this dataframe stands in for the database.
df = pd.read_csv('data_manipulated.csv')

# Cut the rows into three consecutive positional slices, each treated as a
# separate time frame: rows 0-3999, rows 4000-6999 and rows 7000 onward.
df_t1, df_t2, df_t3 = df.iloc[:4000], df.iloc[4000:7000], df.iloc[7000:]

## Big Data Quality Indicators

### Global Variables

num_request - number of requests to database. 
<br>
num_successful_request - number of successful requests to database. <br>
**It is assumed that the dataframe is equivalent to the database and that each call to a dataframe counts as one request. If the call does not raise an error, `num_successful_request` is incremented.**

In [4]:
# Request counters shared by the measure functions (they update them via
# `global`). Both start at zero; ints are immutable, so chaining is safe.
num_request = num_successful_request = 0

### Base Measure Function

***

Method : recCount(df_list) : method to calculate total number of records in multiple datasets <br>
Param : df_list : list of multiple dataframes <br>
Return : Integer : returns total number of records in multiple datasets <br>

In [5]:
def recCount(df_list):
    """Return the total number of records across several dataframes.

    Every dataframe visited counts as one request: ``num_request`` is
    incremented before its row count is read, and ``num_successful_request``
    once the row count was read without raising.

    Param  : df_list : list of dataframes
    Return : Integer : sum of the row counts (0 when the list is empty)
    """
    global num_request, num_successful_request

    total_rows = 0
    for frame in df_list:
        # Register the attempt first so a failing read still shows up
        # as an unsuccessful request.
        num_request = num_request + 1
        total_rows = total_rows + frame.shape[0]
        num_successful_request = num_successful_request + 1

    return total_rows

### Veracity

In [38]:
#Accuracy
def getAccuracy(df_list):
    """Return the accuracy indicator, the ratio Hacc / Hmax.

    With N the total record count and d_k the number of duplicated rows in
    the k-th dataframe:

        Hmax = log2(N)
        Hacc = log2(N) - (1 / N) * sum_k(d_k * log2(d_k))

    Dataframes without duplicated rows add nothing to the sum. Each dataframe
    inspected is counted as one request in the module-level counters.

    Param  : df_list : list of dataframes
    Return : Float : Hacc / Hmax; 1.0 when there is exactly one record
    Raises : ZeroDivisionError : when df_list holds no records
    """
    global num_request
    global num_successful_request

    num_records = recCount(df_list)

    # Named so that it does not shadow the builtin `sum`.
    dup_entropy = 0
    for frame in df_list:
        num_request += 1
        # Scan for duplicates once per frame and reuse the count.
        dup_count = frame.duplicated().sum()
        if dup_count != 0:
            dup_entropy += dup_count * np.log2(dup_count)
        num_successful_request += 1

    # 1 / num_records raises ZeroDivisionError for an empty input.
    weighted_dup_entropy = (1 / num_records) * dup_entropy

    Hmax = np.log2(num_records)
    if Hmax == 0:
        # A single record gives log2(1) == 0, so the ratio would be 0/0 (NaN).
        # One record cannot contain duplicates; report it as fully accurate,
        # which is also what any duplicate-free input yields.
        return 1.0

    Hacc = Hmax - weighted_dup_entropy
    return Hacc / Hmax

#Completeness
def getCompleteness(df_list):
    """Return the fraction of records that contain no null value.

    A record is incomplete when at least one of its columns is null. Each
    dataframe inspected is counted as one request in the module-level
    counters (on top of the requests counted by ``recCount``).

    Param  : df_list : list of dataframes
    Return : Float : (total records - incomplete records) / total records
    """
    global num_request, num_successful_request

    total_rows = recCount(df_list)

    incomplete_rows = 0
    for frame in df_list:
        num_request = num_request + 1
        # Row-wise flag: True where any column of the row is null.
        row_has_null = frame.isnull().any(axis=1)
        incomplete_rows = incomplete_rows + row_has_null.sum()
        num_successful_request = num_successful_request + 1

    complete_rows = total_rows - incomplete_rows
    return complete_rows / total_rows
    

#Currentness
def getCurrentness(df_list):
    """Return the share of records dated inside the interquartile date window.

    The rows of every dataframe in ``df_list`` that have a ``Date`` value are
    pooled and ordered chronologically. The dates found at the first- and
    third-quartile positions bound a window; the indicator is the number of
    dated rows inside that window (bounds included) divided by the total
    number of records, dated or not.

    Param  : df_list : list of dataframes, each with a 'Date' column holding
                       strings in %m/%d/%Y format (nulls allowed)
    Return : Float : rows inside the window / total records;
                     0.0 when no record has a date
    Raises : ZeroDivisionError : when df_list holds no records
    """
    num_records = recCount(df_list)
    if num_records == 0:
        # Same exception type the final division would produce, raised early
        # so that nothing is attempted on an empty input.
        raise ZeroDivisionError('getCurrentness needs at least one record')

    # Pool the frames that were passed in (not the notebook-level slices) so
    # the numerator and the denominator describe the same data.
    raw_dates = pd.concat(list(df_list), ignore_index=True)['Date'].dropna()
    if raw_dates.empty:
        # No dated record can fall inside a window.
        return 0.0

    dates = pd.to_datetime(raw_dates, format='%m/%d/%Y').sort_values()
    n = dates.shape[0]

    # 0-based positions of the first and third quartile. For n < 3 the
    # third-quartile formula points past the last row, so clamp it.
    lower = int(np.ceil((n + 1) / 4)) - 1
    higher = min(int(np.ceil((3 / 4) * (n + 1))) - 1, n - 1)

    date_lower = dates.iloc[lower]
    date_higher = dates.iloc[higher]
    in_window = dates.between(date_lower, date_higher, inclusive='both')
    count = int(in_window.sum())

    return count / num_records

#Availability
def getAvailability():
    """Return the fraction of recorded requests that succeeded.

    Reads the module-level counters ``num_successful_request`` and
    ``num_request``; the division raises ZeroDivisionError while
    ``num_request`` is still 0.

    Return : Float : num_successful_request / num_request
    """
    succeeded, attempted = num_successful_request, num_request
    return succeeded / attempted

In [59]:
# Total record count over the three time-frame slices (also updates the request counters).
recCount([df_t1,df_t2,df_t3])

9828

In [39]:
# Number of rows in the second slice that have a null in at least one column.
df_t2.isnull().any(axis=1).sum()

34

In [61]:
# Accuracy indicator (Hacc / Hmax) over the three time-frame slices.
getAccuracy([df_t1,df_t2,df_t3])

0.9997080550774292

In [62]:
# Fraction of records without any null value over the three time-frame slices.
getCompleteness([df_t1,df_t2,df_t3])

0.9930809930809931

In [63]:
# Successful requests divided by all requests counted so far in this kernel session.
getAvailability()

1.0

In [39]:
# Share of records whose date lies between the first- and third-quartile dates.
getCurrentness([df_t1,df_t2,df_t3])

0.4994912494912495

In [37]:
# Notebook-level re-run of the getCurrentness steps that prints the two
# quartile boundary dates and the number of dated rows between them.
combine_df = pd.concat([df_t1, df_t2, df_t3]).dropna(subset=['Date'])
combine_df['Date'] = pd.to_datetime(combine_df['Date'], format='%m/%d/%Y')
combine_df = combine_df.sort_values(by='Date')

# 0-based positions of the first and third quartile among the dated rows.
n = len(combine_df)
lower = np.ceil((n + 1) / 4).astype('int64') - 1
higher = np.ceil((3 / 4) * (n + 1)).astype('int64') - 1

date_lower, date_higher = (combine_df.iloc[pos]['Date'] for pos in (lower, higher))
print(date_lower, date_higher, sep='\n')

# Rows whose date falls inside the window, both bounds included.
count = len(combine_df[combine_df['Date'].between(date_lower, date_higher, inclusive='both')])
print(count)

2013-02-19 00:00:00
2019-05-28 00:00:00
4909


### Vincularity

### Validity