# SOEN 6611 - Step 5 - Implementation of Measurement Process

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [39]:
# Read the prepared CSV into one dataframe
df = pd.read_csv('data_manipulated.csv')

# Cut the rows into three consecutive slices; each slice is treated as a
# separate time frame (rows 0-3999, rows 4000-6999, and row 7000 onwards)
df_t1, df_t2, df_t3 = df.iloc[:4000], df.iloc[4000:7000], df.iloc[7000:]

In [40]:
# Preview the first five rows of the loaded dataframe
df.head(5)

Unnamed: 0,ID,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,...,Age,Years_employed,Income_type,Education_type,Family_status,Housing_type,Occupation_type,Target,Date,PCI_compliant
0,5008804,1.0,1.0,1.0,1.0,0.0,0.0,0,0.0,2.0,...,32.868574,12.435574,Working,Higher education,Civil marriage,Rented apartment,Other,1.0,12/31/2011,Yes
1,5008806,1.0,1.0,1.0,0.0,0.0,0.0,0,0.0,2.0,...,58.793815,3.104787,Working,Secondary / secondary special,Married,House / apartment,Security staff,0.0,3/28/2012,Yes
2,5008808,0.0,0.0,1.0,0.0,1.0,1.0,0,0.0,1.0,...,52.321403,8.353354,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,0.0,10-11-2012,Yes
3,5008812,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,1.0,...,61.504343,0.0,Pensioner,Higher education,Separated,House / apartment,Other,0.0,9/20/2013,Yes
4,5008815,1.0,1.0,1.0,1.0,1.0,1.0,0,0.0,2.0,...,46.193967,2.10545,Working,Higher education,Married,House / apartment,Accountants,0.0,10/15/2015,Yes


In [41]:
# Row count of each time-frame slice, one per line
print(df_t1.shape[0], df_t2.shape[0], df_t3.shape[0], sep='\n')

4000
3000
2828


## Big Data Quality Indicators

### Global Variables

num_request - number of requests to database. 
<br>
num_successful_request - number of successful requests to database. <br>
**It is assumed that a dataframe is equivalent to a database and that each call to a dataframe counts as one request. If the call does not raise an error, num_successful_request is incremented.**

In [35]:
# Module-level request counters updated by the measure functions; both start at zero
num_request = num_successful_request = 0

### Base Measure Function

***

Method : recCount(df_list) : method to calculate total number of records in multiple datasets <br>
Param : df_list : list of multiple dataframes <br>
Return : Integer : returns total number of records in multiple datasets <br>

In [36]:
def recCount(df_list):
    """Return the combined number of rows of every dataframe in ``df_list``.

    Each dataframe visited counts as one request: ``num_request`` is bumped
    before its row count is read and ``num_successful_request`` afterwards,
    so the second counter only advances when the read raised no error.

    Param  : df_list : iterable of dataframes
    Return : total number of rows across all dataframes
    """
    global num_request, num_successful_request

    total_rows = 0
    for frame in df_list:
        num_request += 1
        total_rows += frame.shape[0]
        num_successful_request += 1

    return total_rows

In [37]:
# Total number of records across the three time-frame slices
recCount([df_t1,df_t2,df_t3])

9828

### Veracity

In [38]:
def getAccuracy(df_list):
    """Accuracy indicator: normalised entropy penalised by duplicated rows.

    With N the total number of records and d_i the number of duplicated rows
    in dataset i (as reported by ``DataFrame.duplicated``), this computes

        Hacc = log2(N) - (1/N) * sum(d_i * log2(d_i))   over datasets with d_i > 0
        Hmax = log2(N)

    and returns ``Hacc / Hmax``.

    The duplicate count of every dataset is printed, one line per dataset.
    Each dataset inspected is recorded as one request in the module-level
    ``num_request`` / ``num_successful_request`` counters (in addition to the
    requests recorded by ``recCount``).

    Param  : df_list : list of dataframes
    Return : float : accuracy ratio; 1.0 when there are fewer than two
             records, because Hmax is then zero (or undefined) and no
             duplicates can exist
    """
    global num_request
    global num_successful_request

    num_records = recCount(df_list)

    # Named to avoid shadowing the builtin ``sum``
    weighted_dup_total = 0
    for frame in df_list:
        num_request += 1
        # Compute the duplicate count once and reuse it
        dup_count = frame.duplicated().sum()
        print(dup_count)
        # log2(0) is undefined, so datasets without duplicates contribute nothing
        if dup_count != 0:
            weighted_dup_total += dup_count * np.log2(dup_count)
        num_successful_request += 1

    # log2(1) == 0 would make the ratio 0/0, and N == 0 would divide by zero
    if num_records <= 1:
        return 1.0

    h_max = np.log2(num_records)
    h_acc = h_max - (1 / num_records) * weighted_dup_total

    return h_acc / h_max

def getCompleteness(df_list):
    """Completeness indicator: share of records without any missing value.

    A record counts as incomplete when at least one of its columns is null.
    Each dataset inspected is recorded as one request in the module-level
    ``num_request`` / ``num_successful_request`` counters (in addition to the
    requests recorded by ``recCount``).

    Param  : df_list : list of dataframes
    Return : (total records - incomplete records) / total records
    """
    global num_request, num_successful_request

    total_rows = recCount(df_list)

    incomplete_rows = 0
    for frame in df_list:
        num_request += 1
        row_has_null = frame.isnull().any(axis=1)
        incomplete_rows += row_has_null.sum()
        num_successful_request += 1

    complete_rows = total_rows - incomplete_rows
    return complete_rows / total_rows
    

def getCurrentness(df_list):
    """Currentness indicator: share of records dated within the interquartile range.

    The datasets in ``df_list`` are concatenated, rows without a ``Date`` are
    dropped, and the remaining rows are sorted by date. The dates at the
    lower-quartile and upper-quartile positions bound an inclusive window; the
    number of rows whose date falls inside it is divided by the total record
    count (which still includes the rows that had no date).

    Param  : df_list : list of dataframes, each with a 'Date' column
    Return : float : rows inside the date window / total records; 0.0 when no
             row has a date
    """
    num_records = recCount(df_list)

    # Build the combined frame from the argument. The previous version ignored
    # df_list and always concatenated the notebook globals df_t1/df_t2/df_t3.
    dated = pd.concat(df_list).dropna(subset=['Date'])
    # NOTE(review): the strict '%m/%d/%Y' format raises on values written
    # differently (e.g. with '-' separators) - confirm the CSV is uniform.
    dated = dated.assign(
        Date=pd.to_datetime(dated['Date'], format='%m/%d/%Y')
    ).sort_values(by='Date')

    n = dated.shape[0]
    if n == 0:
        # No dated rows, so nothing can fall inside the window
        return 0.0

    # Zero-based positions of the lower and upper quartile in the sorted frame
    lower = int(np.ceil((n + 1) / 4)) - 1
    higher = int(np.ceil((3 / 4) * (n + 1))) - 1
    # With fewer than three rows the upper position would land past the end
    higher = min(higher, n - 1)

    date_lower = dated['Date'].iloc[lower]
    date_higher = dated['Date'].iloc[higher]
    in_window = dated['Date'].between(date_lower, date_higher, inclusive='both')
    count = dated.loc[in_window].shape[0]

    return count / num_records

def getAvailability():
    """Availability indicator: successful requests divided by all requests.

    Reads the module-level counters ``num_successful_request`` and
    ``num_request`` that the other measure functions update.
    """
    succeeded, attempted = num_successful_request, num_request
    return succeeded / attempted

In [42]:
# Total number of records across the three time-frame slices
recCount([df_t1,df_t2,df_t3])

9828

In [43]:
# Number of rows in time frame 2 that have at least one missing value
df_t2.isnull().any(axis=1).sum()

34

In [44]:
# Accuracy over all three time frames (also prints each frame's duplicate-row count)
getAccuracy([df_t1,df_t2,df_t3])

11
0
0


0.9997080550774292

In [45]:
# Completeness over all three time frames
getCompleteness([df_t1,df_t2,df_t3])

0.9930809930809931

In [15]:
# Share of the dataframe requests recorded so far that succeeded
getAvailability()

1.0

In [16]:
# Currentness over all three time frames
getCurrentness([df_t1,df_t2,df_t3])

0.4994912494912495

In [17]:
# Stand-alone walk-through of the currentness calculation that prints the
# intermediate values: combine the slices, keep dated rows, sort by date
combine_df = pd.concat([df_t1, df_t2, df_t3]).dropna(subset=['Date'])
combine_df = combine_df.assign(
    Date=pd.to_datetime(combine_df['Date'], format='%m/%d/%Y')
).sort_values(by='Date')

# Zero-based positions of the lower and upper quartile in the sorted frame
n = combine_df.shape[0]
lower = np.ceil((n+1)/4).astype('int64')-1
higher = np.ceil((3/4) *(n+1)).astype('int64')-1

# Dates found at those two positions
date_lower = combine_df['Date'].iloc[lower]
date_higher = combine_df['Date'].iloc[higher]
print(date_lower, date_higher, sep='\n')

# Rows whose date lies inside the inclusive [date_lower, date_higher] window
count = combine_df.loc[
    combine_df['Date'].between(date_lower, date_higher, inclusive='both')
].shape[0]
print(count)

2013-02-19 00:00:00
2019-05-28 00:00:00
4909


In [46]:
# First five rows of the combined, date-sorted frame
combine_df.head(5)

Unnamed: 0,ID,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,...,Total_income,Age,Years_employed,Income_type,Education_type,Family_status,Housing_type,Occupation_type,Target,Date
9299,5145770,0.0,1.0,1.0,0.0,0.0,0.0,0,4.0,6.0,...,112500.0,37.429927,5.404628,Working,Higher education,Married,House / apartment,Core staff,0.0,2010-01-01
2800,5041802,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,2.0,...,90000.0,44.825014,10.677837,Working,Secondary / secondary special,Married,House / apartment,Laborers,0.0,2010-01-02
2071,5033917,1.0,0.0,1.0,0.0,0.0,0.0,0,0.0,2.0,...,180000.0,35.63386,9.985147,Working,Secondary / secondary special,Married,House / apartment,Other,0.0,2010-01-02
2977,5045434,1.0,0.0,0.0,0.0,0.0,0.0,0,1.0,3.0,...,135000.0,39.79274,18.212557,Working,Secondary / secondary special,Married,Municipal apartment,Laborers,1.0,2010-01-02
8901,5137353,1.0,1.0,0.0,1.0,1.0,0.0,0,2.0,4.0,...,72000.0,33.758393,5.960424,Working,Secondary / secondary special,Married,House / apartment,Drivers,0.0,2010-01-03


### Vincularity

### Validity

In [56]:
def getCompliance(df_list):
    """Compliance indicator: mean share of PCI-compliant records per dataset.

    For every dataset the fraction of rows whose 'PCI_compliant' value equals
    'Yes' is computed; the result is the unweighted mean of those fractions.

    Param  : df_list : list of dataframes, each with a 'PCI_compliant' column
    Return : float : mean compliance ratio over the datasets
    Raises : ZeroDivisionError when df_list is empty
    """
    per_dataset_ratio = []
    for frame in df_list:
        # Count by comparison rather than value_counts()['Yes'], which raises
        # KeyError when a dataset contains no 'Yes' row at all
        num_comp_rec = (frame['PCI_compliant'] == 'Yes').sum()
        per_dataset_ratio.append(num_comp_rec / frame.shape[0])

    return sum(per_dataset_ratio) / len(per_dataset_ratio)

def getCredability(df_list, is_credible=None):
    """Credibility indicator: credible datasets divided by all datasets.

    Param  : df_list     : list of datasets
    Param  : is_credible : optional callable taking one dataset and returning
                           True when that dataset is credible. When omitted,
                           every dataset is counted as credible, as before.
    Return : float : number of credible datasets / total number of datasets
    Raises : ZeroDivisionError when df_list is empty
    """
    # total number of datasets
    nds_df_list = len(df_list)

    if is_credible is None:
        num_cred_dataset = nds_df_list
    else:
        num_cred_dataset = sum(1 for dataset in df_list if is_credible(dataset))

    # Divide by the dataset count. The previous version divided the credible
    # count by itself, which can only ever produce 1.0.
    mds_credability = num_cred_dataset / nds_df_list

    return mds_credability

In [52]:
# Compliance for time frame 1 (df_t1)
getCompliance([df_t1])

0.92125

In [53]:
# Compliance for time frame 2 (df_t2)
getCompliance([df_t2])

0.9163333333333333

In [54]:
# Compliance for time frame 3 (df_t3)
getCompliance([df_t3])

0.923974540311174

In [57]:
# Credibility for time frame 1 (df_t1)
getCredability([df_t1])

1.0

In [58]:
# Credibility for time frame 2 (df_t2)
getCredability([df_t2])

1.0

In [59]:
# Credibility for time frame 3 (df_t3)
getCredability([df_t3])

1.0