# **Description**

This notebook was used in the first iteration to verify that there are no duplicate test IDs, and to find test IDs that are present in the predictor files but not in the time series files, or vice versa.

In [None]:
import pandas as pd

# Read in the time series data

In [None]:
# Load the three time series tables: ECD, unsuccessful and successful tests.
ecd_ts = pd.read_csv('Data/Time Series/ECDTS/PC_TS.csv')
un_ts = pd.read_csv('Data/Time Series/unsuccesful time series/US_TS.csv')

# The successful time series is split across eight files named
# S_TS_1(1).csv ... S_TS_8(1).csv; read them in order and stack them.
direct = 'Data/Time Series/successful time series/S_TS_'
succ_ts = pd.concat(
    [pd.read_csv(direct + str(part) + '(1).csv') for part in range(1, 9)],
    # Each part arrives with its own 0..n-1 index; renumber the combined frame
    # so index labels are not repeated across the stacked parts.
    ignore_index=True,
)

# Read in the predictors data

In [None]:
# Load the predictor tables in the order: unsuccessful, ECD, successful.
un_pred, ecd_pred, succ_pred = map(pd.read_csv, [
    'Data/Raw Data Predictors/Unsuccessful_Readings.csv',
    'Data/Raw Data Predictors/ECD.csv',
    'Data/Raw Data Predictors/Successful_Readings.csv',
])

# Check the lengths of the data frames. 

In [None]:
# Number of rows in the ECD time series frame.
ecd_ts.shape[0]

In [None]:
# Number of rows in the ECD predictor frame.
ecd_pred.shape[0]

In [None]:
# Number of rows in the unsuccessful time series frame.
un_ts.shape[0]

In [None]:
# Number of rows in the unsuccessful predictor frame.
un_pred.shape[0]

In [None]:
# Number of rows in the combined successful time series frame.
succ_ts.shape[0]

In [None]:
# Number of rows in the successful predictor frame.
succ_pred.shape[0]

Evidently, there are some mismatches between which tests are present as time series and which are present with summary predictors. 

# Check for duplicates in all the data frames. 

In [None]:
# True when no row of the ECD time series frame exactly repeats an earlier row.
not ecd_ts.duplicated().any()

In [None]:
# True when no row of the ECD predictor frame exactly repeats an earlier row.
not ecd_pred.duplicated().any()

ECD files have no duplicates, so we are good there. 

In [None]:
# True when no row of the unsuccessful predictor frame exactly repeats an earlier row.
not un_pred.duplicated().any()

In [None]:
# Keep the first occurrence of every fully repeated row, then confirm that
# each TestID is left on exactly one row.
un_pred = un_pred.drop_duplicates(keep='first')
un_pred['TestID'].is_unique

In [None]:
# True when no row of the unsuccessful time series frame exactly repeats an earlier row.
not un_ts.duplicated().any()

In [None]:
# True when no row of the successful predictor frame exactly repeats an earlier row.
not succ_pred.duplicated().any()

In [None]:
# Keep the first occurrence of every fully repeated row, then confirm that
# each TestID is left on exactly one row.
succ_pred = succ_pred.drop_duplicates(keep='first')
succ_pred['TestID'].is_unique

# Make sure ECD and unsuccessful records are separate

In [None]:
# ECD time series ids that do not occur among the unsuccessful time series ids.
ecd_ts.loc[~ecd_ts['TestId'].isin(un_ts['TestId']), 'TestId']

All of the ECD time series ids are also in the unsuccessful time series ids, so they are duplicated there. 

In [None]:
# ECD predictor ids that do not occur among the unsuccessful predictor ids.
ecd_pred.loc[~ecd_pred['TestID'].isin(un_pred['TestID']), 'TestID']

None of the ECD predictor IDs are in the unsuccessful predictor file. 

In [None]:
# ECD predictor ids that do not occur among the unsuccessful time series ids.
ecd_pred.loc[~ecd_pred['TestID'].isin(un_ts['TestId']), 'TestID']

All the ECD predictors are in the unsuccessful **time series file**. I think this means that the ECD tests are duplicated in the unsuccessful time series records, **but not in the predictor records**.

# Find mismatches in the ECD data. 

In [None]:
# s1: ids in the ECD time series; s2: ids in the ECD predictors.
s1, s2 = ecd_ts['TestId'], ecd_pred['TestID']
# Time series ids that have no matching predictor id.
res = s1.loc[~s1.isin(s2)]
# Optional export:
#res.to_csv('missing_from_ECD_timeseries.csv', index = False)
res

There are two IDs in the time series data that are not in the predictor data. Let's check whether these IDs are in the unsuccessful predictors file.

In [None]:
# Rows of the unsuccessful predictors for test id 10474383 (if any).
un_pred.loc[un_pred['TestID'].eq(10474383)]

In [None]:
# Rows of the unsuccessful predictors for test id 10466205 (if any).
un_pred.loc[un_pred['TestID'].eq(10466205)]

Nope. The predictor data for test ids 10466205 and 10474383 are just missing from the dataset. 

In [None]:
# Reverse direction: ids in s2 (predictor ids) that are absent from s1 (time series ids).
s2.loc[~s2.isin(s1)]

# Find mismatches in the unsuccessful data

In [None]:
# s1: ids in the unsuccessful time series; s2: ids in the unsuccessful predictors.
s1, s2 = un_ts['TestId'], un_pred['TestID']

# Ids found in the time series but absent from the predictors.
res = pd.DataFrame(s1.loc[~s1.isin(s2)])
print(str(res.shape[0]) + ' records are present in time series but not in predictors')
# Optional export:
#res.to_csv('missing_from_unsuccessful_predictors.csv', index = False)

# Ids found in the predictors but absent from the time series.
res = pd.DataFrame(s2.loc[~s2.isin(s1)])
print(str(res.shape[0]) + ' records are present in predictors but not in time series')
# Optional export:
#res.to_csv('missing_from_unsuccessful_timeseries.csv', index = False)

# Find mismatches in the successful data

In [None]:
# s1: ids in the successful time series; s2: ids in the successful predictors.
s1, s2 = succ_ts['TestId'], succ_pred['TestID']

# Ids found in the time series but absent from the predictors.
res = pd.DataFrame(s1.loc[~s1.isin(s2)])
# Optional export:
#res.to_csv('missing_from_successful_predictors.csv', index = False)
print(str(res.shape[0]) + ' records are present in time series but not in predictors')

# Ids found in the predictors but absent from the time series.
res = pd.DataFrame(s2.loc[~s2.isin(s1)])
# Optional export:
#res.to_csv('missing_from_successful_timeseries.csv', index = False)
print(str(res.shape[0]) + ' records are present in predictors but not in time series')

# Explore the data

In [None]:
# Preview the first five rows of the unsuccessful predictors.
un_pred.head(n=5)