In [1]:
import unittest
import source.check_data as chd
import pandas as pd
import numpy as np
from pathlib import Path
import os
import random

## Make test data: 

In [2]:
random.seed(43)

### Complex test data: 

#### Random floats between 0 and 8 to simulate time stamps measured in hours, then sort them from smallest to biggest:

In [3]:
round_counter = 0
random_floats = []
while round_counter < 10:
    random_float = random.uniform(0, 8)
    random_floats.append(random_float)
    round_counter += 1
random_floats

[0.30841471469904036,
 5.5697945810964224,
 1.1514657711628882,
 3.7002580386327004,
 5.373174112942136,
 6.343610173242355,
 3.6255138277296988,
 3.9861778383844095,
 0.15325686419478224,
 3.4591107116115216]

In [4]:
np.array(random_floats).dtype

dtype('float64')

In [5]:
random_floats.sort()
random_floats

[0.15325686419478224,
 0.30841471469904036,
 1.1514657711628882,
 3.4591107116115216,
 3.6255138277296988,
 3.7002580386327004,
 3.9861778383844095,
 5.373174112942136,
 5.5697945810964224,
 6.343610173242355]

#### Put everything into a dataframe:

In [6]:
test_data_complex = pd.DataFrame({'pseudo_time_stamp': random_floats})
test_data_complex

Unnamed: 0,pseudo_time_stamp
0,0.153257
1,0.308415
2,1.151466
3,3.459111
4,3.625514
5,3.700258
6,3.986178
7,5.373174
8,5.569795
9,6.34361


#### Categorize data into bins according to one hour time intervals by adding time stamp bins:

In [7]:
test_data_complex["time_stamps_bins"] = pd.cut(test_data_complex["pseudo_time_stamp"],
                               bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, np.inf],
                               labels=[1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
test_data_complex

Unnamed: 0,pseudo_time_stamp,time_stamps_bins
0,0.153257,1
1,0.308415,1
2,1.151466,2
3,3.459111,4
4,3.625514,4
5,3.700258,4
6,3.986178,4
7,5.373174,6
8,5.569795,6
9,6.34361,7


### Engineer the number of different subject Ids

In [9]:
# Number of different categories per bin:

pseudo_subjects_1 = ['Sub_01', 'Sub_01'] # First bin, one category.
b1 = 1
pseudo_subjects_2 = ['Sub_02'] # Second bin, one category.
b2 = 1
pseudo_subjects_4 = ['Sub_02', 'Sub_03', 'Sub_02', 'Sub_01'] # Fourth bin, three different categories.
b4 = 3
pseudo_subjects_6 = ['Sub_04', 'Sub_01'] # Sixth bin, two categories.
b6 = 2
pseudo_subjects_7 = ['Sub_02'] # Seventh bin, one category.
b7 = 1

In [10]:
cat_num_list_complex = [b1] + [b2] + [b4] + [b6] + [b7]
cat_num_list_complex
time_bin_list_complex = [1, 2, 4, 6, 7]
time_bin_list_complex

[1, 2, 4, 6, 7]

In [11]:
print(len(set(pseudo_subjects_1)))
print(b1)
print(len(set(pseudo_subjects_2)))
print(b2)
print(len(set(pseudo_subjects_4)))
print(b4)
print(len(set(pseudo_subjects_6)))
print(b6)

1
1
1
1
3
3
2
2


In [12]:
pseudo_subjects = pseudo_subjects_1 + pseudo_subjects_2 + pseudo_subjects_4 + \
pseudo_subjects_6 +pseudo_subjects_7

In [13]:
test_data_complex['subject'] = pseudo_subjects
test_data_complex

Unnamed: 0,pseudo_time_stamp,time_stamps_bins,subject
0,0.153257,1,Sub_01
1,0.308415,1,Sub_01
2,1.151466,2,Sub_02
3,3.459111,4,Sub_02
4,3.625514,4,Sub_03
5,3.700258,4,Sub_02
6,3.986178,4,Sub_01
7,5.373174,6,Sub_04
8,5.569795,6,Sub_01
9,6.34361,7,Sub_02


### Engineer desired output based on the test-data:

In [14]:
desired_output_complex = pd.DataFrame({'time_bin': time_bin_list_complex, 
                                       'number_unique_subjects': cat_num_list_complex})
desired_output_complex

Unnamed: 0,time_bin,number_unique_subjects
0,1,1
1,2,1
2,4,3
3,6,2
4,7,1


### Simple test data 1:

#### Choose floats simulating time stamps:

In [15]:
floats = np.array([0.2, 1.5, 1.9])

In [16]:
test_data_simple_1 = pd.DataFrame({'pseudo_time_stamp': floats})
test_data_simple_1

Unnamed: 0,pseudo_time_stamp
0,0.2
1,1.5
2,1.9


#### Categorize time stamps into bins by one hour intervals:

In [17]:
test_data_simple_1["time_stamps_bins"] = pd.cut(test_data_simple_1["pseudo_time_stamp"],
                               bins=[0, 1, np.inf],
                               labels=[1, 2])

In [18]:
test_data_simple_1

Unnamed: 0,pseudo_time_stamp,time_stamps_bins
0,0.2,1
1,1.5,2
2,1.9,2


#### Engineer the number of different subject Ids

In [19]:
# Number of different categories per bin:
pseudo_subjects_1_simple_1 = ['Sub_01'] # First bin, one category.
b1_simple_1 = 1
pseudo_subjects_2_simple_1 = ['Sub_01', 'Sub_02'] # Second bin, two categories.
b2_simple_1 = 2

In [20]:
# List of category numbers:
cat_num_list_simple_1 = [b1_simple_1] + [b2_simple_1]
print(cat_num_list_simple_1)
# List of hour bins:
time_bin_list_simple_1 = [1, 2]
print(time_bin_list_simple_1)

[1, 2]
[1, 2]


In [21]:
# Check number of subjects in bin 1:
print('number of subjects in bin 1:')
print(len(set(pseudo_subjects_1_simple_1)))
print(b1_simple_1)
# Check number of subjects in bin 2:
print('number of subjects in bin 2:')
print(len(set(pseudo_subjects_2_simple_1)))
print(b2_simple_1)

number of subjects in bin 1:
1
1
number of subjects in bin 2:
2
2


In [22]:
pseudo_subjects_simple_1 = pseudo_subjects_1_simple_1 + pseudo_subjects_2_simple_1

In [23]:
test_data_simple_1['subject'] = pseudo_subjects_simple_1
test_data_simple_1

Unnamed: 0,pseudo_time_stamp,time_stamps_bins,subject
0,0.2,1,Sub_01
1,1.5,2,Sub_01
2,1.9,2,Sub_02


### Engineer desired output based on the test-data:

In [24]:
desired_output_simple_1 = pd.DataFrame({'time_bin': time_bin_list_simple_1, 
                                      'number_unique_subjects': cat_num_list_simple_1})
desired_output_simple_1

Unnamed: 0,time_bin,number_unique_subjects
0,1,1
1,2,2


### Choose test scenario:
Simple or complex data, desired_output matching or not, time_bin_list matching or not:
Not matching output is a test for the test procedure.
Not matching time bin list is testing if the function fails whith a wrong time bin list.

In [36]:
# Test data simple, desired output matching:
test_data = test_data_simple_1
desired_output = desired_output_simple_1
time_bin_list = time_bin_list_simple_1
# Expected outcome: Test OK!

In [43]:
# Test data complex desired output matching:
test_data = test_data_complex
desired_output = desired_output_complex
time_bin_list = time_bin_list_complex
# Expected outcome: Test OK!

In [44]:
# Test data complex desired output not matching:
#test_data = test_data_complex
#desired_output = desired_output_complex
#time_bin_list = time_bin_list_simple_1
# Expected outcome: Test failed!

In [45]:
# Test data complex desired time_bin_list not matching:
#test_data = test_data_complex
#desired_output = desired_output_complex
#time_bin_list = time_bin_list_simple_1
# Expected outcome: Test failed!

### Visually compare output to desired output:

In [46]:
chd.count_unique_subjects_per_hour(test_data, time_bin_list)

Unnamed: 0,time_bin,number_unique_subjects
0,1,1
1,2,1
2,4,3
3,6,2
4,7,1


In [47]:
desired_output

Unnamed: 0,time_bin,number_unique_subjects
0,1,1
1,2,1
2,4,3
3,6,2
4,7,1


## Implement Unit-Test:

In [48]:
class TestCheckData(unittest.TestCase):
    def setUp(self):
        self.data = test_data
    # tests go here:
    def test_inputExists(self):
        self.assertIsNotNone(self.data)
        
    def test_inputType(self):
        self.assertIsInstance(self.data, pd.core.frame.DataFrame)
        
    def test_functReturnsSomething(self):
        self.assertIsNotNone(chd.count_unique_subjects_per_hour(self.data, time_bin_list))

    def test_outputType(self):
        self.assertIsInstance(chd.count_unique_subjects_per_hour(self.data, time_bin_list), pd.core.frame.DataFrame)

    def test_outputDimensions(self):
        df_output = chd.count_unique_subjects_per_hour(self.data, time_bin_list)
        df_output_desired = desired_output
        self.assertEqual(True, df_output.shape == df_output_desired.shape)
    
    def test_outputEqualsDesiredOutput(self):
        df_output = chd.count_unique_subjects_per_hour(self.data, time_bin_list)
        df_output_desired = desired_output
        self.assertEqual(True, df_output.equals(df_output_desired))

        
    
#if __name__ == "__main__":
 #   unittest.main()

unittest.main(argv=[''], verbosity=2, exit=False)
    

test_functReturnsSomething (__main__.TestCheckData.test_functReturnsSomething) ... ok
test_inputExists (__main__.TestCheckData.test_inputExists) ... ok
test_inputType (__main__.TestCheckData.test_inputType) ... ok
test_outputDimensions (__main__.TestCheckData.test_outputDimensions) ... ok
test_outputEqualsDesiredOutput (__main__.TestCheckData.test_outputEqualsDesiredOutput) ... ok
test_outputType (__main__.TestCheckData.test_outputType) ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.016s

OK


<unittest.main.TestProgram at 0xffff47f78e60>