# **Create_a_test_set.ipynb**

To evaluate the final model, I am going to set aside a held-out test set that will not be used for training or validation.

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# project files stored on Drive can be accessed through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import warnings

import pandas as pd

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Project directory on the mounted Drive. Only a message is printed when it
# is missing; execution continues regardless.
folder = '/content/drive/My Drive/Colab Notebooks/COVID-19'
if not os.path.exists(folder):
  print(f'{folder} does not exist')


Read in the dataframes created previously.

In [5]:
# Show every column when a dataframe is printed. The fully qualified option
# name is used because the abbreviated 'max_columns' pattern matches more than
# one option in newer pandas releases and raises OptionError there.
pd.set_option('display.max_columns', None)

# Load the per-class dataframes created previously; the first CSV column holds
# the saved index.
df_healthy = pd.read_csv(os.path.join(folder, 'df_healthy.csv'), index_col=0)
df_covid = pd.read_csv(os.path.join(folder, 'df_covid.csv'), index_col=0)

# Quick look at the first rows of each class and the size of the healthy set.
print(df_healthy.head(5))
print(df_covid.head(5))
print(df_healthy.shape)

                                     id   age  gender   status  \
0  00039425-7f3a-42aa-ac13-834aaa2b6b92  15.0    male  healthy   
1  0009eb28-d8be-4dc1-92bb-907e53bc5c7a  34.0    male  healthy   
2  001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f  21.0    male  healthy   
3  001e2f19-d81c-4029-b33c-d2db56b23a4a  20.0    male  healthy   
4  0028b68c-aca4-4f4f-bb1d-cb4ed5bbd952  28.0  female  healthy   

                                          audio_file  \
0  /content/drive/My Drive/Colab Notebooks/coughv...   
1  /content/drive/My Drive/Colab Notebooks/coughv...   
2  /content/drive/My Drive/Colab Notebooks/coughv...   
3  /content/drive/My Drive/Colab Notebooks/coughv...   
4  /content/drive/My Drive/Colab Notebooks/coughv...   

                                          image_file    source  cough_detected  
0  /content/drive/My Drive/Colab Notebooks/coughv...  coughvid          0.9609  
1  /content/drive/My Drive/Colab Notebooks/coughv...  coughvid          0.9301  
2  /content/drive/My Dr

The rows are ordered by source: all the coughvid rows come first, then coswara, and finally virufy. Hence I am going to shuffle them before taking some for the test set.

In [6]:
# Fixed seed so that the shuffle - and therefore the rows later taken from the
# head of each frame for the test set - is identical on every
# Restart & Run All.
SHUFFLE_SEED = 42

# sample(frac=1) returns all rows in random order; reset_index(drop=True)
# discards the old index so that positions run 0..n-1 again.
# Note: the names are rebound, so re-running only this cell shuffles the
# already-shuffled frames a second time.
df_healthy = df_healthy.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_covid = df_covid.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print(df_healthy.head(5))
print(df_covid.head(5))

                                     id   age gender   status  \
0  78be0aca-207b-4a42-8c7d-de4ed3e8b5b7  24.0   male  healthy   
1  ce0c319b-da2c-4f90-9050-108a8156acb5  38.0   male  healthy   
2  ab27b681-d095-47d5-834e-31630018415b  41.0   male  healthy   
3  ef2d6bd0-f305-409d-b430-583b67988e07  18.0   male  healthy   
4  3585c07f-9f17-4b76-9994-0cdb4d314f84  43.0   male  healthy   

                                          audio_file  \
0  /content/drive/My Drive/Colab Notebooks/coughv...   
1  /content/drive/My Drive/Colab Notebooks/coughv...   
2  /content/drive/My Drive/Colab Notebooks/coughv...   
3  /content/drive/My Drive/Colab Notebooks/coughv...   
4  /content/drive/My Drive/Colab Notebooks/coughv...   

                                          image_file    source  cough_detected  
0  /content/drive/My Drive/Colab Notebooks/coughv...  coughvid          0.9947  
1  /content/drive/My Drive/Colab Notebooks/coughv...  coughvid          1.0000  
2  /content/drive/My Drive/Co

In [7]:
# Row/column counts of the healthy and COVID-19 frames, one per line.
print(df_healthy.shape, df_covid.shape, sep='\n')

(11134, 8)
(1267, 8)


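I am going to create a test set of 1000 entries with approximately the same class proportions as the data I have: 114 that are COVID-19 positive and 886 that are COVID-19 negative (healthy). Note that 114:886 reflects the ratio of COVID-19 rows to healthy rows (1267 / 11134 ≈ 0.114); measured against all 12401 rows, the COVID-19 share is 1267 / 12401 ≈ 0.102, so an exactly proportional split would be 102 positive and 898 negative.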

In [8]:
# Number of rows of each class placed in the test set (1000 rows in total).
# NOTE(review): 114/886 mirrors the covid:healthy row ratio
# (1267 / 11134, approx. 0.114). The covid share of all rows is
# 1267 / 12401, approx. 0.102, which would give 102/898 - confirm which
# proportion was intended before changing these values.
N_TEST_HEALTHY = 886
N_TEST_COVID = 114

# Fixed seed so the row order of the mixed splits is reproducible.
SPLIT_SEED = 42

# Fail early if either class is too small to leave any rows for training.
assert len(df_healthy) > N_TEST_HEALTHY, 'not enough healthy rows for the test set'
assert len(df_covid) > N_TEST_COVID, 'not enough COVID-19 rows for the test set'

# The frames were shuffled earlier, so taking the leading rows is a random draw.
# NOTE(review): this is a row-level split. If `id` can repeat (e.g. several
# recordings from one participant), the same participant could end up in both
# splits - verify against the upstream data.
healthy_test = df_healthy.iloc[:N_TEST_HEALTHY]
healthy_train = df_healthy.iloc[N_TEST_HEALTHY:]

covid_test = df_covid.iloc[:N_TEST_COVID]
covid_train = df_covid.iloc[N_TEST_COVID:]

df_test = pd.concat([healthy_test, covid_test])
df_train = pd.concat([healthy_train, covid_train])

# Shuffle rows again so that healthy and COVID-19 are mixed.
df_test = df_test.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
df_train = df_train.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Every input row must land in exactly one of the two splits.
assert len(df_test) == N_TEST_HEALTHY + N_TEST_COVID
assert len(df_test) + len(df_train) == len(df_healthy) + len(df_covid)

print(df_test.shape)
print(df_test.head(10))

print(df_train.shape)
print(df_train.head(10))


(1000, 8)
                                     id   age  gender    status  \
0          y6VWQQ5bW0drHBBQ74CmfwHniKo2  37.0  female  COVID-19   
1  e614bfdf-0264-4094-8585-a1cd4068f503  22.0    male   healthy   
2  b4b8c8d8-37cf-48e5-b5af-12009c442239  30.0    male   healthy   
3  8b04c1fa-4fbe-45f3-9b4a-5cc6a450c43c  32.0    male   healthy   
4  b24347b4-ab14-46c7-9863-c62cd5dd0659  28.0    male   healthy   
5          DqztVX8gWrOi4il3xouSTV4FwyC3  43.0    male  COVID-19   
6          466aPruIIbOEG3V1IaKzu5O8kun2  41.0    male   healthy   
7  ce3f9e37-b553-4c23-b341-8da601f7468e  23.0    male   healthy   
8  843d0600-495e-441c-88f9-18695bf67b24  40.0    male   healthy   
9  9c9d5f33-e8e4-4483-a9ba-992fe633e486  20.0  female  COVID-19   

                                          audio_file  \
0  /content/drive/My Drive/Colab Notebooks/coswar...   
1  /content/drive/My Drive/Colab Notebooks/coughv...   
2  /content/drive/My Drive/Colab Notebooks/coughv...   
3  /content/drive/My Drive/C

Save the df_test and df_train dataframes, ready for use in the next stage.

In [9]:
# Write both splits into the project folder as CSV files.
test_csv_path = os.path.join(folder, 'df_test.csv')
train_csv_path = os.path.join(folder, 'df_train.csv')
df_test.to_csv(test_csv_path)
df_train.to_csv(train_csv_path)

Check that the files have saved correctly.

In [10]:
# Re-read the files that were just written into separate names, so that the
# saved data can be compared with the in-memory frames before replacing them.
df_test_saved = pd.read_csv(os.path.join(folder, 'df_test.csv'), index_col=0)
df_train_saved = pd.read_csv(os.path.join(folder, 'df_train.csv'), index_col=0)

# The round trip through CSV must preserve the row/column counts and the
# column names of both splits.
assert df_test_saved.shape == df_test.shape, 'df_test.csv does not match df_test'
assert df_train_saved.shape == df_train.shape, 'df_train.csv does not match df_train'
assert list(df_test_saved.columns) == list(df_test.columns)
assert list(df_train_saved.columns) == list(df_train.columns)

# Continue with the re-read versions, as the original cell did.
df_test = df_test_saved
df_train = df_train_saved

print(df_test.shape)
print(df_test.head(10))

print(df_train.shape)
print(df_train.head(10))

(1000, 8)
                                     id   age  gender    status  \
0          y6VWQQ5bW0drHBBQ74CmfwHniKo2  37.0  female  COVID-19   
1  e614bfdf-0264-4094-8585-a1cd4068f503  22.0    male   healthy   
2  b4b8c8d8-37cf-48e5-b5af-12009c442239  30.0    male   healthy   
3  8b04c1fa-4fbe-45f3-9b4a-5cc6a450c43c  32.0    male   healthy   
4  b24347b4-ab14-46c7-9863-c62cd5dd0659  28.0    male   healthy   
5          DqztVX8gWrOi4il3xouSTV4FwyC3  43.0    male  COVID-19   
6          466aPruIIbOEG3V1IaKzu5O8kun2  41.0    male   healthy   
7  ce3f9e37-b553-4c23-b341-8da601f7468e  23.0    male   healthy   
8  843d0600-495e-441c-88f9-18695bf67b24  40.0    male   healthy   
9  9c9d5f33-e8e4-4483-a9ba-992fe633e486  20.0  female  COVID-19   

                                          audio_file  \
0  /content/drive/My Drive/Colab Notebooks/coswar...   
1  /content/drive/My Drive/Colab Notebooks/coughv...   
2  /content/drive/My Drive/Colab Notebooks/coughv...   
3  /content/drive/My Drive/C