<a href="https://colab.research.google.com/github/Janina712/MLTSA_FinalProject/blob/main/edf_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Here we collect all `.edf` files we need

In [11]:
import os
import shutil
from collections import Counter
from getpass import getpass
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Mount Google Drive so that the /content/drive/MyDrive/... paths used below exist in Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Directory listing of the TUH EEG Seizure corpus v1.5.2 `edf/` tree; the link crawl starts here.
root_link = 'https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/'

In [16]:
# Google Drive folder whose contents are listed in the "Reading files from the folder" section.
drive_tuh_eeg_loc = '/content/drive/MyDrive/Classes/22S-PHYS-667/project/isip_tuh_eeg'

## Getting all downloadable links

In [None]:
# HTTP basic-auth credentials sent with every request to the download server.
# They are taken from the environment when set and asked for interactively
# otherwise, so the real values are never stored in the saved notebook.
login = os.environ.get('TUH_EEG_LOGIN') or input('TUH EEG login: ')
passwd = os.environ.get('TUH_EEG_PASSWD') or getpass('TUH EEG password: ')

In [4]:
def get_suff_list(link, folders:bool=True):
  """List the href suffixes of the sub-folders or files shown on a listing page.

  The page at `link` is fetched with HTTP basic auth (module-level `login` and
  `passwd`) and every anchor is classified by its visible text:

  * folder entry: the text contains exactly one '/' and no '.'
  * file entry:   the text contains no '/' and exactly one '.'

  Args:
    link: URL of the directory-listing page to scan.
    folders: when True return the folder entries, otherwise the file entries.

  Returns:
    List of href strings of the matching anchors, in page order.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
  """
  response = requests.get(link, auth=(login, passwd))
  # Without this check an error page (e.g. bad credentials) would be parsed as
  # a listing with no entries and the crawl would silently find nothing.
  response.raise_for_status()

  # Name the parser explicitly so the result does not depend on which optional
  # parser libraries happen to be installed.
  soup = BeautifulSoup(response.text, 'html.parser')

  current_suff_list = []
  for anchor in soup.find_all('a'):
    href = anchor.get('href')
    if not href:
      # An anchor without a target cannot be appended to a URL by the caller.
      continue

    text = str(anchor.text)
    n_slashes = text.count('/')
    n_dots = text.count('.')
    is_folder = n_slashes == 1 and n_dots == 0
    is_file = n_slashes == 0 and n_dots == 1

    if (folders and is_folder) or (not folders and is_file):
      current_suff_list.append(href)

  return current_suff_list

In [None]:
# Breadth-first crawl of the directory tree below `root_link`.
# `cur_folders` is the queue of folder URLs still to visit; a folder that lists
# no sub-folders is treated as a leaf and its files are added to
# `files_to_download`. Files sitting next to sub-folders are not collected.
cur_folders = [root_link]
files_to_download = []

# Deliberately not called `iter`: that would shadow the builtin in every later cell.
n_iter = 0

while cur_folders:
  n_iter += 1
  # Take the oldest queued folder (FIFO keeps the visiting order breadth-first).
  current_root_link = cur_folders.pop(0)
  folds_to_check = get_suff_list(current_root_link, folders = True)
  if not folds_to_check:
    files_to_download += [
        current_root_link + fname for fname in get_suff_list(current_root_link, folders = False)
    ]

  # hrefs are relative to the page they were found on
  cur_folders.extend(current_root_link + suff for suff in folds_to_check)

  # Progress report every 100 visited folders
  if not n_iter % 100:
    print(f'iter: {n_iter}, curr_root: {current_root_link}, list: {len(cur_folders)}, files: {len(files_to_download)}')

  

iter: 100, curr_root: https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/eval/03_tcp_ar_a/049/, list: 347, files: 0
iter: 200, curr_root: https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/train/01_tcp_ar/126/, list: 538, files: 0
iter: 300, curr_root: https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/train/03_tcp_ar_a/036/, list: 737, files: 0
iter: 400, curr_root: https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/dev/03_tcp_ar_a/059/00005943/, list: 933, files: 0
iter: 500, curr_root: https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/train/01_tcp_ar/061/00006103/, list: 1084, files: 0
iter: 600, curr_root: https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/train/01_tcp_ar/089/00008971/, list: 1214, files: 0
iter: 700, curr_root: https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_

### Saving the list of links

In [None]:
# Write the crawled links to a text file, one URL per line.
# NOTE(review): this goes to the current working directory, whereas the
# "Downloading files" section reads the list from a Google Drive path;
# presumably the file was copied there by hand. Confirm.
with open('all_tuh_eeg_seizure_links.txt', 'w') as links_file:
  links_file.writelines(f'{url}\n' for url in files_to_download)

## Downloading files

In [6]:
# Read the saved link list back: strip surrounding whitespace and drop blank lines.
with open('/content/drive/MyDrive/Classes/22S-PHYS-667/project/all_tuh_eeg_seizure_links.txt', 'r') as links_file:
  stripped_lines = (raw_line.strip() for raw_line in links_file)
  links_to_download = [url for url in stripped_lines if url]

In [7]:
# Peek at the first few entries to check the list was read as expected.
links_to_download[:3]

['https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/dev/01_tcp_ar/002/00000258/s002_2003_07_21/00000258_s002.txt',
 'https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/dev/01_tcp_ar/002/00000258/s002_2003_07_21/00000258_s002_t000.edf',
 'https://isip.piconepress.com/projects/tuh_eeg/downloads/tuh_eeg_seizure/v1.5.2/edf/dev/01_tcp_ar/002/00000258/s002_2003_07_21/00000258_s002_t000.lbl']

In [8]:
# Total number of links read from the list file.
len(links_to_download)

22503

In [10]:
# Select the links that end in '.tse' and report how many there are.
# NOTE(review): the variable name says "edf" but the filter keeps '.tse' files;
# the name is left unchanged because later cells refer to it. Confirm which
# extension is intended before re-running the download.
edf_links_to_download = [url for url in links_to_download if url.endswith('.tse')]
len(edf_links_to_download)

6635

In [11]:
# Tally the links by their last three characters (the extension without the dot)
# and show the counts from most to least frequent.
file_extensions_counter = Counter(url[-3:] for url in links_to_download)
file_extensions_counter.most_common()

[('edf', 6635), ('lbl', 6635), ('tse', 6635), ('txt', 1575), ('rec', 1023)]

In [12]:
# Preview the flattened local file name for one link. Splitting the URL on '/'
# puts the first path component below `root_link` (e.g. 'dev') at index 9, so
# [9:] is the path relative to the `edf/` root; its parts are joined with '__'.
'__'.join(edf_links_to_download[3].split('/')[9:])

'dev__01_tcp_ar__002__00000258__s003_2003_07_22__00000258_s003_t001.tse'

In [13]:
# Destination folder for the downloads; the same Drive location as
# `drive_tuh_eeg_loc`, written here with a trailing slash.
target_folder = '/content/drive/MyDrive/Classes/22S-PHYS-667/project/isip_tuh_eeg/'

In [14]:
# Download every selected link into `target_folder`. The remote directory
# structure is flattened into the file name: the URL parts from index 9 onward
# (the path below the `edf/` root) are joined with '__'.
target_dir = Path(target_folder)
target_dir.mkdir(parents=True, exist_ok=True)

for link in tqdm(edf_links_to_download[:]):
  out_fname = '__'.join(
      link.split('/')[9:]
  )
  # The context manager releases the connection even if the copy fails.
  with requests.get(
      link,
      stream=True,
      auth=(
        login,
        passwd,
      )
  ) as binary_data:
    # Stop before creating the local file, so an error page is never saved
    # under a data file's name.
    binary_data.raise_for_status()
    # The raw stream is not content-decoded by default; ask for gzip/deflate
    # transfer encodings to be undone while copying.
    binary_data.raw.decode_content = True
    with open(target_dir.joinpath(out_fname), 'wb') as f:
      shutil.copyfileobj(binary_data.raw, f)

100%|██████████| 6635/6635 [14:32<00:00,  7.61it/s]


## Reading files from the folder

In [19]:
# Collect every entry of the Drive download folder and show the first few.
download_dir = Path(drive_tuh_eeg_loc)
downloaded_files = [*download_dir.glob('*')]
downloaded_files[:3]

[PosixPath('/content/drive/MyDrive/Classes/22S-PHYS-667/project/isip_tuh_eeg/train__02_tcp_le__058__00005804__s002_2008_11_05__00005804_s002_t003.tse'),
 PosixPath('/content/drive/MyDrive/Classes/22S-PHYS-667/project/isip_tuh_eeg/train__02_tcp_le__058__00005804__s002_2008_11_05__00005804_s002_t004.tse'),
 PosixPath('/content/drive/MyDrive/Classes/22S-PHYS-667/project/isip_tuh_eeg/train__02_tcp_le__058__00005804__s002_2008_11_05__00005804_s002_t005.tse')]

In [15]:
# Split the first ten flattened file names back into their '__'-separated parts
# and show them as a table, one column per part.
column_names = [
  'type',      # top-level folder of the corpus tree, e.g. 'train' or 'dev'
  2,           # e.g. '02_tcp_le'; presumably the recording configuration (TODO: confirm and name)
  3,           # three-digit folder; in the sample it equals digits 4-6 of column 4
  4,           # eight-digit id that also prefixes the file name (presumably the patient id)
  5,           # session folder, e.g. 's002_2008_11_05'
  'filename',
]
name_parts = [path.name.split('__') for path in downloaded_files[:10]]
pd.DataFrame(name_parts, columns=column_names)

Unnamed: 0,type,2,3,4,5,filename
0,train,02_tcp_le,58,5804,s002_2008_11_05,00005804_s002_t003.tse
1,train,02_tcp_le,58,5804,s002_2008_11_05,00005804_s002_t004.tse
2,train,02_tcp_le,58,5804,s002_2008_11_05,00005804_s002_t005.tse
3,train,02_tcp_le,58,5804,s002_2008_11_05,00005804_s002_t006.tse
4,train,02_tcp_le,60,6083,s003_2010_10_11,00006083_s003_t001.tse
5,train,02_tcp_le,60,6083,s004_2010_10_12,00006083_s004_t001.tse
6,train,02_tcp_le,60,6083,s004_2010_10_12,00006083_s004_t002.tse
7,train,02_tcp_le,60,6087,s005_2011_04_11,00006087_s005_t000.tse
8,train,02_tcp_le,60,6087,s006_2011_04_12,00006087_s006_t000.tse
9,train,02_tcp_le,61,6134,s001_2009_08_07,00006134_s001_t000.tse
