# Getting Data

In [3]:
import os
import shutil
from io import BytesIO
from urllib.parse import urljoin
from urllib.request import urlopen
from zipfile import ZipFile

import requests
import urllib3
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [4]:
# Getting all the url for the files
# Fetch the CAVIAR index page and parse it so later cells can scan its <a> tags.
url = 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA1/'
http = urllib3.PoolManager()
response = http.request('GET', url)
# urllib3 does not raise on HTTP error statuses; stop here instead of
# parsing an error page and silently ending up with empty URL lists.
if response.status != 200:
    raise RuntimeError('Failed to fetch {}: HTTP {}'.format(url, response.status))
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(response.data, 'html.parser')

In [5]:
# Getting the video urls
# Collect absolute URLs for every anchor whose href is a '..'-relative link
# ending in 'mpg'.
# - find_all('a', href=True) skips anchors without an href, which would
#   otherwise make .startswith fail on None.
# - urljoin resolves the '../CAVIARDATA2/...' href against the index page URL;
#   os.path.join would use the OS path separator, and url.replace('1', '2')
#   would rewrite every '1' in the URL.
video_url_list = [
    urljoin(url, link['href'])
    for link in soup.find_all('a', href=True)
    if link['href'].startswith('..') and link['href'].endswith('mpg')
]
video_url_list

['https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/WalkByShop1cor/WalkByShop1cor.mpg',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/WalkByShop1front/WalkByShop1front.mpg',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/EnterExitCrossingPaths1cor/EnterExitCrossingPaths1cor.mpg',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/EnterExitCrossingPaths1front/EnterExitCrossingPaths1front.mpg',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/EnterExitCrossingPaths2cor/EnterExitCrossingPaths2cor.mpg',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/EnterExitCrossingPaths2front/EnterExitCrossingPaths2front.mpg',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/OneLeaveShop1cor/OneLeaveShop1cor.mpg',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/OneLeaveShop1front/OneLeaveShop1front.mpg',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/OneLeaveShop2cor/OneLeaveShop2cor.mpg',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/OneLeaveShop2front/OneLeaveShop2front.mpg',
 'https://homepage

In [6]:
# Getting the images urls
image_url_list = []
for link in soup.find_all('a'):
    if link.get('href').startswith('..') and link.get('href').endswith('gz'):
        image_url_list.append(os.path.join(url.replace('1', '2') ,link.get('href').replace('../CAVIARDATA2/', '')))
image_url_list

['https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/WalkByShop1cor/WalkByShop1cor.tar.gz',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/WalkByShop1front/WalkByShop1front.tar.gz',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/EnterExitCrossingPaths1cor/EnterExitCrossingPaths1cor.tar.gz',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/EnterExitCrossingPaths1front/EnterExitCrossingPaths1front.tar.gz',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/EnterExitCrossingPaths2cor/EnterExitCrossingPaths2cor.tar.gz',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/EnterExitCrossingPaths2front/EnterExitCrossingPaths2front.tar.gz',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/OneLeaveShop1cor/OneLeaveShop1cor.tar.gz',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/OneLeaveShop1front/OneLeaveShop1front.tar.gz',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/OneLeaveShop2cor/OneLeaveShop2cor.tar.gz',
 'https://homepages.inf.ed.ac.uk/rbf/CAVIARDATA2/OneLeaveShop2front/OneLeaveShop2fro

## Downloading the Data

In [7]:
# Output folders for the downloaded videos and the extracted image frames.
videos_dir = 'raw_data/videos'
images_dir = 'raw_data/images'

# exist_ok=True creates missing parent folders and is a no-op when the
# directory is already there, so this cell can be re-run safely.
os.makedirs(videos_dir, exist_ok=True)
os.makedirs(images_dir, exist_ok=True)

In [9]:
# Downloading the video data
# Each URL in video_url_list is saved into videos_dir under its last path segment.
for path in tqdm(video_url_list):
    filename = path.split('/')[-1]
    # stream=True + iter_content writes the video to disk chunk by chunk instead
    # of holding the whole file in memory; the timeout prevents a stalled
    # connection from hanging the cell forever.
    with requests.get(url=path, stream=True, timeout=60) as response:
        # Fail on HTTP errors rather than saving an error page as an .mpg file.
        response.raise_for_status()
        # The with-block guarantees the file is flushed and closed.
        with open(os.path.join(videos_dir, filename), 'wb') as video_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                video_file.write(chunk)
print('[INFO] All the video files are downloaded')

  0%|          | 0/52 [00:00<?, ?it/s]

[INFO] All the video files are downloaded


In [10]:
# Creating a temp directory that holds the downloaded archives
temp_dir = 'raw_data/temp'
os.makedirs(temp_dir, exist_ok=True)

# Downloading and extracting the images data
# Each archive is saved into temp_dir and then unpacked into images_dir.
# The archives are left in temp_dir after extraction.
for path in tqdm(image_url_list):
    filename = os.path.join(temp_dir, path.split('/')[-1])
    # stream=True + iter_content writes the archive to disk chunk by chunk
    # instead of holding it in memory; the timeout prevents a stalled
    # connection from hanging the cell forever.
    with requests.get(url=path, stream=True, timeout=60) as response:
        # Fail on HTTP errors rather than saving an error page as an archive.
        response.raise_for_status()
        # The archive must be fully written and closed before it is unpacked.
        with open(filename, 'wb') as archive_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                archive_file.write(chunk)
    # NOTE(review): unpack_archive trusts the member paths stored inside the
    # downloaded archive. Python 3.12+ accepts a `filter` argument (e.g.
    # filter='data') to reject unsafe members - consider it if the minimum
    # supported Python version allows.
    shutil.unpack_archive(filename, images_dir)
print('[INFO] All the Image files are downloaded and extracted.')

  0%|          | 0/52 [00:00<?, ?it/s]

[INFO] All the Image files are downloaded and extracted.


In [11]:
# Downloading mall image dataset
# Source archive and the folder it is unpacked into.
mall_data_url = 'http://personal.ie.cuhk.edu.hk/~ccloy/files/datasets/mall_dataset.zip'
raw_data_dir = 'raw_data'

# The response body is read fully into an in-memory buffer (ZipFile needs a
# seekable file object) and extracted from there; nothing is written to disk
# except the extracted contents.
with urlopen(mall_data_url) as remote, ZipFile(BytesIO(remote.read())) as mall_zip:
    mall_zip.extractall(path=raw_data_dir)
print('[INFO] All the mall data is downloaded and extracted.')

[INFO] All the mall data is downloaded and extracted.
