# 1. Procedure

Example of gathering an image using a webcam

In [None]:
import cv2
from google.colab.patches import cv2_imshow # I modified this and replaced .imshow to _imshow since .imshow is disabled
# Webcam capture loop.
# Frames are read from the default camera and displayed until the user presses
#   's' -> save the current frame, then write a 28x28 grayscale copy, or
#   'q' -> quit without saving.
# NOTE(review): cv2.waitKey normally reports key presses made in an OpenCV
# window; since frames are shown with cv2_imshow instead, confirm that the
# 's'/'q' keys are actually received in this environment.

def _turn_off_camera(capture):
  """Release the capture device and close OpenCV windows, announcing each step."""
  print("Turning off camera.")
  capture.release()
  print("Camera off.")
  print("Program ended.")
  cv2.destroyAllWindows()


key = cv2.waitKey(1)
webcam = cv2.VideoCapture(0)
try:
  while True:
    check, frame = webcam.read()
    print(check) #prints True as long as the webcam is delivering frames
    if not check:
      # read() failed, so there is no usable frame; stop here instead of
      # handing an invalid frame to cv2_imshow on every iteration.
      print("Could not read a frame from the webcam.")
      break
    print(frame) #prints matrix values of each frame
    cv2_imshow(frame)
    key = cv2.waitKey(1)
    if key == ord('s'):
      cv2.imwrite(filename='saved_img.jpg', img=frame)
      webcam.release()
      img_new = cv2.imread('saved_img.jpg', cv2.IMREAD_GRAYSCALE)
      # Use cv2_imshow here as well: cv2.imshow is disabled in this
      # environment, and its return value must not overwrite img_new.
      cv2_imshow(img_new)
      cv2.destroyAllWindows()
      print("Processing image...")
      img_ = cv2.imread('saved_img.jpg', cv2.IMREAD_ANYCOLOR)
      print("Converting RGB image to grayscale...")
      gray = cv2.cvtColor(img_, cv2.COLOR_BGR2GRAY)
      print("Converted RGB image to grayscale...")
      print("Resizing image to 28x28 scale...")
      img_ = cv2.resize(gray,(28,28))
      print("Resized...")
      # imwrite returns a success flag, which is what img_resized holds.
      img_resized = cv2.imwrite(filename='saved_img-final.jpg', img=img_)
      print("Image saved!")
      break

    elif key == ord('q'):
      _turn_off_camera(webcam)
      break

except KeyboardInterrupt:
  # Interrupting the cell shuts down the camera the same way 'q' does.
  _turn_off_camera(webcam)
finally:
  # Safety net: release the device on any exit path that has not done so
  # already (failed read, unexpected exception).
  if webcam.isOpened():
    webcam.release()

Example of gathering voice data using a microphone

In [None]:
# Install the sounddevice package (imported later as `sd`)
!pip3 install sounddevice

In [None]:
# Install the wavio package (imported later as `wv`)
!pip3 install wavio

In [None]:
!pip3 install scripy

In [None]:
!apt-get install libportaudio2

In [None]:
# import required libraries
import sounddevice as sd
from scipy.io.wavfile import write
import wavio as wv

# Recording parameters: sampling frequency and clip length
freq = 44100
duration = 5

# Number of frames to capture for the requested duration
frame_count = int(duration * freq)

# Start a two-channel recording with the given
# frame count and sampling frequency
recording = sd.rec(frame_count, samplerate=freq, channels=2)

# Wait here until the recording has finished
sd.wait()

# Save the recorded NumPy array as an audio file twice:
# first with scipy.io.wavfile.write ...
write("recording0.wav", freq, recording)

# ... then with wavio, using a sample width of 2
wv.write("recording1.wav", recording, freq, sampwidth=2)

Image Scraping using BeautifulSoup

In [1]:
from bs4 import BeautifulSoup

In [2]:
from requests import get

In [4]:
# Download the Google home page and print the `src` of each <img> tag.
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
response = get("https://www.google.com/", timeout=10)
response.raise_for_status()
htmldata = response.text
soup = BeautifulSoup(htmldata, 'html.parser')
# src=True keeps only <img> tags that have a src attribute, so item['src']
# cannot raise KeyError on an image without one.
for item in soup.find_all('img', src=True):
    print(item['src'])

/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png


Scraping a web page using BeautifulSoup

In [33]:
# Request the IMDb title search for 2017 releases, sorted by number of votes,
# and show the first 500 characters of whatever the server sent back.
url = 'https://m.imdb.com/search/title/?release_date=2017-01-01,2017-12-31&sort=num_votes,desc'
response = get(url)
body_preview = response.text[:500]
print(body_preview)

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
</body>
</html>



In [None]:
# The request returned HTTP 403 Forbidden.
# IMDb apparently no longer allows simple scraping like this.
# Instead, I will review the given sample and cite external references about web scraping.

# 2. Scrape a web page of your own

In [6]:
# I am obtaining a dataset from Wikipedia.
# It is a list of prison escapes (and escape attempts) that used a helicopter.
from bs4 import BeautifulSoup
import requests
# Download the Wikipedia page and parse it.
# NOTE: despite its name, `url` holds the Response object, not the address.
url = requests.get('https://en.wikipedia.org/wiki/List_of_helicopter_prison_escapes', timeout=30)
# Stop on an HTTP error instead of parsing an error page.
url.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers are installed (same parser as the Google example above).
parsed = BeautifulSoup(url.text, 'html.parser')

In [7]:
# Extract the element needed to build the dataframe:
# of all <table> elements on the page, take the one at index 1
all_tables = parsed.find_all('table')
table = all_tables[1]

In [8]:
# Collect the header cells (<th>) and the row elements (<tr>) of the selected table
headers, rows = table.find_all('th'), table.find_all('tr')

In [9]:
# Making the dataframe
import pandas as pd

In [10]:
# Turn the header cells into a list of stripped text labels,
# then use that list as the columns of an empty dataframe
column_labels = [header_cell.text.strip() for header_cell in headers]
escape_helicopter = pd.DataFrame(columns = column_labels)

In [11]:
# Add the rows into the dataframe.
# All records are collected first and the frame is built in one step, so
# re-running this cell rebuilds the same frame instead of appending every row
# a second time.
column_labels = list(escape_helicopter.columns)
records = []
for row in rows[1:]:  # the first <tr> is skipped, as it is not a data row
  cells = [cell.text.strip() for cell in row.find_all('td')]
  if cells:  # ignore rows that contain no <td> cells at all
    records.append(cells)
escape_helicopter = pd.DataFrame(records, columns = column_labels)

In [12]:
# Preview the first rows of the scraped table
escape_helicopter.head()

Unnamed: 0,Date,Prison name,Country,Succeeded,Escapee(s),Details
0,"August 19, 1971",Santa Martha Acatitla,Mexico,Yes,Joel David Kaplan\nCarlos Antonio Contreras Ca...,Kaplan was a New York businessman who had been...
1,"October 31, 1973","Mountjoy Jail, Dublin",Ireland,Yes,JB O'HaganSeamus TwomeyKevin Mallon,An IRA member hijacked a helicopter and forced...
2,"May 24, 1978","United States Penitentiary, Marion, Illinois",United States,No,Garrett Brock TrapnellMartin Joseph McNallyJam...,43-year-old Barbara Ann Oswald hijacked a Sain...
3,"February 27, 1981","Fleury-Mérogis, Essonne, Ile de France",France,Yes,Gérard DupréDaniel Beaumont,"With the help of Serge Coutel, Dupré and Beaum..."
4,"May 7, 1981","Orsainville Prison, Quebec City",Canada,No,Marina Paquet (hijacker)Giles Arseneault (pris...,Paquet held a sawed-off shotgun against the ba...


In [13]:
# Cleaning: start by inspecting the column dtypes and non-null counts
escape_helicopter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         50 non-null     object
 1   Prison name  50 non-null     object
 2   Country      50 non-null     object
 3   Succeeded    50 non-null     object
 4   Escapee(s)   50 non-null     object
 5   Details      50 non-null     object
dtypes: object(6)
memory usage: 2.7+ KB


In [14]:
# Make date a proper datetime.
# Without an explicit format this attempt fails, because the date strings do
# not all share one layout (e.g. "January, 1983" has no day). The error is
# caught and printed so the notebook can keep running; the next cell holds the fix.
try:
  escape_helicopter.Date = pd.to_datetime(escape_helicopter.Date)
except ValueError as err:
  print(f"ValueError: {err}")

ValueError: time data "January, 1983" doesn't match format "%B %d, %Y", at position 5. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [15]:
# The date strings do not all follow the same layout,
# so let pandas infer the format of each entry individually
date_strings = escape_helicopter['Date']
escape_helicopter['Date'] = pd.to_datetime(date_strings, format = 'mixed')

In [16]:
# Preview the rows again after the date conversion
escape_helicopter.head()

Unnamed: 0,Date,Prison name,Country,Succeeded,Escapee(s),Details
0,1971-08-19,Santa Martha Acatitla,Mexico,Yes,Joel David Kaplan\nCarlos Antonio Contreras Ca...,Kaplan was a New York businessman who had been...
1,1973-10-31,"Mountjoy Jail, Dublin",Ireland,Yes,JB O'HaganSeamus TwomeyKevin Mallon,An IRA member hijacked a helicopter and forced...
2,1978-05-24,"United States Penitentiary, Marion, Illinois",United States,No,Garrett Brock TrapnellMartin Joseph McNallyJam...,43-year-old Barbara Ann Oswald hijacked a Sain...
3,1981-02-27,"Fleury-Mérogis, Essonne, Ile de France",France,Yes,Gérard DupréDaniel Beaumont,"With the help of Serge Coutel, Dupré and Beaum..."
4,1981-05-07,"Orsainville Prison, Quebec City",Canada,No,Marina Paquet (hijacker)Giles Arseneault (pris...,Paquet held a sawed-off shotgun against the ba...


In [17]:
# Re-check the dtypes to confirm the Date column is now datetime
escape_helicopter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         50 non-null     datetime64[ns]
 1   Prison name  50 non-null     object        
 2   Country      50 non-null     object        
 3   Succeeded    50 non-null     object        
 4   Escapee(s)   50 non-null     object        
 5   Details      50 non-null     object        
dtypes: datetime64[ns](1), object(5)
memory usage: 2.7+ KB


In [18]:
# Check for duplicates: count the rows flagged as repeats of another row
escape_helicopter.duplicated().sum()

np.int64(0)

In [19]:
# For convenience, shorten every column label to its first word in lowercase
# (so two-word column names become a single word)
def _first_word_lower(label):
  """Return the first whitespace-separated word of ``label``, lowercased."""
  return label.lower().split()[0]

escape_helicopter.rename(columns = _first_word_lower, inplace = True)

In [20]:
# Confirm the new column labels
escape_helicopter.columns

Index(['date', 'prison', 'country', 'succeeded', 'escapee(s)', 'details'], dtype='object')

In [21]:
# Preview the rows under the renamed columns
escape_helicopter.head()

Unnamed: 0,date,prison,country,succeeded,escapee(s),details
0,1971-08-19,Santa Martha Acatitla,Mexico,Yes,Joel David Kaplan\nCarlos Antonio Contreras Ca...,Kaplan was a New York businessman who had been...
1,1973-10-31,"Mountjoy Jail, Dublin",Ireland,Yes,JB O'HaganSeamus TwomeyKevin Mallon,An IRA member hijacked a helicopter and forced...
2,1978-05-24,"United States Penitentiary, Marion, Illinois",United States,No,Garrett Brock TrapnellMartin Joseph McNallyJam...,43-year-old Barbara Ann Oswald hijacked a Sain...
3,1981-02-27,"Fleury-Mérogis, Essonne, Ile de France",France,Yes,Gérard DupréDaniel Beaumont,"With the help of Serge Coutel, Dupré and Beaum..."
4,1981-05-07,"Orsainville Prison, Quebec City",Canada,No,Marina Paquet (hijacker)Giles Arseneault (pris...,Paquet held a sawed-off shotgun against the ba...


In [22]:
# The free-text details are probably not needed, so remove that column
escape_helicopter.drop(labels = 'details', axis = 'columns', inplace = True)

In [23]:
# Confirm that the details column is gone
escape_helicopter.columns

Index(['date', 'prison', 'country', 'succeeded', 'escapee(s)'], dtype='object')

In [24]:
# Summarize the remaining columns and their dtypes
escape_helicopter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        50 non-null     datetime64[ns]
 1   prison      50 non-null     object        
 2   country     50 non-null     object        
 3   succeeded   50 non-null     object        
 4   escapee(s)  50 non-null     object        
dtypes: datetime64[ns](1), object(4)
memory usage: 2.3+ KB


In [25]:
# Display the transformed dataset to check the result of the cleaning steps
escape_helicopter

Unnamed: 0,date,prison,country,succeeded,escapee(s)
0,1971-08-19,Santa Martha Acatitla,Mexico,Yes,Joel David Kaplan\nCarlos Antonio Contreras Ca...
1,1973-10-31,"Mountjoy Jail, Dublin",Ireland,Yes,JB O'HaganSeamus TwomeyKevin Mallon
2,1978-05-24,"United States Penitentiary, Marion, Illinois",United States,No,Garrett Brock TrapnellMartin Joseph McNallyJam...
3,1981-02-27,"Fleury-Mérogis, Essonne, Ile de France",France,Yes,Gérard DupréDaniel Beaumont
4,1981-05-07,"Orsainville Prison, Quebec City",Canada,No,Marina Paquet (hijacker)Giles Arseneault (pris...
5,1983-01-01,Pentridge (HM Prison),Australia,No,David McMillan
6,1985-12-19,"Perry Correctional Institution, Pelzer, South ...",United States,Yes,James Rodney LeonardWilliam Douglas BallewJess...
7,1985-12-31,"Cândido Mendes penitentiary, Ilha Grande, Rio ...",Brazil,Yes,"José Carlos dos Reis Encina, a.k.a. ""Escadinha"""
8,1986-05-26,Prison de la Santé,France,Yes,Michel Vaujour
9,1986-11-05,"Federal Correctional Institution, Dublin, Cali...",United States,Yes,Samantha Lopez


In [26]:
# Write the dataframe to a CSV file, leaving out the index column
csv_path = 'helicopter_prison_escape.csv'
escape_helicopter.to_csv(csv_path, index = False)

In [27]:
# Load the csv to check.
# CSV stores dates as plain text, so ask pandas to parse the 'date' column;
# otherwise it would come back as strings rather than datetimes.
df = pd.read_csv('helicopter_prison_escape.csv', parse_dates = ['date'])

In [28]:
# Preview the first rows of the reloaded data
df.head()

Unnamed: 0,date,prison,country,succeeded,escapee(s)
0,1971-08-19,Santa Martha Acatitla,Mexico,Yes,Joel David Kaplan\nCarlos Antonio Contreras Ca...
1,1973-10-31,"Mountjoy Jail, Dublin",Ireland,Yes,JB O'HaganSeamus TwomeyKevin Mallon
2,1978-05-24,"United States Penitentiary, Marion, Illinois",United States,No,Garrett Brock TrapnellMartin Joseph McNallyJam...
3,1981-02-27,"Fleury-Mérogis, Essonne, Ile de France",France,Yes,Gérard DupréDaniel Beaumont
4,1981-05-07,"Orsainville Prison, Quebec City",Canada,No,Marina Paquet (hijacker)Giles Arseneault (pris...
