# Data Acquisition

### Introduction

### Example from a public dataset

In [3]:
# Import pandas with alias
import pandas as pd
 
# Assign the dataset url as a variable
url = "https://raw.githubusercontent.com/shrikant-temburwar/Iris-Dataset/master/Iris.csv"
 
# Define the column names of dataset as a list
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
 
# Use read_csv to read in data as a pandas dataframe
df = pd.read_csv(url, names=columns)
 
# Check head of dataframe
print(df.head())

     sepal_length   sepal_width   petal_length   petal_width        class
Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
1             5.1           3.5            1.4           0.2  Iris-setosa
2             4.9           3.0            1.4           0.2  Iris-setosa
3             4.7           3.2            1.3           0.2  Iris-setosa
4             4.6           3.1            1.5           0.2  Iris-setosa


### Webscraping Example

In [4]:
# Import libraries 
import pandas as pd
from bs4 import BeautifulSoup
import requests
 
# Assign URL to variable
url = "https://www.codecademy.com/"
 
# Send request to download the data from URL
response = requests.request("GET", url)
 
# Create BeautifulSoup object
# Use HTML parser to parse the page's text
data = BeautifulSoup(response.text, 'html.parser')
 
# Print the first header of the page
print(data.html.h1)
 
# Instantiate list to append some content
content = []
 
# Use BeautifulSoup's find_all method to find all paragraph tags
words = data.find_all('p')
 
# Iterate through all paragraph tags
# append text to list with for loop
for word in words:
    content.append(word.text)
 
# Check content
print(content)
 
# Create dataframe of content with pandas DataFrame method
df = pd.DataFrame(content, columns= ['Text'])
 
# Check scraped dataframe
print(df)

<h1>Join the Millions <br/> Learning to Code <br/> with Codecademy</h1>
["By signing up for Codecademy, you agree to Codecademy's Terms of Service & Privacy Policy.", 'No need to worry, we’ll help you make sense of it all.', "From building websites to analyzing data, the choice is yours. Not sure where to start? We'll point you in the right direction.", "No matter your experience level, you'll be writing real, working code in minutes.", "Your code is tested as soon as you submit it, so you always know if you're on the right track.", 'Apply your learning with real-world projects and test your knowledge with tailor-made quizzes.', 'Coding skills have never been more in-demand. Learn everything you need to take your career to the next level.']
                                                Text
0  By signing up for Codecademy, you agree to Cod...
1  No need to worry, we’ll help you make sense of...
2  From building websites to analyzing data, the ...
3  No matter your experience level, y

### SQL

### Manipulation

**Create**
***
`CREATE TABLE celebs (
   id INTEGER, 
   name TEXT, 
   age INTEGER
);`
***

**Insert**
***
`INSERT INTO celebs (id, name, age) 
VALUES (1, 'Justin Bieber', 22);`
***

**Select**
***
`SELECT * FROM celebs;`
`SELECT name FROM celebs;`
***

**Alter**
***
`ALTER TABLE celebs 
ADD COLUMN twitter_handle TEXT;`
***

**Update**
***
`UPDATE celebs 
SET twitter_handle = '@taylorswift13' 
WHERE id = 4; `
***

**Delete**
***
`DELETE FROM celebs 
WHERE twitter_handle IS NULL;`
***

**Constraints**
***
`CREATE TABLE celebs (
   id INTEGER PRIMARY KEY, 
   name TEXT UNIQUE,
   date_of_birth TEXT NOT NULL,
   date_of_death TEXT DEFAULT 'Not Applicable'
);`

### Queries

**Select**
***
`SELECT column1, column2 
FROM table_name;`
***

**As (alias)**
***
`SELECT name AS 'Titles'
FROM movies;`
***

**Distinct (unique values)**
***
`SELECT DISTINCT tools 
FROM inventory;`
***

**Where**
***
`SELECT *
FROM movies
WHERE imdb_rating > 8;`
***

**Like (wildcards)**
***
`SELECT * 
FROM movies
WHERE name LIKE 'Se_en';`

`SELECT * 
FROM movies
WHERE name LIKE 'A%';`

`SELECT * 
FROM movies 
WHERE name LIKE '%man%';`
***

**Is null**
***
`SELECT name
FROM movies 
WHERE imdb_rating IS NOT NULL;`
***

**Between**
***
`SELECT *
FROM movies
WHERE year BETWEEN 1990 AND 1999;`
***

**And**
***
`SELECT * 
FROM movies
WHERE year BETWEEN 1990 AND 1999
   AND genre = 'romance';`
***

**Or**
***
`SELECT *
FROM movies
WHERE year > 2014
   OR genre = 'action';`
***

**Order By**
***
`SELECT *
FROM movies
ORDER BY name;`

`SELECT *
FROM movies
WHERE imdb_rating > 8
ORDER BY year DESC;`
***

**Limit**
***
`SELECT *
FROM movies
LIMIT 10`
***

**Case**
***
`SELECT name,
 CASE
  WHEN imdb_rating > 8 THEN 'Fantastic'
  WHEN imdb_rating > 6 THEN 'Poorly Received'
  ELSE 'Avoid at All Costs'
 END
FROM movies;`

`SELECT name,
 CASE
  WHEN imdb_rating > 8 THEN 'Fantastic'
  WHEN imdb_rating > 6 THEN 'Poorly Received'
  ELSE 'Avoid at All Costs'
 END AS 'Review'
FROM movies;`
***


### Functions

**MAX()**
***
`SELECT MAX(amount) 
FROM transactions;`
***

**MIN()**
***
`SELECT MIN(amount) 
FROM transactions;`
***

**SUM()**
***
`SELECT SUM(salary)
FROM salary_disbursement;`
***

**COUNT()**
***
`SELECT COUNT(*)
FROM employees
WHERE experience < 5;`
***

**AVG()**
***
`SELECT AVG(salary)
FROM employees
WHERE experience < 5;`
***

**ROUND()**
***
`SELECT year, 
   ROUND(AVG(rating), 2) 
FROM movies 
WHERE year = 2015;`
***

**Column Reference**
***
`SELECT COUNT(*) AS 'total_movies', 
   rating 
FROM movies 
GROUP BY 2 
ORDER BY 1;`
***

**Group by clause**
***
`SELECT rating, 
   COUNT(*) 
FROM movies 
GROUP BY rating;`
***

**Having by clause**
***
`SELECT year, 
   COUNT(*) 
FROM movies 
GROUP BY year
HAVING COUNT(*) > 5;`
***

### Multiple Tables

**Outer Join**
Combine rows from different tables even if the join condition is not met.
***
`SELECT column_name(s)
FROM table1
LEFT JOIN table2
  ON table1.column_name = table2.column_name;`
***

**Inner Join**
The return of results from more than one table by joining them together with other results based on common column values specified using an ON clause
***
`SELECT * 
FROM books
JOIN authors
  ON books.author_id = authors.id;`
***

**With Clause**
***
`WITH temporary_movies AS (
   SELECT *
   FROM movies
)
SELECT *
FROM temporary_movies
WHERE year BETWEEN 2000 AND 2020;`
***

**Union Clause**
Combine results that appear from multiple SELECT statements and filter duplicates.
***
`SELECT name
FROM first_names
UNION
SELECT name
FROM last_names`
***

**Cross Join Clause**
***
`SELECT shirts.shirt_color,
   pants.pants_color
FROM shirts
CROSS JOIN pants;`
***
