# Aggregate Functions 1

In [1]:
import pandas as pd 
import sqlite3


def create_df(cursor: sqlite3.Cursor) -> pd.DataFrame:
    """Collect the rows still pending on an executed cursor into a DataFrame.

    Parameters
    ----------
    cursor : sqlite3.Cursor
        A cursor on which a statement has already been executed.

    Returns
    -------
    pd.DataFrame
        One row per fetched record, with columns named after the first field
        of each entry in ``cursor.description``. When the statement produced
        no result set (``cursor.description`` is ``None``, e.g. after
        ``CREATE TABLE`` or ``INSERT``), an empty frame is returned.
    """
    # description is None for statements without a result set; iterating it
    # would raise TypeError, so short-circuit with an empty frame instead.
    if cursor.description is None:
        return pd.DataFrame()

    column_names = [description[0] for description in cursor.description]
    return pd.DataFrame(cursor.fetchall(), columns=column_names)


# One connection, reused by every cell below, to an in-memory SQLite database.
conn = sqlite3.connect(database=":memory:")

#### Prepare Database

In [2]:
# Build the `nomnom` table from scratch. Dropping any earlier copy first keeps
# this cell re-runnable: without it, a second run on the same connection fails
# with "table nomnom already exists". Note that re-running this cell discards
# previously inserted rows, so the load/insert cells must be run again after it.
conn.execute("DROP TABLE IF EXISTS nomnom;")

st = """
    CREATE TABLE nomnom (
        name TEXT,
        location TEXT,
        category TEXT,
        employees INTEGER,
        raised INTEGER,
        valuation INTEGER,
        founded INTEGER,
        stage TEXT,
        ceo TEXT,
        info TEXT
    );
"""
conn.execute(st)

<sqlite3.Cursor at 0x1ca4eb93940>

In [10]:
# Read the semicolon-separated startup data.
df = pd.read_csv("startups.csv", delimiter=";")

# Pull each column out as a plain Python list; the insert cell zips these
# lists back together into one row tuple per startup.
(
    name,
    location,
    category,
    employees,
    raised,
    valuation,
    founded,
    stage,
    ceo,
    info,
) = (
    df[column].tolist()
    for column in (
        "name",
        "location",
        "category",
        "employees",
        "raised",
        "valuation",
        "founded",
        "stage",
        "ceo",
        "info",
    )
)

In [None]:
# Bulk-insert one row per startup. The target columns are named explicitly so
# the statement does not silently depend on the column order used in the
# CREATE TABLE statement; the zip() argument order must match this list.
conn.executemany(
    """INSERT INTO nomnom
        (name, location, category, employees, raised,
         valuation, founded, stage, ceo, info)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    zip(name, location, category, employees, raised, valuation, founded, stage, ceo, info),
)

<sqlite3.Cursor at 0x1ca506a9fc0>

#### 1.

Getting started, take a look at the `startups` table (loaded in this notebook as `nomnom`, so substitute that name when running the query here):

```sql
SELECT *
FROM startups;
```

How many columns are there?

In [12]:
# Exercise 1: fetch every row and column of the table for a first look.
st = """ 
    SELECT * 
    FROM nomnom;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,name,location,category,employees,raised,valuation,founded,stage,ceo,info
0,Pied Piper,Silicon Valley,Cloud Computing,6.0,5000000.0,5.000000e+07,2014,A,Richard Hendricks,A Middle-Out Compression Solution
1,Hooli,Silicon Valley,Enterprise,9000.0,580000000.0,4.950000e+10,1997,,Gavin Bensen,Hooli Is About People
2,Raviga Capital,Silicon Valley,Venture Capital,12.0,300000000.0,3.000000e+09,2012,,Peter Gregory,Share Only In Success
3,Aviato,Silicon Valley,Travel,3.0,250000.0,2.500000e+06,2006,Acquired,Erlich Bachman,Software Aggregation Program
4,SEE FOOD,Silicon Valley,Mobile,2.0,,1.500000e+07,2016,Acquired,Jian-Yang,The Shazam of Food
...,...,...,...,...,...,...,...,...,...,...
65,YoBert,New York,Education,31.0,800000.0,4.000000e+07,2016,Acquired,Ernie Dzeda,Seasme Street Themed Educational App
66,Trufflez,New York,Mobile,23.0,1000000.0,1.200000e+07,2014,Seed,,AI Suggests Recipes Based on Photos
67,Unfade,New York,Fashion,4.0,400000.0,5.600000e+06,2016,A,Peggy Grimes,Everything Hair
68,OrangeYouLonely,Minneapolis,Social,20.0,1200000.0,1.400000e+07,2012,Seed,Gus Dawson,Dating App for Farmers


#### 2.

Calculate the total number of companies in the table.

In [21]:
# Exercise 2: COUNT(*) counts every row, i.e. every company in the table.
st = """ 
    SELECT COUNT(*) AS "Number of startups"
    FROM nomnom;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,Number of startups
0,70


#### 3.

We want to know the total value of all companies in this table.

Calculate this by getting the `SUM()` of the `valuation` column.

In [20]:
# Exercise 3: total valuation across all companies.
# The alias is double-quoted: in SQL, double quotes delimit identifiers while
# single quotes delimit string literals. SQLite accepts a single-quoted alias
# only as a compatibility quirk, and the other cells already use double quotes.
st = """
    SELECT SUM(valuation) AS "Total Valuation"
    FROM nomnom;
"""

create_df(conn.execute(st))

Unnamed: 0,Total Valuation
0,974455790000


#### 4.

What is the highest amount raised by a startup?

Return the maximum amount of money `raised`.

In [None]:
# Exercise 4: largest value in the `raised` column.
st = """ 
    SELECT MAX(raised) AS "Max Raised"
    FROM nomnom;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,Max Raised
0,11500000000


#### 5.

Edit the query so that it returns the maximum amount of money `raised` during the ‘Seed’ stage.

In [None]:
# Exercise 5: same MAX(raised), restricted to rows whose stage is 'Seed'.
st = """ 
    SELECT MAX(raised) AS "Max Raised"
    FROM nomnom
    WHERE stage = 'Seed';
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,Max Raised
0,1800000


#### 6.

In what year was the oldest company on the list founded?

In [26]:
# Exercise 6: the smallest `founded` year belongs to the oldest company.
st = """ 
    SELECT MIN(founded) AS "Oldest Startup Founded"
    FROM nomnom;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,Oldest Startup Founded
0,1994


#### 7.

Return the average `valuation`.

In [27]:
# Exercise 7: mean of the `valuation` column over the whole table.
st = """ 
    SELECT AVG(valuation) AS "Average Valuation"
    FROM nomnom;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,Average Valuation
0,15974690000.0


#### 8.


Return the average `valuation`, in each `category`.

In [28]:
# Exercise 8: one row per category, each with that category's mean valuation.
st = """ 
    SELECT category, AVG(valuation) AS "Average Valuation"
    FROM nomnom
    GROUP BY category;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,category,Average Valuation
0,,4290000.0
1,Algorithms,7600000.0
2,Augmented Reality,8000000000.0
3,Big Data Analytics,15000000.0
4,Cloud Computing,95000000.0
5,Customer Service,640000000.0
6,Data Analytics,
7,E-commerce,60250000.0
8,Education,2023800000.0
9,Enterprise,38508330000.0


#### 9.

Return the average `valuation`, in each `category`.

Round the averages to two decimal places.

In [29]:
# Exercise 9: per-category mean valuation, rounded to two decimal places.
st = """ 
    SELECT category, ROUND(AVG(valuation), 2) AS "Average Valuation"
    FROM nomnom
    GROUP BY category;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,category,Average Valuation
0,,4290000.0
1,Algorithms,7600000.0
2,Augmented Reality,8000000000.0
3,Big Data Analytics,15000000.0
4,Cloud Computing,95000000.0
5,Customer Service,640000000.0
6,Data Analytics,
7,E-commerce,60250000.0
8,Education,2023800000.0
9,Enterprise,38508330000.0


#### 10.

Return the average `valuation`, in each `category`.

Round the averages to two decimal places.

Lastly, order the list from highest averages to lowest.

In [32]:
# Exercise 10: rounded per-category mean valuation, highest first.
# GROUP BY 1 / ORDER BY 2 refer to the select-list positions:
# 1 is `category`, 2 is the rounded average.
st = """ 
    SELECT category, ROUND(AVG(valuation), 2) AS "Average Valuation"
    FROM nomnom
    GROUP BY 1
    ORDER BY 2 DESC;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,category,Average Valuation
0,Health Care,380490000000.0
1,Enterprise,38508330000.0
2,Real Estate,20000000000.0
3,Travel,12501250000.0
4,Augmented Reality,8000000000.0
5,Security,6333333000.0
6,Technology,3100000000.0
7,Venture Capital,3000000000.0
8,Education,2023800000.0
9,Customer Service,640000000.0


#### 11.


First, return the name of each `category` with the total number of companies that belong to it.

In [34]:
# Exercise 11: number of companies in each category
# (GROUP BY 1 groups on the first selected column, `category`).
st = """ 
    SELECT category, COUNT(*) AS "Number of Startups"
    FROM nomnom
    GROUP BY 1;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,category,Number of Startups
0,,3
1,Algorithms,1
2,Augmented Reality,1
3,Big Data Analytics,1
4,Cloud Computing,2
5,Customer Service,1
6,Data Analytics,1
7,E-commerce,3
8,Education,5
9,Enterprise,3


#### 12.

Next, filter the result to only include categories that have more than three companies in them.

What are the most competitive markets?

In [35]:
# Exercise 12: HAVING filters the grouped rows, keeping only categories
# that contain more than three companies.
st = """ 
    SELECT category, COUNT(*) AS "Number of Startups"
    FROM nomnom
    GROUP BY 1
    HAVING COUNT(*) > 3;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,category,Number of Startups
0,Education,5
1,Mobile,10
2,Social,12


#### 13.

What is the average size of a startup in each `location`?

In [37]:
# Exercise 13: mean employee count per location
# (GROUP BY 1 groups on the first selected column, `location`).
st = """ 
    SELECT location, AVG(employees) AS "Average Employees"
    FROM nomnom
    GROUP BY 1;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,location,Average Employees
0,Atlanta,3.0
1,Boulder,3.0
2,Brooklyn,502.666667
3,Chicago,12.0
4,Columbus,2.0
5,Denver,12.0
6,Fort Lauderdale,500.0
7,Irvine,2.0
8,Long Island,5.0
9,Los Angeles,6.833333


#### 14.

What is the average size of a startup in each `location`, keeping only the locations whose average size is above 500?

In [38]:
# Exercise 14: mean employee count per location, keeping only the
# locations whose mean exceeds 500.
st = """ 
    SELECT location, AVG(employees) AS "Average Employees"
    FROM nomnom
    GROUP BY 1
    HAVING AVG(employees) > 500;
"""

cursor = conn.execute(st)
create_df(cursor)

Unnamed: 0,location,Average Employees
0,Brooklyn,502.666667
1,New York,702.75
2,San Francisco,1920.4
3,Silicon Valley,1804.6
