# Part 1 -- Intro

Load the tips dataset from seaborn

In [1]:
import seaborn as sns

# Load seaborn's "tips" example dataset into a DataFrame.
tips = sns.load_dataset('tips')
# Last expression in the cell, so the notebook renders the first five rows.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### Filter rows by `smoker == 'No'` and `total_bill >= 10`

In [6]:
# Option 1: boolean mask with plain [] indexing.
# Each comparison needs its own parentheses because & binds tighter than == and >=.
# Only the last expression of a cell is rendered, so this result is not displayed.
tips[(tips['smoker'] == 'No') & (tips['total_bill'] >=10)]

# Option 2: the same boolean mask passed to .loc; it selects the same rows.

tips.loc[(tips['smoker'] == 'No') & (tips['total_bill'] >= 10)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2


### What is the average `total_bill` for each combination of `smoker`, `day`, and `time`?

In [7]:
# Mean total_bill for every (smoker, day, time) group.
# The group keys are categorical (note the Yes/No and Thur..Sun ordering in the
# result), so pass observed=True to keep only combinations that actually occur
# in the data. Without it, pandas builds the full cross-product of categories
# (NaN rows for e.g. Sat/Lunch) and newer versions emit a FutureWarning.
tips.groupby(['smoker', 'day', 'time'], observed=True)['total_bill'].mean()

smoker  day   time  
Yes     Thur  Lunch     19.190588
        Fri   Lunch     12.323333
              Dinner    19.806667
        Sat   Dinner    21.276667
        Sun   Dinner    24.120000
No      Thur  Lunch     17.075227
              Dinner    18.780000
        Fri   Lunch     15.980000
              Dinner    19.233333
        Sat   Dinner    19.661778
        Sun   Dinner    20.506667
Name: total_bill, dtype: float64

In [28]:
# Show tips without the 'smoker' column. drop() returns a new DataFrame, so
# `tips` itself is left unchanged.
# Name the axis with columns=: the positional form drop('smoker', 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onward.
tips.drop(columns='smoker')

Unnamed: 0,total_bill,tip,sex,day,time,size
0,16.99,1.01,Female,Sun,Dinner,2
1,10.34,1.66,Male,Sun,Dinner,3
2,21.01,3.50,Male,Sun,Dinner,3
3,23.68,3.31,Male,Sun,Dinner,2
4,24.59,3.61,Female,Sun,Dinner,4
5,25.29,4.71,Male,Sun,Dinner,4
6,8.77,2.00,Male,Sun,Dinner,2
7,26.88,3.12,Male,Sun,Dinner,4
8,15.04,1.96,Male,Sun,Dinner,2
9,14.78,3.23,Male,Sun,Dinner,2


# Part 2 -- Tidy

Taken from the r4ds "Tidy Data" chapter: https://r4ds.had.co.nz/tidy-data.html

In [8]:
import pandas as pd

# Read the three example tables; the paths are relative to the notebook's
# working directory.
tbl1, tbl2, tbl3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/table1.csv',
        '../data/table2.csv',
        '../data/table3.csv',
    )
)

In [10]:
tbl1.head()

Unnamed: 0,country,year,cases,population
0,Afghanistan,1999,745,19987071
1,Afghanistan,2000,2666,20595360
2,Brazil,1999,37737,172006362
3,Brazil,2000,80488,174504898
4,China,1999,212258,1272915272


In [11]:
tbl2.head() 

Unnamed: 0,country,year,type,count
0,Afghanistan,1999,cases,745
1,Afghanistan,1999,population,19987071
2,Afghanistan,2000,cases,2666
3,Afghanistan,2000,population,20595360
4,Brazil,1999,cases,37737


In [12]:
tbl3.head()

Unnamed: 0,country,year,rate
0,Afghanistan,1999,745/19987071
1,Afghanistan,2000,2666/20595360
2,Brazil,1999,37737/172006362
3,Brazil,2000,80488/174504898
4,China,1999,212258/1272915272


### Tidy the `tbl2` dataset

In [13]:
tbl2.head()

Unnamed: 0,country,year,type,count
0,Afghanistan,1999,cases,745
1,Afghanistan,1999,population,19987071
2,Afghanistan,2000,cases,2666
3,Afghanistan,2000,population,20595360
4,Brazil,1999,cases,37737


In [16]:
# Spread the values of `type` (cases / population) into their own columns,
# one row per (country, year), then turn the country/year index back into
# ordinary columns.
(
    tbl2
    .pivot_table(values='count', index=['country', 'year'], columns='type')
    .reset_index()
)

type,country,year,cases,population
0,Afghanistan,1999,745,19987071
1,Afghanistan,2000,2666,20595360
2,Brazil,1999,37737,172006362
3,Brazil,2000,80488,174504898
4,China,1999,212258,1272915272
5,China,2000,213766,1280428583


### Tidy the `tbl3` dataset

In [17]:
tbl3.head()

Unnamed: 0,country,year,rate
0,Afghanistan,1999,745/19987071
1,Afghanistan,2000,2666/20595360
2,Brazil,1999,37737/172006362
3,Brazil,2000,80488/174504898
4,China,1999,212258/1272915272


In [21]:
# Split each "cases/population" string on "/" into separate columns and keep
# the second piece (column 1) as the population. The values remain strings.
tbl3['population'] = (
    tbl3['rate']
    .str.split("/", expand=True)
    .get(1)
)

In [22]:
tbl3.head()

Unnamed: 0,country,year,rate,population
0,Afghanistan,1999,745/19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360
2,Brazil,1999,37737/172006362,172006362
3,Brazil,2000,80488/174504898,174504898
4,China,1999,212258/1272915272,1272915272


# Part 3 -- Apply functions

Look at the `tbl3` dataset (loaded from `table3.csv`)

In [29]:
tbl3.head()

Unnamed: 0,country,year,rate,population
0,Afghanistan,1999,745/19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360
2,Brazil,1999,37737/172006362,172006362
3,Brazil,2000,80488/174504898,174504898
4,China,1999,212258/1272915272,1272915272


### Write a function that takes a value of `rate` and parses out the total population.

In [30]:
def getPop(rate):
    """Return the population part of a ``"cases/population"`` rate string.

    Parameters
    ----------
    rate : str
        Text such as ``"745/19987071"``.

    Returns
    -------
    str
        The second ``/``-separated field, returned as text (not converted
        to a number).

    Raises
    ------
    IndexError
        If ``rate`` contains no ``/``.
    """
    pieces = rate.split('/')
    return pieces[1]

In [31]:
tbl3['rate'].apply(getPop)

0      19987071
1      20595360
2     172006362
3     174504898
4    1272915272
5    1280428583
Name: rate, dtype: object

### Set the population to a new column

In [32]:
# Store the parsed population on tbl3, overwriting the 'population' column
# that was assigned earlier in the tidy section.
# getPop returns the text after "/", so this column holds strings
# (dtype object), not integers.
tbl3['population'] = tbl3['rate'].apply(getPop)

In [33]:
tbl3.head()

Unnamed: 0,country,year,rate,population
0,Afghanistan,1999,745/19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360
2,Brazil,1999,37737/172006362,172006362
3,Brazil,2000,80488/174504898,174504898
4,China,1999,212258/1272915272,1272915272


# Part 4 -- Plots

### Using seaborn's `titanic` dataset, create a figure with 2 axes
### distplot of `fare` in one axes
### boxplot of `class` and `fare` on the other axes

# Part 5 -- Models

### Subset `survived`, `class`, `who`

### Create dummy encoded dataset

### Fit a logistic regression on `survived`