## Exercises

In [1]:
import pandas as pd
import numpy as np

### 1. Attendance Data
Load the attendance.csv file and calculate an attendace percentage for each student.

One half day is worth 50% of a full day, and 10 tardies is equal to one absence.

You should end up with this:

| student | score  |
| ------- | -----  |
| Billy   | 0.2250 |
| Jane    | 0.1875 |
| John    | 0.7125 |
| Sally   | 0.3625 |

In [2]:
att = pd.read_csv('untidy_data/attendance.csv')

In [3]:
att.head(2)

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T


In [4]:
att = att.melt(id_vars=['Unnamed: 0'], var_name='date', value_name='score')

In [5]:
att.head(2)

Unnamed: 0.1,Unnamed: 0,date,score
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A


In [6]:
att.rename(columns={'Unnamed: 0':'student'}, inplace=True)

In [7]:
att.head(2)

Unnamed: 0,student,date,score
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A


In [8]:
att.loc[att.score == 'P', 'score'] = 1.0
att.loc[att.score == 'H', 'score'] = 0.5
att.loc[att.score == 'T', 'score'] = 0.9
att.loc[att.score == 'A', 'score'] = 0.0

In [9]:
att.head(2)

Unnamed: 0,student,date,score
0,Sally,2018-01-01,1
1,Jane,2018-01-01,0


In [10]:
att.columns

Index(['student', 'date', 'score'], dtype='object')

In [11]:
att.score = att.score.astype(float)

In [12]:
att.head(2)

Unnamed: 0,student,date,score
0,Sally,2018-01-01,1.0
1,Jane,2018-01-01,0.0


In [13]:
att = att.groupby(['student']).score.agg('mean')

In [14]:
att

student
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: score, dtype: float64

### 2. Coffee Levels

    a. Read the coffee_levels.csv file.
    
    b. Transform the data so that each carafe is in it's own column.
    
    c. Is this the best shape for the data?

In [15]:
stu_coffee = pd.read_csv('coffee_levels.csv')

In [16]:
stu_coffee.sample(10)

Unnamed: 0,hour,coffee_carafe,coffee_amount
11,9,y,0.521502
29,17,z,0.436677
3,11,x,0.335533
15,13,y,0.997464
10,8,y,0.189297
26,14,z,0.864464
8,16,x,0.183891
24,12,z,0.771947
21,9,z,0.91599
19,17,y,0.594126


In [17]:
stu_coffee.pivot_table(index='hour', columns='coffee_carafe')
# stu_coffee.pivot_table('coffee_amount', 'hour', 'coffee_carafe')

Unnamed: 0_level_0,coffee_amount,coffee_amount,coffee_amount
coffee_carafe,x,y,z
hour,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
8,0.816164,0.189297,0.999264
9,0.451018,0.521502,0.91599
10,0.843279,0.023163,0.144928
11,0.335533,0.235529,0.311495
12,0.898291,0.017009,0.771947
13,0.310711,0.997464,0.39852
14,0.507288,0.058361,0.864464
15,0.215043,0.144644,0.436364
16,0.183891,0.544676,0.280621
17,0.39156,0.594126,0.436677


- since all the units of measurement appear to be the same, don't split it out into separate columns, and instead just leave in one column

### 3. Cake Recipes

    a. Read the cake_recipes.csv data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.
    
    b. Tidy the data as necessary.
    
    c. Which recipe, on average, is the best? recipe b
    
    d. Which oven temperature, on average, produces the best results? 275
    
    e. Which combination of recipe, rack position, and temperature gives the best result? recipe b, bottom rack, 300 degrees

In [18]:
cake = pd.read_csv('cake_recipes.csv')

In [19]:
cake.head(2)

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847


In [20]:
cake.columns

Index(['recipe:position', '225', '250', '275', '300'], dtype='object')

In [21]:
cake = cake.melt(id_vars=['recipe:position'], var_name = 'temp', value_name='tastiness')

In [22]:
cake.head()

Unnamed: 0,recipe:position,temp,tastiness
0,a:bottom,225,61.738655
1,a:top,225,51.709751
2,b:bottom,225,57.09532
3,b:top,225,82.455004
4,c:bottom,225,96.470207


In [23]:
cols = cake['recipe:position'].str.split(':', expand=True)

In [24]:
cols.columns = ['recipe','position']

In [25]:
cols

Unnamed: 0,recipe,position
0,a,bottom
1,a,top
2,b,bottom
3,b,top
4,c,bottom
5,c,top
6,d,bottom
7,d,top
8,a,bottom
9,a,top


In [26]:
cake = pd.concat([cake.drop(columns='recipe:position'), cols],axis=1)

In [27]:
cake.head()

Unnamed: 0,temp,tastiness,recipe,position
0,225,61.738655,a,bottom
1,225,51.709751,a,top
2,225,57.09532,b,bottom
3,225,82.455004,b,top
4,225,96.470207,c,bottom


In [28]:
cake.groupby('recipe').tastiness.mean()

recipe
a    63.922201
b    76.736074
c    75.874748
d    62.864844
Name: tastiness, dtype: float64

In [30]:
cake.groupby('temp').tastiness.mean()

temp
225    71.306022
250    66.577437
275    74.886754
300    66.627655
Name: tastiness, dtype: float64

In [29]:
cake.groupby('position').tastiness.mean()

position
bottom    68.408553
top       71.290381
Name: tastiness, dtype: float64

In [32]:
cake.sort_values(by='tastiness', ascending=True).tail(1)

Unnamed: 0,temp,tastiness,recipe,position
26,300,99.248541,b,bottom
