In [1]:
# The pandas tutorials are really good:
# http://pandas.pydata.org/pandas-docs/stable/tutorials.html
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Fetch the sleep study CSV from its URL and parse it into a data frame
sleep_csv_url = 'http://lignos.org/pybootcamp/examples/sleep_study.csv'
sleep = pd.read_csv(sleep_csv_url)

In [3]:
# List the column labels of the data frame
sleep.columns

Index(['Reaction', 'Days', 'Subject'], dtype='object')

In [4]:
# Number of observations: the row count is the first entry of the frame's shape
sleep.shape[0]

180

In [5]:
# Summary statistics for each numeric column. Notice that Subject shows up
# here, which means pandas is treating the subject IDs as numbers.
sleep.describe()

Unnamed: 0,Reaction,Days,Subject
count,180.0,180.0,180.0
mean,298.507892,4.5,341.277778
std,56.328757,2.880293,20.320905
min,194.3322,0.0,308.0
25%,255.375825,2.0,331.0
50%,288.6508,4.5,336.0
75%,336.752075,7.0,352.0
max,466.3535,9.0,372.0


In [6]:
# Convert Subject to strings so it is no longer treated as a number
sleep['Subject'] = sleep['Subject'].astype(str)

In [7]:
# Add a new column holding the base-10 log of Reaction. A column that does
# not exist yet must be created with bracket syntax;
# sleep.LogReaction = ... will not create it.
sleep['LogReaction'] = np.log10(sleep['Reaction'])
sleep['LogReaction'].head()

0    2.397175
1    2.412804
2    2.399329
3    2.507100
4    2.552488
Name: LogReaction, dtype: float64

In [8]:
# Remove the LogReaction column again. `del` is a statement, not a function,
# so no parentheses are needed around its target.
del sleep['LogReaction']

In [9]:
# Use head to see the first rows of the data frame (five by default)
sleep.head()

Unnamed: 0,Reaction,Days,Subject
0,249.56,0,308
1,258.7047,1,308
2,250.8006,2,308
3,321.4398,3,308
4,356.8519,4,308


In [10]:
# head also works on an individual column. Note the dot (attribute) syntax
# used to select the column.
sleep.Reaction.head()

0    249.5600
1    258.7047
2    250.8006
3    321.4398
4    356.8519
Name: Reaction, dtype: float64

In [11]:
# Bracket syntax is another way to get a column; it is the one to use when a
# column name is not a valid Python identifier (e.g. it contains spaces).
sleep['Reaction'].head()

0    249.5600
1    258.7047
2    250.8006
3    321.4398
4    356.8519
Name: Reaction, dtype: float64

In [12]:
# Columns support many standard methods directly, for example the mean
sleep['Reaction'].mean()

298.50789166666664

In [13]:
# The distinct values found in the Subject column
sleep['Subject'].unique()

array(['308', '309', '310', '330', '331', '332', '333', '334', '335',
       '337', '349', '350', '351', '352', '369', '370', '371', '372'], dtype=object)

In [14]:
# How many distinct subjects there are
sleep['Subject'].nunique()

18

In [15]:
# Sort by reaction time, largest first (ascending=False reverses the default
# ascending order). sort_values returns a new frame and leaves `sleep`
# unchanged. Only the top rows are displayed so the whole 180-row frame is
# not dumped into the notebook.
sleep.sort_values('Reaction', ascending=False).head(10)

Unnamed: 0,Reaction,Days,Subject
9,466.3535,9,308
99,458.9167,9,337
98,455.8643,8,337
56,454.1619,6,332
8,430.5853,8,308
97,416.6923,7,337
5,414.6901,5,308
96,404.2601,6,337
118,394.4872,8,350
95,391.8385,5,337


In [16]:
# Keep only the rows for subject '308' by indexing with a boolean mask.
# The ID is compared as a string because Subject was converted to str.
is_subject_308 = sleep['Subject'] == '308'
subj = sleep[is_subject_308]

In [17]:
# Summary statistics for this one subject's numeric columns
subj.describe()

Unnamed: 0,Reaction,Days
count,10.0,10.0
mean,342.13383,4.5
std,79.821763,3.02765
min,249.56,0.0
25%,266.565675,2.25
50%,339.14585,4.5
75%,406.568525,6.75
max,466.3535,9.0


In [18]:
# Split the rows into groups, one per distinct Subject value
sleep_subjects = sleep.groupby(by='Subject')

In [19]:
# The number of groups equals the number of subjects
sleep_subjects.ngroups

18

In [20]:
# Per-subject means of the remaining columns (Reaction and Days)
sleep_subjects.mean()

Unnamed: 0_level_0,Reaction,Days
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1
308,342.13383,4.5
309,215.23298,4.5
310,231.00127,4.5
330,303.22142,4.5
331,309.43605,4.5
332,307.30207,4.5
333,316.15831,4.5
334,295.30205,4.5
335,250.07004,4.5
337,375.72101,4.5


In [21]:
# Per-day reaction time means. Reaction is selected explicitly because
# Subject is a string column by this point: pandas >= 2.0 raises an error
# when asked to average it, whereas older versions silently dropped it.
sleep_days = sleep.groupby('Days')
sleep_days[['Reaction']].mean()

Unnamed: 0_level_0,Reaction
Days,Unnamed: 1_level_1
0,256.651806
1,264.495756
2,265.3619
3,282.992011
4,288.649422
5,308.518456
6,312.178256
7,318.750583
8,336.629506
9,350.851222


In [22]:
# Standard deviation of reaction time by day. Reaction is selected
# explicitly because Subject is a string column by this point: pandas >= 2.0
# raises an error when asked to aggregate it, whereas older versions
# silently dropped it.
sleep_days[['Reaction']].std()

Unnamed: 0_level_0,Reaction
Days,Unnamed: 1_level_1
0,32.129451
1,33.430334
2,29.473423
3,38.857738
4,42.537887
5,51.769625
6,63.17372
7,50.103963
8,60.199716
9,66.986155
