In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the trial records from 'trials_01.csv' (a path relative to the
# working directory) into a DataFrame named `trials`.
trials = pd.read_csv('trials_01.csv')

In [3]:
# Show the whole frame as plain text; it only has a handful of rows.
print(trials)

   id treatment gender  response
0   1         A      F         5
1   2         A      M         3
2   3         B      F         8
3   4         B      M         9


In [4]:
# Inspect the row index: the frame carries a RangeIndex running from 0
# up to (but not including) 4, i.e. one integer label per row.
trials.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Select a single column by its name; the result keeps the frame's
# row labels (0..3) and is named after the column.
trials['treatment']

0    A
1    A
2    B
3    B
Name: treatment, dtype: object

In [6]:
# Reshape from long to wide with pivot():
#   index   -> column whose values become the row labels
#   columns -> column whose values become the new column headers
#   values  -> column whose entries fill the resulting cells
trials.pivot(
    index='treatment',
    columns='gender',
    values='response',
)

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,3
B,8,9


In [9]:
# pivot_table() is called with the same index/columns/values arguments
# as pivot() to request the same treatment-by-gender layout.
trials.pivot_table(
    index='treatment',
    columns='gender',
    values='response',
)

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,3
B,8,9


In [10]:
# Plain-text rendering of the treatment-by-gender pivot via print().
print(
    trials.pivot(
        index='treatment',
        columns='gender',
        values='response',
    )
)

gender     F  M
treatment      
A          5  3
B          8  9


In [12]:
# When 'values' is omitted, pivot() spreads every remaining column
# (here 'id' and 'response') across the gender labels, which yields
# two-level column headers.
tp = trials.pivot(index='treatment', columns='gender')
print(tp)

          id    response   
gender     F  M        F  M
treatment                  
A          1  2        5  3
B          3  4        8  9


In [14]:
# Transpose the pivoted frame: the two-level column headers become a
# two-level row index and the treatment labels become the columns.
print(tp.T)

treatment        A  B
         gender      
id       F       1  3
         M       2  4
response F       5  8
         M       3  9


In [15]:
# The row index of the pivoted frame holds the treatment labels.
tp.index

Index(['A', 'B'], dtype='object', name='treatment')

In [5]:
# Load a second, larger set of trials in which the same
# (treatment, gender) combination occurs on more than one row.
more_trials = pd.read_csv('trials_03.csv')
print(more_trials)

   id treatment gender  response
0   1         A      F         5
1   2         A      M         3
2   3         A      M         8
3   4         A      F         9
4   5         B      F         1
5   6         B      M         8
6   7         B      F         4
7   8         B      F         6


In [6]:
# pivot() cannot reshape more_trials: several rows share the same
# (treatment, gender) pair, so one cell would have to hold more than
# one response value. The failure is the point of this cell, so the
# expected ValueError is caught and its message printed; that way the
# notebook still runs from top to bottom without stopping here.
try:
    more_trials.pivot(index='treatment', columns='gender', values='response')
except ValueError as err:
    print("ValueError: {}".format(err))

ValueError: Index contains duplicate entries, cannot reshape

In [18]:
# pivot_table() copes with repeated (treatment, gender) pairs by
# aggregating them; 'aggfunc' picks the aggregation (here the sum of
# the responses that fall into each cell).
more_trials.pivot_table(
    index='treatment',
    columns='gender',
    values='response',
    aggfunc='sum',
)

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,14,11
B,11,8


In [24]:
# The mean is the default aggregation: asking for aggfunc='mean'
# explicitly and leaving aggfunc out print the same table.
print(
    more_trials.pivot_table(
        index='treatment',
        columns='gender',
        values='response',
        aggfunc='mean',
    )
)

print(
    more_trials.pivot_table(
        index='treatment',
        columns='gender',
        values='response',
    )
)

gender            F    M
treatment               
A          7.000000  5.5
B          3.666667  8.0
gender            F    M
treatment               
A          7.000000  5.5
B          3.666667  8.0


In [25]:
# Re-display the original frame for reference before re-indexing it.
print(trials)

   id treatment gender  response
0   1         A      F         5
1   2         A      M         3
2   3         B      F         8
3   4         B      M         9


In [11]:
# Use the values of the 'gender' column as the row labels.
# set_index() returns a new frame (kept here as trials2); `trials`
# itself is left unchanged and still has its 'gender' column.
trials2 = trials.set_index(['gender'])
print(trials2)

        id treatment  response
gender                        
F        1         A         5
M        2         A         3
F        3         B         8
M        4         B         9


In [12]:
# The new index carries the gender labels in row order; labels are
# allowed to repeat (F and M each appear twice).
trials2.index

Index(['F', 'M', 'F', 'M'], dtype='object', name='gender')

In [21]:
# Reorder the rows by their index labels. Ascending order is what
# sort_index() does by default, so the flag is not spelled out.
trials2 = trials2.sort_index()
print(trials2)

        id treatment  response
gender                        
F        1         A         5
F        3         B         8
M        2         A         3
M        4         B         9


In [19]:
# Row labels in increasing order (ascending=True is also the default,
# so sort_index() with no argument gives the same result).
print(trials.set_index(["id"]).sort_index(ascending=True))
# Row labels in decreasing order.
print(trials.set_index(["id"]).sort_index(ascending=False))

   treatment gender  response
id                           
1          A      F         5
2          A      M         3
3          B      F         8
4          B      M         9
   treatment gender  response
id                           
4          B      M         9
3          B      F         8
2          A      M         3
1          A      F         5


# DataFrame indices are immutable: individual labels cannot be reassigned in place

In [22]:
# After sorting, equal labels sit next to each other in the index.
trials2.index

Index(['F', 'F', 'M', 'M'], dtype='object', name='gender')

In [23]:
# Reading a single label by position is allowed.
trials2.index[2]

'M'

In [44]:
# Writing a single label by position is not allowed: Index objects
# reject item assignment. The failure is the point of this cell, so the
# expected TypeError is caught and its message printed; that way the
# notebook still runs from top to bottom without stopping here.
try:
    trials2.index[2] = 'F'
except TypeError as err:
    print("TypeError: {}".format(err))

TypeError: Index does not support mutable operations

In [24]:
# Passing several column names to set_index() builds a multi-level
# (hierarchical) row index: 'treatment' is the outer level and
# 'gender' the inner level.
trials3 = trials.set_index(['treatment', 'gender'])
print(trials3)

                  id  response
treatment gender              
A         F        1         5
          M        2         3
B         F        3         8
          M        4         9


In [47]:
# Same two columns in the opposite order. Because the rows are not
# sorted by the new outer level, the gender labels alternate
# (F, M, F, M) instead of being grouped, which makes this layout
# harder to read.
trials5 = trials.set_index(['gender', 'treatment'])
print(trials5)

                  id  response
gender treatment              
F      A           1         5
M      A           2         3
F      B           3         8
M      B           4         9


In [48]:
# Inspect the MultiIndex: it lists the distinct labels of each level,
# the per-row positions into those labels, and the level names.
print(trials3.index)

MultiIndex(levels=[['A', 'B'], ['F', 'M']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['treatment', 'gender'])


In [49]:
# Move the 'treatment' level out of the row index and into the column
# headers, leaving 'gender' as the only row label.
print(trials3.unstack(level='treatment'))

          id    response   
treatment  A  B        A  B
gender                     
F          1  3        5  8
M          2  4        3  9


In [50]:
# Keep the frame unstacked by 'treatment' under a name so it can be
# reshaped and inspected further.
unstacked_trials = trials3.unstack(level='treatment')
print(unstacked_trials)

          id    response   
treatment  A  B        A  B
gender                     
F          1  3        5  8
M          2  4        3  9


In [53]:
# stack() is the reverse move: the 'treatment' column level goes back
# into the row index, now as the inner level beneath 'gender'.
print(unstacked_trials.stack(level='treatment'))

                  id  response
gender treatment              
F      A           1         5
       B           3         8
M      A           2         3
       B           4         9


In [54]:
# Unstack the other level instead: 'gender' moves to the column
# headers and 'treatment' stays as the row label.
print(trials3.unstack(level='gender'))

          id    response   
gender     F  M        F  M
treatment                  
A          1  2        5  3
B          3  4        8  9


In [56]:
# Show the frame unstacked by 'treatment' again before looking at its
# index and columns separately.
print(unstacked_trials)

          id    response   
treatment  A  B        A  B
gender                     
F          1  3        5  8
M          2  4        3  9


In [57]:
# After unstacking, the row index is a single-level index of gender
# labels.
print(unstacked_trials.index)

Index(['F', 'M'], dtype='object', name='gender')


In [59]:
# The columns are now a two-level MultiIndex: the original column
# names form the top (unnamed) level, with 'treatment' beneath them.
print(unstacked_trials.columns)

MultiIndex(levels=[['id', 'response'], ['A', 'B']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=[None, 'treatment'])


In [64]:
# trials3 itself is not modified by unstack(); it still has the
# (treatment, gender) row index.
print(trials3)

                  id  response
treatment gender              
A         F        1         5
          M        2         3
B         F        3         8
          M        4         9


In [61]:
# The row index of trials3 is the two-level (treatment, gender)
# MultiIndex.
trials3.index

MultiIndex(levels=[['A', 'B'], ['F', 'M']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['treatment', 'gender'])

In [67]:
# Swap the two row-index levels so 'gender' becomes the outer level,
# then sort so that rows sharing an outer label are grouped together.
print(trials3.swaplevel().sort_index())

                  id  response
gender treatment              
F      A           1         5
       B           3         8
M      A           2         3
       B           4         9


In [66]:
# Confirm the level order of the swapped-and-sorted index:
# 'gender' first, then 'treatment'.
trials3.swaplevel().sort_index().index

MultiIndex(levels=[['F', 'M'], ['A', 'B']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['gender', 'treatment'])

# test edit