In [2]:
import os
import pandas as pd
import numpy as np
import random

In [3]:
# Build the input path from the current working directory and the CSV's
# file name, so the notebook reads whatever roster.csv sits next to it.
path1, path2 = os.getcwd(), 'roster.csv'
csvFilePath = os.path.join(path1, path2)

## Read CSV

In [4]:
# Load the roster CSV into a DataFrame and confirm the type pandas returned.
roster = pd.read_csv(csvFilePath)
print(type(roster))

<class 'pandas.core.frame.DataFrame'>


In [5]:
# Preview the first rows as a quick sanity check on the load.
roster.head()

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V


In [6]:
# Preview the last rows of the roster.
roster.tail()

Unnamed: 0,name
17,Hsin-Yun
18,Renata
19,Max
20,Joshua
21,David


In [7]:
# Display the roster DataFrame.
roster

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V
5,Mostafa
6,Daniela P
7,Cesar
8,Jarrod
9,Austin


## Modifying the Data

In [8]:
# Append one new student by wrapping the record in a one-row DataFrame and
# concatenating it; ignore_index=True renumbers the combined index from 0.
# NOTE: re-running this cell appends another 'Wally' row each time.
d = dict(name=['Wally'])
tmp_df = pd.DataFrame(d)
roster = pd.concat(objs=[roster, tmp_df], ignore_index=True)
roster

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V
5,Mostafa
6,Daniela P
7,Cesar
8,Jarrod
9,Austin


## Assign Grades

In [9]:
# Seed NumPy's global RNG so the draw is reproducible, then sample one
# integer grade per roster row from [0, 100) (the upper bound is exclusive).
np.random.seed(1)
grades = np.random.randint(low=0, high=100, size=roster.shape[0])
print(grades)

[37 12 72  9 75  5 79 64 16  1 76 71  6 25 50 20 18 84 11 28 29 14 50]


In [10]:
# Attach a grade column of random integers in [0, 100) to the roster.
# NOTE(review): this draws a fresh sample instead of reusing `grades` from the
# previous cell, so the column does not match the array printed there —
# confirm whether `roster['grade'] = grades` was intended.
roster['grade'] = np.random.randint(0,100, size=len(roster))
roster

Unnamed: 0,name,grade
0,Joe,68
1,Jihuan,87
2,Ali,87
3,Frances,94
4,Daniela V,96
5,Mostafa,86
6,Daniela P,13
7,Cesar,9
8,Jarrod,7
9,Austin,63


## Modify Specific row of data

`.loc` can be used with a boolean array (i.e. an array of True/False values, one per row) to select the rows to modify.

In [11]:
# Build a boolean mask for the one student of interest, then use .loc with
# (row mask, column label) to overwrite only that student's grade.
is_daniela_p = roster['name'] == "Daniela P"
roster.loc[is_daniela_p, "grade"] = 100
roster

Unnamed: 0,name,grade
0,Joe,68
1,Jihuan,87
2,Ali,87
3,Frances,94
4,Daniela V,96
5,Mostafa,86
6,Daniela P,100
7,Cesar,9
8,Jarrod,7
9,Austin,63


# Check the Class Average

In [12]:
# Mean of the grade column across the whole class.
roster['grade'].mean()

53.78260869565217

In [13]:
# Curve the class: add 30 points to every grade currently below 69, then
# re-check the class average.
# NOTE: this mutates roster in place, so re-running the cell curves again any
# grade that is still below 69 after the first pass.
below_cutoff = roster['grade'] < 69
roster.loc[below_cutoff, 'grade'] += 30
roster['grade'].mean()

72.04347826086956

# Write to CSV

In [14]:
# Resolve the output file name against the current working directory and
# show where the CSV will be written.
outFilePath = os.path.abspath('roster_pandas.csv')
print(outFilePath)

/Users/pengshen/HelloWorld/roster_pandas.csv


In [15]:
# Persist the roster; index=False keeps the row index out of the file.
roster.to_csv(path_or_buf=outFilePath, index=False)

### More Aggregation and Manipulation

In [16]:
# Draw one 'red'/'blue' label per roster row. The result is only displayed,
# not stored, and no seed is set in this cell.
np.random.choice(['red','blue'], size = len(roster))

array(['blue', 'blue', 'red', 'blue', 'blue', 'blue', 'blue', 'red',
       'red', 'blue', 'blue', 'red', 'red', 'red', 'red', 'blue', 'blue',
       'blue', 'red', 'blue', 'red', 'red', 'blue'], dtype='<U4')

In [17]:
# Re-seed the global RNG so the group assignment is reproducible, then give
# every student a randomly chosen group label.
np.random.seed(1)
group_labels = ['red', 'blue']
roster['group'] = np.random.choice(group_labels, size=roster.shape[0])
roster

Unnamed: 0,name,grade,group
0,Joe,98,blue
1,Jihuan,87,blue
2,Ali,87,red
3,Frances,94,red
4,Daniela V,96,blue
5,Mostafa,86,blue
6,Daniela P,100,blue
7,Cesar,39,blue
8,Jarrod,37,blue
9,Austin,93,red


In [18]:
# Average grade per group, indexed by group label.
# Select the numeric `grade` column explicitly: from pandas 2.0 onward
# GroupBy.mean() defaults to numeric_only=False and raises TypeError when the
# frame still contains string columns such as `name`, instead of silently
# dropping them. The double brackets keep the result a one-column DataFrame.
group_means = roster.groupby('group')[['grade']].mean()
group_means

Unnamed: 0_level_0,grade
group,Unnamed: 1_level_1
blue,63.833333
red,81.0


In [19]:
# Rename the aggregated column so it does not collide with roster's own
# `grade` column when the two frames are merged. Reassigning the result
# (rather than inplace=True) keeps the step chainable and makes the data
# flow explicit.
group_means = group_means.rename(columns={'grade': 'group_avg'})

In [20]:
# Display the per-group averages after the column rename.
group_means


Unnamed: 0_level_0,group_avg
group,Unnamed: 1_level_1
blue,63.833333
red,81.0


# Merging DataFrames

In [21]:
# Compare (rows, columns) of both frames before merging them.
print(roster.shape, group_means.shape, sep='\n')

(23, 3)
(2, 1)


In [22]:
# Attach each student's group average by joining roster's `group` column
# against the `group` index level of group_means.
# NOTE: roster is overwritten with the merged frame, so re-running this cell
# merges group_means in a second time.
roster = pd.merge(roster, group_means, on='group')

roster.shape

In [23]:
# Display the merged roster, now carrying the group_avg column.
roster

Unnamed: 0,name,grade,group,group_avg
0,Joe,98,blue,63.833333
1,Jihuan,87,blue,63.833333
2,Daniela V,96,blue,63.833333
3,Mostafa,86,blue,63.833333
4,Daniela P,100,blue,63.833333
5,Cesar,39,blue,63.833333
6,Jarrod,37,blue,63.833333
7,Ala,52,blue,63.833333
8,Miles,31,blue,63.833333
9,Hyeyun,30,blue,63.833333


### Creating new columns from custom functions

In [24]:
def is_top50(col):
    """Flag the values of ``col`` that lie strictly above its median.

    Values equal to the median are flagged False. The result has the same
    shape and index as ``col`` and holds booleans.
    """
    cutoff = col.median()
    return col > cutoff


In [25]:
# Flag students whose grade is above the class-wide median.
roster['top50'] = roster['grade'].pipe(is_top50)

In [26]:
# Display the roster with the new top50 flag.
roster

Unnamed: 0,name,grade,group,group_avg,top50
0,Joe,98,blue,63.833333,True
1,Jihuan,87,blue,63.833333,True
2,Daniela V,96,blue,63.833333,True
3,Mostafa,86,blue,63.833333,False
4,Daniela P,100,blue,63.833333,True
5,Cesar,39,blue,63.833333,False
6,Jarrod,37,blue,63.833333,False
7,Ala,52,blue,63.833333,False
8,Miles,31,blue,63.833333,False
9,Hyeyun,30,blue,63.833333,False


In [27]:
# Single brackets select the column as a pandas Series.
roster['grade']

0      98
1      87
2      96
3      86
4     100
5      39
6      37
7      52
8      31
9      30
10     38
11     72
12     87
13     94
14     93
15     91
16     87
17     90
18     81
19     88
20     43
21     77
22     60
Name: grade, dtype: int64

In [28]:
# Double brackets (a list of column names) select a one-column DataFrame.
roster[['grade']]

Unnamed: 0,grade
0,98
1,87
2,96
3,86
4,100
5,39
6,37
7,52
8,31
9,30


# Creating new columns from custom functions (within each group)


In [29]:
# Flag students whose grade is above the median of their own group.
# GroupBy.transform calls is_top50 on each group's grade Series and returns
# a result aligned to roster's original index, so it can be assigned directly
# as a column. GroupBy.apply is not safe for this: from pandas 2.0 onward it
# prepends the group key to the index of transform-like results, which no
# longer lines up with roster's index.
roster['top50_group'] = roster.groupby('group')['grade'].transform(is_top50)
roster

Unnamed: 0,name,grade,group,group_avg,top50,top50_group
0,Joe,98,blue,63.833333,True,True
1,Jihuan,87,blue,63.833333,True,True
2,Daniela V,96,blue,63.833333,True,True
3,Mostafa,86,blue,63.833333,False,True
4,Daniela P,100,blue,63.833333,True,True
5,Cesar,39,blue,63.833333,False,False
6,Jarrod,37,blue,63.833333,False,False
7,Ala,52,blue,63.833333,False,False
8,Miles,31,blue,63.833333,False,False
9,Hyeyun,30,blue,63.833333,False,False


## Apply

# pandas.Series.apply

In [30]:
# Keep the grade column as a Series for the Series.apply demonstration below,
# and confirm its type.
grade_series = roster['grade']
print(type(grade_series))
grade_series

<class 'pandas.core.series.Series'>


0      98
1      87
2      96
3      86
4     100
5      39
6      37
7      52
8      31
9      30
10     38
11     72
12     87
13     94
14     93
15     91
16     87
17     90
18     81
19     88
20     43
21     77
22     60
Name: grade, dtype: int64

In [31]:
def print_arg(x):
    """Print ``x`` exactly as received, to show what ``apply`` passes in.

    Returns None.
    """
    print(x)


def print_type(x):
    """Print the type of ``x``, to show what kind of object ``apply`` passes in.

    Returns None.
    """
    kind = type(x)
    print(kind)
    

In [32]:
# Series.apply calls the function once per element. print_arg returns None,
# so the resulting Series holds None for every row.
grade_series.apply(print_arg)

98
87
96
86
100
39
37
52
31
30
38
72
87
94
93
91
87
90
81
88
43
77
60


0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
Name: grade, dtype: object

# pandas.DataFrame.apply

In [33]:
# Keep the grade column as a one-column DataFrame for the DataFrame.apply
# demonstration below, and confirm its type.
grade_df = roster[['grade']]
print(type(grade_df))
grade_df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,grade
0,98
1,87
2,96
3,86
4,100
5,39
6,37
7,52
8,31
9,30


In [35]:
# With axis=0 the function is called once per column and receives that
# column as a Series.
grade_df.apply(print_type, axis=0)

<class 'pandas.core.series.Series'>


grade    None
dtype: object

In [36]:
# With axis=1 the function is called once per row and receives that row as
# a Series indexed by column name.
grade_df.apply(print_arg, axis=1)

grade    98
Name: 0, dtype: int64
grade    87
Name: 1, dtype: int64
grade    96
Name: 2, dtype: int64
grade    86
Name: 3, dtype: int64
grade    100
Name: 4, dtype: int64
grade    39
Name: 5, dtype: int64
grade    37
Name: 6, dtype: int64
grade    52
Name: 7, dtype: int64
grade    31
Name: 8, dtype: int64
grade    30
Name: 9, dtype: int64
grade    38
Name: 10, dtype: int64
grade    72
Name: 11, dtype: int64
grade    87
Name: 12, dtype: int64
grade    94
Name: 13, dtype: int64
grade    93
Name: 14, dtype: int64
grade    91
Name: 15, dtype: int64
grade    87
Name: 16, dtype: int64
grade    90
Name: 17, dtype: int64
grade    81
Name: 18, dtype: int64
grade    88
Name: 19, dtype: int64
grade    43
Name: 20, dtype: int64
grade    77
Name: 21, dtype: int64
grade    60
Name: 22, dtype: int64


0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
dtype: object

# DataFrameGroupBy

In [37]:
# Group the roster by the `group` column. The result is a DataFrameGroupBy
# object rather than a DataFrame, so displaying it shows only its repr.
groups = roster.groupby(by=['group'])
print(type(groups))
groups

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb6a53919a0>

In [38]:
# GroupBy.apply calls the function once per group, passing that group's rows.
groups.apply(print_arg)

         name  grade group  group_avg  top50  top50_group
0         Joe     98  blue  63.833333   True         True
1      Jihuan     87  blue  63.833333   True         True
2   Daniela V     96  blue  63.833333   True         True
3     Mostafa     86  blue  63.833333  False         True
4   Daniela P    100  blue  63.833333   True         True
5       Cesar     39  blue  63.833333  False        False
6      Jarrod     37  blue  63.833333  False        False
7         Ala     52  blue  63.833333  False        False
8       Miles     31  blue  63.833333  False        False
9      Hyeyun     30  blue  63.833333  False        False
10   Hsin-Yun     38  blue  63.833333  False        False
11      David     72  blue  63.833333  False         True
         name  grade group  group_avg  top50  top50_group
12        Ali     87   red       81.0   True        False
13    Frances     94   red       81.0   True         True
14     Austin     93   red       81.0   True         True
15       Jack 

In [39]:
# Each group is handed to the function as a DataFrame.
groups.apply(print_type)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [40]:
# Same grouping as `groups`, but passing the key as a plain string instead of
# a one-element list.
groups1 = roster.groupby('group')
print(type(groups1))
groups1

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb6a52f3ac0>

In [41]:
# Print each group's rows as passed in by GroupBy.apply.
groups1.apply(print_arg)

         name  grade group  group_avg  top50  top50_group
0         Joe     98  blue  63.833333   True         True
1      Jihuan     87  blue  63.833333   True         True
2   Daniela V     96  blue  63.833333   True         True
3     Mostafa     86  blue  63.833333  False         True
4   Daniela P    100  blue  63.833333   True         True
5       Cesar     39  blue  63.833333  False        False
6      Jarrod     37  blue  63.833333  False        False
7         Ala     52  blue  63.833333  False        False
8       Miles     31  blue  63.833333  False        False
9      Hyeyun     30  blue  63.833333  False        False
10   Hsin-Yun     38  blue  63.833333  False        False
11      David     72  blue  63.833333  False         True
         name  grade group  group_avg  top50  top50_group
12        Ali     87   red       81.0   True        False
13    Frances     94   red       81.0   True         True
14     Austin     93   red       81.0   True         True
15       Jack 

In [42]:
# Print the type of the object GroupBy.apply passes in for each group.
groups1.apply(print_type)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
