# Dataframes
- Represent tabular, 2-dimensional data
- Made of pandas series
- Additional functionality

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample catalogue: one record per book, each with a title, a price and an author.
books = [
    dict(
        title="Genetic Algorithms and Machine Learning for Programmers",
        price=36.99,
        author="Frances Buontempo",
    ),
    dict(
        title="The Visual Display of Quantitative Information",
        price=38.00,
        author="Edward Tufte",
    ),
    dict(
        title="Practical Object-Oriented Design",
        author="Sandi Metz",
        price=30.47,
    ),
    dict(
        title="Weapons of Math Destruction",
        author="Cathy O'Neil",
        price=17.44,
    ),
]

In [4]:
# Build a DataFrame from the list of dicts: one row per dict, one column per key.
# NOTE: this rebinds `books` from a list to a DataFrame, so every later cell
# that uses `books` expects the DataFrame.
books = pd.DataFrame(books)
books

Unnamed: 0,title,price,author
0,Genetic Algorithms and Machine Learning for Pr...,36.99,Frances Buontempo
1,The Visual Display of Quantitative Information,38.0,Edward Tufte
2,Practical Object-Oriented Design,30.47,Sandi Metz
3,Weapons of Math Destruction,17.44,Cathy O'Neil


In [5]:
len(books)

4

In [6]:
# .shape is an attribute (no parentheses) holding a (rows, columns) tuple
books.shape

(4, 3)

In [None]:
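# obj.something   = attribute (no call), e.g. books.shape
# obj.something() = method (called on the object), e.g. books.price.sum()
# something(obj)  = function (called on its own), e.g. len(books)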

In [7]:
# .size returns the number of cells: rows x columns - counts null values
books.size


12

In [8]:
books.price.sum()

122.9

In [9]:
books.price

0    36.99
1    38.00
2    30.47
3    17.44
Name: price, dtype: float64

In [11]:
type(books.price)

pandas.core.series.Series

In [13]:
#Bracket notation
#Works if there is a space in the column name
#Best to use snake case - lowercase_with_underscores
books["price"].sum()

122.9

In [14]:
books.price.mean()

30.725

In [15]:
books.price.describe()

count     4.000000
mean     30.725000
std       9.464532
min      17.440000
25%      27.212500
50%      33.730000
75%      37.242500
max      38.000000
Name: price, dtype: float64

In [16]:
#boolean mask to return first row
books[books.index == 0]

Unnamed: 0,title,price,author
0,Genetic Algorithms and Machine Learning for Pr...,36.99,Frances Buontempo


In [18]:
# Select the 'price' value(s) for the row whose index label is 0 with a single
# .loc lookup (row mask, column label) rather than chaining two [] lookups.
books.loc[books.index == 0, 'price']

0    36.99
Name: price, dtype: float64

In [19]:
# pull a specific record by integer position
books.iloc[0]

title     Genetic Algorithms and Machine Learning for Pr...
price                                                 36.99
author                                    Frances Buontempo
Name: 0, dtype: object

In [20]:
books.price.idxmax()


1

In [23]:
books[books.index == books.price.idxmax()]


Unnamed: 0,title,price,author
1,The Visual Display of Quantitative Information,38.0,Edward Tufte


In [24]:
# Useful piece of info - save it as a variable.
# idxmax() returns the index *label* of the row with the highest price.

highest_price_index = books.price.idxmax()


In [25]:
# highest_price_index comes from idxmax(), which returns an index label, so the
# row is looked up by label with .loc. .iloc is positional and only gives the
# same row while the index is the default 0..n-1 range.
books.loc[highest_price_index]

title     The Visual Display of Quantitative Information
price                                               38.0
author                                      Edward Tufte
Name: 1, dtype: object

In [26]:
books[books.index == books.price.idxmin()]

Unnamed: 0,title,price,author
3,Weapons of Math Destruction,17.44,Cathy O'Neil


In [28]:
books[books.index >= 2]

Unnamed: 0,title,price,author
2,Practical Object-Oriented Design,30.47,Sandi Metz
3,Weapons of Math Destruction,17.44,Cathy O'Neil


In [29]:
books[books.author == 'Sandi Metz']

Unnamed: 0,title,price,author
2,Practical Object-Oriented Design,30.47,Sandi Metz


In [31]:
books[books.author.str.contains("San")]

Unnamed: 0,title,price,author
2,Practical Object-Oriented Design,30.47,Sandi Metz


In [32]:
# Comparing a Series to a scalar is element-wise and yields a boolean Series
x = pd.Series(range(1,11))
x >=5

0    False
1    False
2    False
3    False
4     True
5     True
6     True
7     True
8     True
9     True
dtype: bool

In [33]:
x[x>5]

5     6
6     7
7     8
8     9
9    10
dtype: int64

In [34]:
# Build a reproducible gradebook: one row per student, one column per subject.
# (numpy and pandas are imported once, in the import cell at the top.)

# Seed NumPy's global RNG so the generated grades are the same on every run.
np.random.seed(123)

students = ['Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John', 'Thomas',
            'Marie', 'Albert', 'Richard', 'Isaac', 'Alan']

# Randomly generate a score for each student in each subject.
# Every column must have the same length as `students`. The three draws stay in
# this order (math, english, reading) because each call advances the RNG.
n_students = len(students)
math_grades = np.random.randint(low=60, high=100, size=n_students)
english_grades = np.random.randint(low=60, high=100, size=n_students)
reading_grades = np.random.randint(low=60, high=100, size=n_students)

df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades})

type(df)

pandas.core.frame.DataFrame

In [35]:
# print() shows the plain-text repr of the whole frame; ending a cell with a
# bare `df` (next cell) renders it as a table instead.
print(df)

       name  math  english  reading
0     Sally    62       85       80
1      Jane    88       79       67
2     Suzie    94       74       95
3     Billy    98       96       88
4       Ada    77       92       98
5      John    79       76       93
6    Thomas    82       64       81
7     Marie    93       63       90
8    Albert    92       62       87
9   Richard    69       80       94
10    Isaac    92       99       93
11     Alan    92       62       72


In [36]:
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [37]:
df[df.name == "Jane"]

Unnamed: 0,name,math,english,reading
1,Jane,88,79,67


In [38]:
df[df.name == "Albert"].math

8    92
Name: math, dtype: int64

## Summarizing Dataframes

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     12 non-null     object
 1   math     12 non-null     int64 
 2   english  12 non-null     int64 
 3   reading  12 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 512.0+ bytes


In [41]:
df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,84.833333,77.666667,86.5
std,11.134168,13.371158,9.643651
min,62.0,62.0,67.0
25%,78.5,63.75,80.75
50%,90.0,77.5,89.0
75%,92.25,86.75,93.25
max,98.0,99.0,98.0


In [42]:
df.describe().index

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')

## Dataframe Attributes

- `dtypes` how each column's values are stored in memory. Certain functions only work with certain types
- `shape` rows and columns
- `columns` list of column names
- `index` labels for each row

In [44]:
df.dtypes


name       object
math        int64
english     int64
reading     int64
dtype: object

In [45]:
df.shape

(12, 4)

In [46]:
df.shape[0]

12

In [47]:
df.shape[1]

4

In [48]:
df.columns

Index(['name', 'math', 'english', 'reading'], dtype='object')

In [49]:
df.index

RangeIndex(start=0, stop=12, step=1)

In [50]:
# Column labels can be reassigned: replace every name with its upper-case form
df.columns = df.columns.str.upper()

In [51]:
df

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


## Subsetting DataFrame
- A number of ways to access certain subsets - like a query

### Accessing Multiple Columns
- Can see multiple columns with list of strings

In [53]:
# Double square brackets or df[list_variable]
# Returns only columns specified
df[['NAME', 'MATH']]

Unnamed: 0,NAME,MATH
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [54]:
name_math_cols =['NAME','MATH']
df[name_math_cols]

Unnamed: 0,NAME,MATH
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [55]:
df.head()

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98


In [56]:
math_df=df[name_math_cols]

In [57]:
math_df.head(3)

Unnamed: 0,NAME,MATH
0,Sally,62
1,Jane,88
2,Suzie,94


In [59]:
df.MATH

0     62
1     88
2     94
3     98
4     77
5     79
6     82
7     93
8     92
9     69
10    92
11    92
Name: MATH, dtype: int64

In [60]:
df['MATH']

0     62
1     88
2     94
3     98
4     77
5     79
6     82
7     93
8     92
9     69
10    92
11    92
Name: MATH, dtype: int64

### Accessing Row Subsets
- Pandas provides several ways 
    - `.head()`
    - `.tail()`
    - `.sample()`

In [61]:
df.tail(3)

Unnamed: 0,NAME,MATH,ENGLISH,READING
9,Richard,69,80,94
10,Isaac,92,99,93
11,Alan,92,62,72


In [62]:
# .sample(4) draws 4 rows at random. No random_state is passed, so which rows
# come back is not pinned by this cell — presumably it follows NumPy's global
# RNG state; pass random_state=<int> to make the draw repeatable (TODO confirm).
df.sample(4)

Unnamed: 0,NAME,MATH,ENGLISH,READING
7,Marie,93,63,90
11,Alan,92,62,72
1,Jane,88,79,67
4,Ada,77,92,98


Dataframes can be indexed into with a boolean series

In [64]:
df[df.MATH <80]

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
4,Ada,77,92,98
5,John,79,76,93
9,Richard,69,80,94


In [65]:
# Pipe | is element-wise OR: keep rows where either mask is True.
# Pattern: df[(mask1) | (mask2)] - each comparison is wrapped in parentheses
# because | binds more tightly than < and >.

df[(df.MATH <80)| (df.READING >90)]

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
2,Suzie,94,74,95
4,Ada,77,92,98
5,John,79,76,93
9,Richard,69,80,94
10,Isaac,92,99,93


In [66]:
# & is element-wise AND: keep rows where both masks are True.
# Pattern: df[(mask1) & (mask2)]
df[(df.MATH < 80) & (df.READING > 90)]

Unnamed: 0,NAME,MATH,ENGLISH,READING
4,Ada,77,92,98
5,John,79,76,93
9,Richard,69,80,94


In [67]:
#Can use variables for masks
math_less_than_80 = df.MATH <80
reading_greater_than_90 = df.READING >90
df[math_less_than_80 & reading_greater_than_90]

Unnamed: 0,NAME,MATH,ENGLISH,READING
4,Ada,77,92,98
5,John,79,76,93
9,Richard,69,80,94


## Dropping and Renaming Columns

- Can drop columns with `.drop` method
- Rename with `.rename`

- Original dataframe **will not be changed** (unless `inplace=True` is passed)
    - Method produces new dataframe

In [68]:
# Work on an independent copy so the drop examples below leave `df` untouched.

copy = df.copy()

In [84]:
copy

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [86]:
copy=df.copy()

In [87]:
copy.drop(columns = ['ENGLISH', 'READING'])

Unnamed: 0,NAME,MATH
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [88]:
# Drop MATH and keep the result by reassigning instead of using inplace=True:
# .drop returns a new DataFrame, so rebinding `copy` leaves it without the
# MATH column just as the in-place call did.
copy = copy.drop(columns='MATH')

In [90]:
df


Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [93]:
# .rename hands back a renamed copy and leaves df as it was, so the result
# has to be captured in a variable (or inplace=True passed) to keep it.
dfr = df.rename(
    columns={
        'NAME': 'Student',
        'MATH': 'Mathematics',
        'ENGLISH': 'English',
        'READING': 'Reading',
    }
)

In [94]:
dfr

Unnamed: 0,Student,Mathematics,English,Reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


## Can chain methods

## Creating New Columns

- We can add new columns to our dataframe
    - Often based on contents of another column

In [95]:
df

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [96]:
df.MATH >=70

0     False
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9     False
10     True
11     True
Name: MATH, dtype: bool

In [98]:
# Passing means 70 or above, matching the `df.MATH >=70` mask shown in the
# previous cell and the `>=70` rule used for passing_english. The earlier
# `> 70` would have marked a score of exactly 70 as failing.
df['passing_math'] = df.MATH >= 70

In [99]:
df

Unnamed: 0,NAME,MATH,ENGLISH,READING,passing_math
0,Sally,62,85,80,False
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True
6,Thomas,82,64,81,True
7,Marie,93,63,90,True
8,Albert,92,62,87,True
9,Richard,69,80,94,False


In [102]:
df.passing_math.sum()

10

In [104]:
df.passing_math.mean()

0.8333333333333334

In [105]:
df.assign(passing_english=df.ENGLISH >=70)

Unnamed: 0,NAME,MATH,ENGLISH,READING,passing_math,passing_english
0,Sally,62,85,80,False,True
1,Jane,88,79,67,True,True
2,Suzie,94,74,95,True,True
3,Billy,98,96,88,True,True
4,Ada,77,92,98,True,True
5,John,79,76,93,True,True
6,Thomas,82,64,81,True,False
7,Marie,93,63,90,True,False
8,Albert,92,62,87,True,False
9,Richard,69,80,94,False,True


In [107]:
df["School"] = "SAHS"

In [108]:
df

Unnamed: 0,NAME,MATH,ENGLISH,READING,passing_math,School
0,Sally,62,85,80,False,SAHS
1,Jane,88,79,67,True,SAHS
2,Suzie,94,74,95,True,SAHS
3,Billy,98,96,88,True,SAHS
4,Ada,77,92,98,True,SAHS
5,John,79,76,93,True,SAHS
6,Thomas,82,64,81,True,SAHS
7,Marie,93,63,90,True,SAHS
8,Albert,92,62,87,True,SAHS
9,Richard,69,80,94,False,SAHS


## Sorting Dataframes

- Can use `.sort_values`

In [110]:
df.sort_values(by="ENGLISH", ascending=False)

Unnamed: 0,NAME,MATH,ENGLISH,READING,passing_math,School
10,Isaac,92,99,93,True,SAHS
3,Billy,98,96,88,True,SAHS
4,Ada,77,92,98,True,SAHS
0,Sally,62,85,80,False,SAHS
9,Richard,69,80,94,False,SAHS
1,Jane,88,79,67,True,SAHS
5,John,79,76,93,True,SAHS
2,Suzie,94,74,95,True,SAHS
6,Thomas,82,64,81,True,SAHS
7,Marie,93,63,90,True,SAHS


## Chaining Dataframe Methods

In [111]:
# Chained calls read left to right: capitalize() runs first, then swapcase()
# is applied to its result.
"hello".capitalize().swapcase()

'hELLO'

In [116]:
# Name of the student with the *lowest* English grade above 90:
# filter the rows, sort ascending by grade, keep the first row, pick NAME.
(
    df[df.ENGLISH > 90]
    .sort_values(by='ENGLISH')
    .head(1)
    .NAME
)

4    Ada
Name: NAME, dtype: object

In [119]:
df[df.ENGLISH > 90]

Unnamed: 0,NAME,MATH,ENGLISH,READING,passing_math,School
3,Billy,98,96,88,True,SAHS
4,Ada,77,92,98,True,SAHS
10,Isaac,92,99,93,True,SAHS


In [120]:
df[df.ENGLISH > 90].sort_values(by="ENGLISH")

Unnamed: 0,NAME,MATH,ENGLISH,READING,passing_math,School
4,Ada,77,92,98,True,SAHS
3,Billy,98,96,88,True,SAHS
10,Isaac,92,99,93,True,SAHS


In [121]:
df[df.ENGLISH > 90].sort_values(by="ENGLISH").head(1)

Unnamed: 0,NAME,MATH,ENGLISH,READING,passing_math,School
4,Ada,77,92,98,True,SAHS


In [123]:
df[df.ENGLISH > 90].sort_values(by="ENGLISH").head(1).NAME

4    Ada
Name: NAME, dtype: object

In [113]:
df


Unnamed: 0,NAME,MATH,ENGLISH,READING,passing_math,School
0,Sally,62,85,80,False,SAHS
1,Jane,88,79,67,True,SAHS
2,Suzie,94,74,95,True,SAHS
3,Billy,98,96,88,True,SAHS
4,Ada,77,92,98,True,SAHS
5,John,79,76,93,True,SAHS
6,Thomas,82,64,81,True,SAHS
7,Marie,93,63,90,True,SAHS
8,Albert,92,62,87,True,SAHS
9,Richard,69,80,94,False,SAHS
