### Get started with _DataFrame_ !

#### Section 2.1

In [4]:
# example 1

import pandas as pd

# One Series per student record; the Series index supplies the column labels.
fields = ['Name', 'Class', 'Score']

record1 = pd.Series(['Alice', 'Physics', 85], index=fields)
record2 = pd.Series(['Jack', 'Chemistry', 82], index=fields)
record3 = pd.Series(['Helen', 'Biology', 90], index=fields)

# Stack the records as rows and label each row with its school.
df = pd.DataFrame([record1, record2, record3],
                  index=['school1', 'school2', 'school3'])

df.head()


Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school3,Helen,Biology,90


In [5]:
# example 2

# The same three records as plain dicts; the dict keys become the column labels.
students = [
    dict(Name='Alice', Class='Physics', Score=85),
    dict(Name='Jack', Class='Chemistry', Score=82),
    dict(Name='Helen', Class='Biology', Score=90),
]

df = pd.DataFrame(students,
                  index=['school1', 'school2', 'school3'])

df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school3,Helen,Biology,90


In [6]:
# example 3

# Select one row by its index label with .loc.
df.loc['school2']

Name          Jack
Class    Chemistry
Score           82
Name: school2, dtype: object

In [7]:
# Check the type of the object returned for a single-row selection.
type(df.loc['school2'])

pandas.core.series.Series

In [8]:
# example 4

# .loc also accepts a (row label, column label) pair to select a single value.
df.loc['school1', 'Name']

'Alice'

In [9]:
# example 5

# T ---> transpose the matrix.
# This pivots all of the rows into columns and all of the columns into rows, and is done with the T attribute.

df.T

Unnamed: 0,school1,school2,school3
Name,Alice,Jack,Helen
Class,Physics,Chemistry,Biology
Score,85,82,90


In [10]:
# example 6

# Indexing with a column label selects that column.
df['Name']

school1    Alice
school2     Jack
school3    Helen
Name: Name, dtype: object

In [11]:
# Chained form: select the row first, then the 'Name' entry of that row.
# NOTE(review): this is fine for reading a value, but prefer df.loc['school1', 'Name']
# when assigning, because chained indexing may write to a temporary copy.
df.loc['school1']['Name']

'Alice'

In [12]:
# example 7

# Ask for the names and scores of all schools using the .loc operator:
# ':' selects every row, and the list names the columns to keep.
df.loc[:,['Name', 'Score']]

Unnamed: 0,Name,Score
school1,Alice,85
school2,Jack,82
school3,Helen,90


In [13]:
# example 8

# The drop function doesn't change the DataFrame by default!
# Instead, it returns a copy of the DataFrame with the given rows removed.

df.drop('school1')

Unnamed: 0,Name,Class,Score
school2,Jack,Chemistry,82
school3,Helen,Biology,90


In [14]:
# Display df again: the 'school1' row is still there, so the drop above did not modify it.
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school3,Helen,Biology,90


In [15]:
# example 9

# Work on a copy so that the in-place drop below leaves `df` itself untouched.
copy_df = df.copy()

# inplace=True modifies copy_df directly instead of returning a new DataFrame;
# columns='Name' is the keyword spelling of "drop label 'Name' along axis 1".
copy_df.drop(columns='Name', inplace=True)

copy_df

Unnamed: 0,Class,Score
school1,Physics,85
school2,Chemistry,82
school3,Biology,90


In [16]:
# `del` removes the column from copy_df directly (in place).
del copy_df['Class']

copy_df

Unnamed: 0,Score
school1,85
school2,82
school3,90


In [17]:
# example 10

# Adding a new column to the DataFrame.
# Assigning a single value to a new column label fills every row with that value (None here).

df['ClassRanking'] = None

df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,82,
school3,Helen,Biology,90,


#### Section 2.2

( Indexing and Loading )

In [18]:
# Look at the content of a CSV file.
# Read it with plain Python rather than `!cat`, which only works where a `cat`
# command is available on the system running the notebook.

with open('datasets/data.csv') as csv_file:
    print(csv_file.read())

No,First name,Last name,Score
1,first_name1,last_name1,20
2,first_name2,last_name2,30
3,first_name3,last_name3,40
4,first_name4,last_name4,50
5,first_name5,last_name5,90
6,first_name6,last_name6,10
7,first_name7,last_name7,80
8,first_name8,last_name8,75
9,first_name9,last_name9,60
10,first_name10,last_name10,35


In [19]:
# example 1

# Load the CSV into a DataFrame; with no index_col given, the rows get a default integer index.
df_csv = pd.read_csv('datasets/data.csv')

df_csv.head()

Unnamed: 0,No,First name,Last name,Score
0,1,first_name1,last_name1,20
1,2,first_name2,last_name2,30
2,3,first_name3,last_name3,40
3,4,first_name4,last_name4,50
4,5,first_name5,last_name5,90


In [20]:
# example 2

# index_col=0 uses the first CSV column ('No') as the row index instead of a default one.
pd_csv = pd.read_csv('datasets/data.csv', index_col=0)

pd_csv.head()

Unnamed: 0_level_0,First name,Last name,Score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,first_name1,last_name1,20
2,first_name2,last_name2,30
3,first_name3,last_name3,40
4,first_name4,last_name4,50
5,first_name5,last_name5,90


In [21]:
# example 3

# Renaming of one column.
# rename returns a new DataFrame; df_csv itself keeps its original labels.

new_df = df_csv.rename(columns={'No': 'Number'})

new_df.head()

Unnamed: 0,Number,First name,Last name,Score
0,1,first_name1,last_name1,20
1,2,first_name2,last_name2,30
2,3,first_name3,last_name3,40
3,4,first_name4,last_name4,50
4,5,first_name5,last_name5,90


In [22]:
# example 4

# mapper=str.strip applies str.strip to every column label (axis='columns'),
# removing any leading/trailing whitespace. The labels in this file have none,
# so the result looks the same as df_csv.
new_df = df_csv.rename(mapper=str.strip, axis='columns')

new_df.head()


Unnamed: 0,No,First name,Last name,Score
0,1,first_name1,last_name1,20
1,2,first_name2,last_name2,30
2,3,first_name3,last_name3,40
3,4,first_name4,last_name4,50
4,5,first_name5,last_name5,90


In [23]:
# List the column labels of df_csv.
df_csv.columns

Index(['No', 'First name', 'Last name', 'Score'], dtype='object')

In [24]:
# example 5

# Changing all of the column names to lower case.

# Normalise every label in one pass (lower-cased, surrounding whitespace removed),
# then assign the new labels back to the frame.
cols = [label.lower().strip() for label in df_csv.columns]
df_csv.columns = cols

df_csv.head()

Unnamed: 0,no,first name,last name,score
0,1,first_name1,last_name1,20
1,2,first_name2,last_name2,30
2,3,first_name3,last_name3,40
3,4,first_name4,last_name4,50
4,5,first_name5,last_name5,90


#### Section 2.3

( Querying a DataFrame )

In [25]:
# example 1

# Reload the data with the first column as the index, then lower-case and strip the column labels.
df = pd.read_csv('datasets/data.csv', index_col=0)

df.columns = [x.lower().strip() for x in df.columns]

df.head()

Unnamed: 0_level_0,first name,last name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,first_name1,last_name1,20
2,first_name2,last_name2,30
3,first_name3,last_name3,40
4,first_name4,last_name4,50
5,first_name5,last_name5,90


In [26]:
# Boolean mask: True for the rows whose score is greater than 50.
admit_mask = df['score'] > 50

admit_mask

No
1     False
2     False
3     False
4     False
5      True
6     False
7      True
8      True
9      True
10    False
Name: score, dtype: bool

In [27]:
# where() keeps every row of df but fills the rows where the mask is False with NaN.
df.where(admit_mask)

Unnamed: 0_level_0,first name,last name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,,,
2,,,
3,,,
4,,,
5,first_name5,last_name5,90.0
6,,,
7,first_name7,last_name7,80.0
8,first_name8,last_name8,75.0
9,first_name9,last_name9,60.0
10,,,


In [28]:
# The returned DataFrame now has all of the NaN rows dropped,
# leaving only the rows where the mask was True.

df.where(admit_mask).dropna().head()

Unnamed: 0_level_0,first name,last name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,first_name5,last_name5,90.0
7,first_name7,last_name7,80.0
8,first_name8,last_name8,75.0
9,first_name9,last_name9,60.0


In [29]:
# Boolean indexing: passing the condition inside [] keeps only the rows where it is True.
df[df['score'] > 50].head()

Unnamed: 0_level_0,first name,last name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,first_name5,last_name5,90
7,first_name7,last_name7,80
8,first_name8,last_name8,75
9,first_name9,last_name9,60


In [30]:
# A list of column labels selects several columns at once.
df[['first name', 'score']].head()

Unnamed: 0_level_0,first name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1
1,first_name1,20
2,first_name2,30
3,first_name3,40
4,first_name4,50
5,first_name5,90


In [31]:
# example 2

# To combine conditions, use '&' instead of 'and', with each condition in parentheses.
# e.g. : (condition 1) & (condition 2).
# Comparison methods such as .gt() ("greater than") build the same kind of boolean mask as '>'.

df['score'].gt(30)

No
1     False
2     False
3      True
4      True
5      True
6     False
7      True
8      True
9      True
10     True
Name: score, dtype: bool

#### Section 2.4

( Indexing DataFrames )

In [32]:
# example 1

# Reload the original data, using the first column ('No') as the index.
df = pd.read_csv('datasets/data.csv', index_col=0)

df.head()

Unnamed: 0_level_0,First name,Last name,Score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,first_name1,last_name1,20
2,first_name2,last_name2,30
3,first_name3,last_name3,40
4,first_name4,last_name4,50
5,first_name5,last_name5,90


In [33]:
# Keep the current index values as a 'Serial Number' column, then make 'Score' the index.
# NOTE: this cell rebinds df, so run it only once per load of the data -- after it
# has run, 'Score' is the index and no longer a column.
df = df.assign(**{'Serial Number': df.index}).set_index('Score')

df.head()

Unnamed: 0_level_0,First name,Last name,Serial Number
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20,first_name1,last_name1,1
30,first_name2,last_name2,2
40,first_name3,last_name3,3
50,first_name4,last_name4,4
90,first_name5,last_name5,5


In [34]:
# example 2

# unique() returns the distinct values of the column as an array.
df['Serial Number'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

#### Section 2.5

( Missing Values )

In [35]:
# example 1

# Load the class grades; some entries in this data set are missing.
df = pd.read_csv('datasets/class_grades.csv')

df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [36]:
# isnull() returns a frame of the same shape holding True wherever a value is missing.
mask = df.isnull()

mask.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,True,False,False
3,False,True,True,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,True,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [37]:
# Drop all of those rows which have any missing data.
# dropna() returns a new frame here; df itself is not modified.

df.dropna().head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [38]:
# example 2

# Replace every missing value with 0.
# The result is rebound to df instead of using inplace=True: the two are equivalent
# for the cells that follow, but rebinding keeps the call chainable and explicit.
df = df.fillna(0)

df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,0.0,63.15,48.89
3,7,0.0,0.0,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,0.0,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [39]:
# example 3

# Sorting.

# Make 'Final' the index and order the rows by it, in one chained step.
df = df.set_index('Final').sort_index()

df.head(5)

Unnamed: 0_level_0,Prefix,Assignment,Tutorial,Midterm,TakeHome
Final,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
48.89,8,83.7,83.17,0.0,63.15
50.0,8,95.05,102.99,56.25,99.07
50.83,8,84.26,93.1,47.5,18.52
52.5,5,57.14,34.09,64.38,51.48
56.11,7,72.85,86.85,60.0,0.0


In [40]:
# example 4

# Small demo frame built row by row: two integer columns (A, B) and one string column (C).
df = pd.DataFrame(
    [(1, 3, 'a'),
     (1, 6, 'b'),
     (2, 3, 'c'),
     (3, 8, 'd'),
     (4, 9, 'e')],
    columns=['A', 'B', 'C'],
)

df

Unnamed: 0,A,B,C
0,1,3,a
1,1,6,b
2,2,3,c
3,3,8,d
4,4,9,e


In [41]:
# Replace every occurrence of the value 1 with 100.
df.replace(1, 100)

Unnamed: 0,A,B,C
0,100,3,a
1,100,6,b
2,2,3,c
3,3,8,d
4,4,9,e


In [42]:
# With two lists, each value in the first is replaced by the value at the same
# position in the second: 1 -> 100 and 3 -> 300.
df.replace([1, 3], [100, 300])

Unnamed: 0,A,B,C
0,100,300,a
1,100,6,b
2,2,300,c
3,300,8,d
4,4,9,e


#### Section 2.6

( Manipulating DataFrame )

In [43]:
# example 1

def splitname(row):
    """Add 'First' and 'Last' entries derived from the record's 'President' entry.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must contain a 'President' key holding the full name.

    Returns
    -------
    The same ``row`` object, with two entries added (or overwritten):
    'First' is the first whitespace-separated word of the name and 'Last'
    is the last one. Both are None when the name is missing (not a string)
    or contains no words.
    """
    name = row['President']

    # Split only once; a missing or blank name simply has no parts, which avoids
    # the AttributeError / IndexError the naive name.split()[0] would raise.
    parts = name.split() if isinstance(name, str) else []

    row['First'] = parts[0] if parts else None
    row['Last'] = parts[-1] if parts else None

    return row

# Load the presidents table and run splitname on every row
# (axis='columns' hands the function one row at a time).
df = pd.read_csv('datasets/presidents.csv').apply(splitname, axis='columns')

df


Unnamed: 0,#,President,Born,First,Last
0,1,George Washington,Feb 22 (1732),George,Washington
1,2,John Adams,Oct 30 (1735),John,Adams
2,3,Thomas Jefferson,Apr 13 (1743),Thomas,Jefferson
3,4,James Madison,Mar 16 (1751),James,Madison
4,5,James Monroe,Apr 28 (1758),James,Monroe


In [44]:
# example 2

# Raw string, so that the regex escape \w reaches the regex engine unchanged
# (in a normal string literal "\w" is an invalid escape sequence and triggers a warning).
# Group 1 captures the first word, group 2 the last word; the part in between
# is matched by a non-capturing group.
pattern = r"(^[\w]*)(?:.* )([\w]*$)"

# str.extract returns one column per capture group of the pattern.
df['President'].str.extract(pattern).head()

Unnamed: 0,0,1
0,George,Washington
1,John,Adams
2,Thomas,Jefferson
3,James,Madison
4,James,Monroe
