### Get started with _DataFrame_ !

#### Section 2.1

In [2]:
# example 1

import pandas as pd

# Each student record is a Series; its labels (Name/Class/Score) become the
# DataFrame's column names.
record1 = pd.Series(dict(Name='Alice', Class='Physics', Score=85))
record2 = pd.Series(dict(Name='Jack', Class='Chemistry', Score=82))
record3 = pd.Series(dict(Name='Helen', Class='Biology', Score=90))

# One row per Series; `index` supplies the row labels.
df = pd.DataFrame(
    data=[record1, record2, record3],
    index=['school1', 'school2', 'school3'],
)

df.head()


Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school3,Helen,Biology,90


In [43]:
# example 2

# A list of plain dicts works too: each dict is one row and its keys are the
# column names.
students = [
    dict(Name='Alice', Class='Physics', Score=85),
    dict(Name='Jack', Class='Chemistry', Score=82),
    dict(Name='Helen', Class='Biology', Score=90),
]

df = pd.DataFrame(data=students, index=['school1', 'school2', 'school3'])

df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school3,Helen,Biology,90


In [44]:
# example 3

# .loc with a single row label returns that row as a Series.
df.loc['school2']

Name          Jack
Class    Chemistry
Score           82
Name: school2, dtype: object

In [45]:
# Confirm that a single-row .loc selection is a Series, not a DataFrame.
type(df.loc['school2'])

pandas.core.series.Series

In [46]:
# example 4

# .loc[row_label, column_label] returns the single value stored at that cell.
df.loc['school1', 'Name']

'Alice'

In [47]:
# example 5

# T ---> transpose the matrix.
# Transposing pivots all of the rows into columns and all of the columns into rows;
# it is done with the T attribute.

df.T

Unnamed: 0,school1,school2,school3
Name,Alice,Jack,Helen
Class,Physics,Chemistry,Biology
Score,85,82,90


In [48]:
# example 6

# Indexing the DataFrame with a column name returns that column as a Series.
df['Name']

school1    Alice
school2     Jack
school3    Helen
Name: Name, dtype: object

In [49]:
# Chained form: select the row first, then the 'Name' entry of that Series.
# It yields the same value as df.loc['school1', 'Name']; prefer the single
# .loc call, especially when assigning values.
df.loc['school1']['Name']

'Alice'

In [50]:
# example 7

# We ask for the names and scores of all schools using the .loc operator:
# ':' selects every row, and the list selects the columns to keep.
df.loc[:,['Name', 'Score']]

Unnamed: 0,Name,Score
school1,Alice,85
school2,Jack,82
school3,Helen,90


In [65]:
# example 8

# The drop function doesn't change the DataFrame by default!
# Instead, it returns a copy of the DataFrame with the given rows removed.

df.drop('school1')

Unnamed: 0,Name,Class,Score,ClassRanking
school2,Jack,Chemistry,82,
school3,Helen,Biology,90,


In [66]:
# df still contains the 'school1' row: drop() returned a new frame and left df untouched.
df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,82,
school3,Helen,Biology,90,


In [67]:
# example 9

# Work on a copy so that df itself keeps all of its columns.
copy_df = df.copy()

# inplace=True makes drop() modify copy_df directly instead of returning a new frame;
# columns='Name' is the keyword spelling of drop('Name', axis=1).
copy_df.drop(columns='Name', inplace=True)

copy_df

Unnamed: 0,Class,Score,ClassRanking
school1,Physics,85,
school2,Chemistry,82,
school3,Biology,90,


In [68]:
# `del` removes a column from the DataFrame in place; nothing is returned.
del copy_df['Class']

copy_df

Unnamed: 0,Score,ClassRanking
school1,85,
school2,82,
school3,90,


In [57]:
# example 10

# Adding a new column to the DataFrame.
# Assigning a scalar (here None) to a new column name fills every row with it.
# NOTE(review): the execution counts show this cell was run (In [57]) before
# examples 8-9 (In [65]-[68]), which is why their saved outputs already contain
# 'ClassRanking'. On a fresh top-to-bottom run they would not -- consider moving
# this cell above example 8 and re-running the notebook.

df['ClassRanking'] = None

df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,82,
school3,Helen,Biology,90,


#### Section 2.2

( Indexing and Loading )

In [206]:
# Look at the raw content of the CSV file.
# The file is read with plain Python rather than the `!cat` shell escape, so the
# cell does not depend on a `cat` executable being available on the host.

with open('datasets/data.csv', encoding='utf-8') as csv_file:
    # end='' prints the text exactly as stored, without appending an extra newline.
    print(csv_file.read(), end='')

No,First name,Last name,Score
1,first_name1,last_name1,20
2,first_name2,last_name2,30
3,first_name3,last_name3,40
4,first_name4,last_name4,50
5,first_name5,last_name5,90
6,first_name6,last_name6,10
7,first_name7,last_name7,80
8,first_name8,last_name8,75
9,first_name9,last_name9,60
10,first_name10,last_name10,35


In [207]:
# example 1

# With no index_col, read_csv creates a default integer index (0, 1, 2, ...)
# and keeps 'No' as an ordinary column.
df_csv = pd.read_csv('datasets/data.csv')

df_csv.head()

Unnamed: 0,No,First name,Last name,Score
0,1,first_name1,last_name1,20
1,2,first_name2,last_name2,30
2,3,first_name3,last_name3,40
3,4,first_name4,last_name4,50
4,5,first_name5,last_name5,90


In [208]:
# example 2

# index_col=0 uses the first CSV column ('No') as the row index instead of
# creating a default integer index.
pd_csv = pd.read_csv('datasets/data.csv', index_col=0)

pd_csv.head()

Unnamed: 0_level_0,First name,Last name,Score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,first_name1,last_name1,20
2,first_name2,last_name2,30
3,first_name3,last_name3,40
4,first_name4,last_name4,50
5,first_name5,last_name5,90


In [209]:
# example 3

# Renaming of one column.
# rename() returns a new DataFrame; df_csv itself keeps its original column names.

new_df = df_csv.rename(columns={'No': 'Number'})

new_df.head()

Unnamed: 0,Number,First name,Last name,Score
0,1,first_name1,last_name1,20
1,2,first_name2,last_name2,30
2,3,first_name3,last_name3,40
3,4,first_name4,last_name4,50
4,5,first_name5,last_name5,90


In [210]:
# example 4

# The mapper can also be a function: str.strip is applied to every column label
# (axis='columns') to remove surrounding whitespace. The labels in this file have
# none, so the displayed names are unchanged.
new_df = df_csv.rename(mapper=str.strip, axis='columns')

new_df.head()


Unnamed: 0,No,First name,Last name,Score
0,1,first_name1,last_name1,20
1,2,first_name2,last_name2,30
2,3,first_name3,last_name3,40
3,4,first_name4,last_name4,50
4,5,first_name5,last_name5,90


In [211]:
# The column labels of df_csv are unchanged by the rename() calls above.
df_csv.columns

Index(['No', 'First name', 'Last name', 'Score'], dtype='object')

In [212]:
# example 5

# Changing all of the column names to lower case.
# Unlike rename(), assigning to .columns changes df_csv itself.

cols = [name.lower().strip() for name in df_csv.columns]

df_csv.columns = cols

df_csv.head()

Unnamed: 0,no,first name,last name,score
0,1,first_name1,last_name1,20
1,2,first_name2,last_name2,30
2,3,first_name3,last_name3,40
3,4,first_name4,last_name4,50
4,5,first_name5,last_name5,90


#### Section 2.3

( Querying a DataFrame )

In [213]:
# example 1

# Load the CSV with its first column as the index, then normalise the column
# labels (lower-case, no surrounding whitespace) in the same expression.
df = (
    pd.read_csv('datasets/data.csv', index_col=0)
    .rename(columns=lambda label: label.lower().strip())
)

df.head()

Unnamed: 0_level_0,first name,last name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,first_name1,last_name1,20
2,first_name2,last_name2,30
3,first_name3,last_name3,40
4,first_name4,last_name4,50
5,first_name5,last_name5,90


In [214]:
# Boolean mask: True for every row whose score is strictly greater than 50.
admit_mask = df['score'].gt(50)

admit_mask

No
1     False
2     False
3     False
4     False
5      True
6     False
7      True
8      True
9      True
10    False
Name: score, dtype: bool

In [215]:
# where() keeps the original shape: rows where the mask is False are filled with NaN
# (which is also why 'score' is displayed as a float below).
df.where(admit_mask)

Unnamed: 0_level_0,first name,last name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,,,
2,,,
3,,,
4,,,
5,first_name5,last_name5,90.0
6,,,
7,first_name7,last_name7,80.0
8,first_name8,last_name8,75.0
9,first_name9,last_name9,60.0
10,,,


In [216]:
# dropna() removes the rows that where() filled with NaN, so only the rows
# matching the mask remain in the returned DataFrame.

df.where(admit_mask).dropna().head()

Unnamed: 0_level_0,first name,last name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,first_name5,last_name5,90.0
7,first_name7,last_name7,80.0
8,first_name8,last_name8,75.0
9,first_name9,last_name9,60.0


In [217]:
# Boolean indexing does the same filtering in one step; 'score' stays an integer
# here because no NaN values are introduced.
df[df['score'] > 50].head()

Unnamed: 0_level_0,first name,last name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,first_name5,last_name5,90
7,first_name7,last_name7,80
8,first_name8,last_name8,75
9,first_name9,last_name9,60


In [218]:
# Indexing with a list of column names returns a DataFrame with just those columns.
df[['first name', 'score']].head()

Unnamed: 0_level_0,first name,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1
1,first_name1,20
2,first_name2,30
3,first_name3,40
4,first_name4,50
5,first_name5,90


In [219]:
# example 2

# To combine boolean masks use '&' instead of 'and',
# e.g. (condition 1) & (condition 2), with each condition in parentheses.
# This cell builds a single mask with .gt(30), the method form of `> 30`.

df['score'].gt(30)

No
1     False
2     False
3      True
4      True
5      True
6     False
7      True
8      True
9      True
10     True
Name: score, dtype: bool

#### Section 2.4

( Indexing DataFrames )

In [258]:
# example 1

# Reload the CSV with its first column ('No') as the index; the column labels
# keep their original capitalisation here.
df = pd.read_csv('datasets/data.csv', index_col=0)

df.head()

Unnamed: 0_level_0,First name,Last name,Score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,first_name1,last_name1,20
2,first_name2,last_name2,30
3,first_name3,last_name3,40
4,first_name4,last_name4,50
5,first_name5,last_name5,90


In [259]:
# Keep the current index values as a regular column first, because set_index()
# replaces the existing index with the chosen column.
df['Serial Number'] = df.index

# 'Score' becomes the index and is no longer a regular column.
# NOTE(review): df is rebound here, so re-running this cell without re-running
# the load cell should fail -- 'Score' will no longer be found among the columns.
df = df.set_index('Score')

df.head()

Unnamed: 0_level_0,First name,Last name,Serial Number
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20,first_name1,last_name1,1
30,first_name2,last_name2,2
40,first_name3,last_name3,3
50,first_name4,last_name4,4
90,first_name5,last_name5,5


In [260]:
# example 2

# unique() returns the distinct values of the column as a NumPy array.
df['Serial Number'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)