<a href="https://colab.research.google.com/github/Misudhari/Applied-Data-Science-with-Python/blob/main/Dataframes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DataFrames

In [1]:
# A DataFrame is a two-dimensional data structure; each of its columns is a Series.
# It consists of an index and multiple columns, with each column having a label.

import pandas as pd

In [2]:
# Let's create three school records, each holding a student's name, class and score.
# Every record is a Series whose labels are 'Name', 'Class' and 'Score'.
record1 = pd.Series(dict(Name='Alice', Class='Physics', Score=85))

record2 = pd.Series(dict(Name='Jack', Class='Chemistry', Score=82))

record3 = pd.Series(dict(Name='Helen', Class='Biology', Score=90))

In [3]:
# Let's build a DataFrame from the three records, giving each row a school label
school_records = [record1, record2, record3]
df = pd.DataFrame(data=school_records, index=['school1', 'school2', 'school1'])

# Let's look at the resulting DataFrame using head()
df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [10]:
# Alternatively, describe each row as a dictionary and pass the whole list to DataFrame()

Students = [
    dict(Name='Alice', Class='Physics', Score=85),
    dict(Name='Jack', Class='Chemistry', Score=82),
    dict(Name='Helen', Class='Biology', Score=90),
]

# One index label per dictionary, in order; 'School1' appears twice, so the
# index is not unique
df = pd.DataFrame(data=Students, index=['School1', 'School2', 'School1'])
df.head()

Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School2,Jack,Chemistry,82
School1,Helen,Biology,90


In [11]:
# Similar to a Series, we can also extract data using loc and iloc.
# Here .loc selects the row labelled 'School2'.
df.loc['School2']

Name          Jack
Class    Chemistry
Score           82
Name: School2, dtype: object

In [12]:
# 'School2' labels exactly one row, so the selection comes back as a Series
type(df.loc['School2'])

pandas.core.series.Series

In [13]:
# If the row label is not unique, the same operation returns all matching rows
# as a new DataFrame instead of a single Series
df.loc['School1']

Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School1,Helen,Biology,90


In [14]:
# 'School1' labels two rows, so the selection comes back as a DataFrame
type(df.loc['School1'])

pandas.core.frame.DataFrame

In [15]:
# If we are only interested in the School1 student names, pass the column
# label as the second argument to .loc

df.loc['School1', 'Name']

School1    Alice
School1    Helen
Name: Name, dtype: object

In [16]:
# We can transpose the DataFrame. This pivots all the rows into columns and the columns into rows.
# Use the T attribute.
df.T

Unnamed: 0,School1,School2,School1.1
Name,Alice,Jack,Helen
Class,Physics,Chemistry,Biology
Score,85,82,90


In [17]:
# After transposing, 'Name' is a row label, so .loc can select the student names
df.T.loc['Name']

School1    Alice
School2     Jack
School1    Helen
Name: Name, dtype: object

In [18]:
# However, since iloc and loc are used for row selection, pandas reserves the
# indexing operator ([]) directly on the DataFrame for column selection. In a pandas
# DataFrame a column always has a name, so its selection is always label based.

df['Name']

School1    Alice
School2     Jack
School1    Helen
Name: Name, dtype: object

In [19]:
# This means that if we try to use .loc with a column name, it raises a KeyError:
# .loc looks the label up among the row labels, and 'Name' is a column label.
# The error is caught and printed so that "Run All" does not stop at this cell.
try:
    df.loc['Name']
except KeyError as err:
    print("KeyError:", err)

KeyError: ignored

In [21]:
# Note that the result of a single-column projection is a Series object
type(df['Name'])

pandas.core.series.Series

In [23]:
# Since the result of using the indexing operator is either a DataFrame or a
# Series, we can chain operations together.
# We can select all the rows related to School1 using .loc and then project the
# Name column from just those rows.
df.loc['School1']['Name']

School1    Alice
School1    Helen
Name: Name, dtype: object

In [24]:
# Look the School1 rows up once, then inspect the type at each step of the chain
school1_rows = df.loc['School1']
print(type(school1_rows))  # the rows for a repeated label: a DataFrame
print(type(school1_rows['Name']))  # one column of those rows: a Series

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [25]:
# This kind of chaining can come at some cost and is best avoided if you can
# use another approach.
# In particular, chaining can cause pandas to return a copy of the DataFrame
# instead of a view on the DataFrame.
# If you are changing data, avoid this method: the change may be applied to
# the temporary copy and never reach the original DataFrame.

In [31]:
# .loc does row selection and accepts two indexers: the row selection first,
# then the list of column names.
# .loc also supports slicing, so a single call can do the same job as the
# chained lookup.

name_and_score = df.loc[:, ['Name', 'Score']]
print(type(name_and_score))
name_and_score

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Name,Score
School1,Alice,85
School2,Jack,82
School1,Helen,90


In [28]:
# The ':' means we want to get all of the rows, and the list in the second
# argument names the columns we want to get back.

In [29]:
# Key points to remember

# Rows and columns are just for our benefit. Underneath, this is just a two-axis
# labeled array.

# It is easy to transpose the matrix.

# Use chaining very carefully.

In [32]:
# Use the drop() function to delete rows.
# It doesn't change the DataFrame itself;
# it returns a copy of the DataFrame with the given rows removed.
df.drop('School1') 

Unnamed: 0,Name,Class,Score
School2,Jack,Chemistry,82


In [33]:
# Display df again: drop() returned a copy, so df itself still has every row
df

Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School2,Jack,Chemistry,82
School1,Helen,Biology,90


In [37]:
# drop() has two interesting optional arguments.
# 1st - inplace:
# if inplace=True, it updates the DataFrame itself instead of returning a copy.

# 2nd - axis: the axis to drop from; the default, 0, indicates the row axis.

# Work on a copy so that df itself is left untouched by the drops below
copy_df = df.copy()
copy_df

Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School2,Jack,Chemistry,82
School1,Helen,Biology,90


In [38]:
# axis=1 targets the columns, so this removes the 'Name' column;
# inplace=True modifies copy_df itself instead of returning a new DataFrame
copy_df.drop("Name", inplace=True, axis = 1)
copy_df

Unnamed: 0,Class,Score
School1,Physics,85
School2,Chemistry,82
School1,Biology,90


In [39]:
# Another way is to use del, but this deletes the column directly from the
# DataFrame and does not return a copy
del copy_df['Class']

In [40]:
# After dropping 'Name' and deleting 'Class', only the 'Score' column is left
copy_df

Unnamed: 0,Score
School1,85
School2,82
School1,90


In [41]:
# Adding a new column is easy: assign a value to a new column label.
# Assigning the single value None gives every row None in that column.
df['ClassRanking'] = None
df

Unnamed: 0,Name,Class,Score,ClassRanking
School1,Alice,Physics,85,
School2,Jack,Chemistry,82,
School1,Helen,Biology,90,
