# DataFrame Basic Functionalities

In [1]:
# Uncomment the lines below to install the required packages
# !pip install numpy
# !pip install pandas

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Column data for the example table: seven students, with one missing Age (NaN).
# NOTE(review): the variable name `dict` shadows Python's built-in `dict` type;
# it is kept because the DataFrame-construction cell reads this exact name.
dict = {
    column: pd.Series(values)
    for column, values in (
        ('Name', ['John', 'Anna', 'Tom', 'Mary', 'Steve', 'Peter', 'Joe']),
        ('Age', [20, np.nan, 22, 22, 20, 23, 18]),
        ('GPA', [3.3, 3.6, 3.5, 3.7, 3.0, 2.8, 2.95]),
    )
}

In [4]:
# Create a DataFrame: each dictionary key becomes a column name
df = pd.DataFrame(dict)
# show only the first 3 rows
display(df.head(3))

Unnamed: 0,Name,Age,GPA
0,John,20.0,3.3
1,Anna,,3.6
2,Tom,22.0,3.5


In [5]:
# showing the last 3 rows (the original row labels 4, 5, 6 are preserved)
df.tail(3)

Unnamed: 0,Name,Age,GPA
4,Steve,20.0,3.0
5,Peter,23.0,2.8
6,Joe,18.0,2.95


In [6]:
# getting 3 random rows; random_state fixes the draw so that re-running the
# notebook selects the same rows every time
df.sample(3, random_state=42)

Unnamed: 0,Name,Age,GPA
2,Tom,22.0,3.5
3,Mary,22.0,3.7
0,John,20.0,3.3


In [7]:
# dimensions as a (number of rows, number of columns) tuple
df.shape

(7, 3)

`df.T`: transpose rows and columns

In [8]:
# the row labels 0-6 become the column headers, and the column names become the row labels
df.T

Unnamed: 0,0,1,2,3,4,5,6
Name,John,Anna,Tom,Mary,Steve,Peter,Joe
Age,20.0,,22.0,22.0,20.0,23.0,18.0
GPA,3.3,3.6,3.5,3.7,3.0,2.8,2.95


`df.axes`: return a list with the row axis labels and column axis labels as the only members.

In [9]:
# show the whole frame so its labels can be compared with the axes listed below
display(df)
# first element: row axis labels; second element: column axis labels
df.axes

Unnamed: 0,Name,Age,GPA
0,John,20.0,3.3
1,Anna,,3.6
2,Tom,22.0,3.5
3,Mary,22.0,3.7
4,Steve,20.0,3.0
5,Peter,23.0,2.8
6,Joe,18.0,2.95


[RangeIndex(start=0, stop=7, step=1),
 Index(['Name', 'Age', 'GPA'], dtype='object')]

`df.index`: return the indices

In [10]:
# no explicit index has been set so far, so this is the default 0..6 RangeIndex
df.index

RangeIndex(start=0, stop=7, step=1)

In [11]:
# using this approach to modify the indices: assigning to df.index replaces
# the row labels of df itself (one new label for each of the 7 rows)
df.index = ['a','b','c','d','e','f','g']
display(df)

Unnamed: 0,Name,Age,GPA
a,John,20.0,3.3
b,Anna,,3.6
c,Tom,22.0,3.5
d,Mary,22.0,3.7
e,Steve,20.0,3.0
f,Peter,23.0,2.8
g,Joe,18.0,2.95


`df.set_index()`: set an existing column or set of columns as row index

In [12]:
# Setting Age as index; the result is only displayed, not assigned back to df.
# The missing Age value shows up as a missing index label.
df.set_index('Age')

Unnamed: 0_level_0,Name,GPA
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
20.0,John,3.3
,Anna,3.6
22.0,Tom,3.5
22.0,Mary,3.7
20.0,Steve,3.0
23.0,Peter,2.8
18.0,Joe,2.95


In [13]:
# Setting Name and Age as index: passing a list of columns yields a two-level row index
df.set_index(['Name','Age'])

Unnamed: 0_level_0,Unnamed: 1_level_0,GPA
Name,Age,Unnamed: 2_level_1
John,20.0,3.3
Anna,,3.6
Tom,22.0,3.5
Mary,22.0,3.7
Steve,20.0,3.0
Peter,23.0,2.8
Joe,18.0,2.95


Because we did not use `inplace=True` (and did not assign the result back to `df`), `df` remains unchanged.

In [14]:
# df still has the 'a'..'g' labels and all three columns
df

Unnamed: 0,Name,Age,GPA
a,John,20.0,3.3
b,Anna,,3.6
c,Tom,22.0,3.5
d,Mary,22.0,3.7
e,Steve,20.0,3.0
f,Peter,23.0,2.8
g,Joe,18.0,2.95


`df.reset_index()`: reset the index of the DataFrame, and use the default one instead.

In [15]:
# the old labels 'a'..'g' are moved into a new column named 'index';
# the result is not assigned, so df itself keeps its labels
df.reset_index()

Unnamed: 0,index,Name,Age,GPA
0,a,John,20.0,3.3
1,b,Anna,,3.6
2,c,Tom,22.0,3.5
3,d,Mary,22.0,3.7
4,e,Steve,20.0,3.0
5,f,Peter,23.0,2.8
6,g,Joe,18.0,2.95


In [16]:
# avoiding the index column: drop=True discards the old labels instead of
# adding them as a new column. The result is assigned back to df rather than
# using inplace=True, which keeps the statement chainable and makes the
# rebinding of df explicit.
df = df.reset_index(drop=True)
df

Unnamed: 0,Name,Age,GPA
0,John,20.0,3.3
1,Anna,,3.6
2,Tom,22.0,3.5
3,Mary,22.0,3.7
4,Steve,20.0,3.0
5,Peter,23.0,2.8
6,Joe,18.0,2.95


`df.columns`: return the column labels (as an `Index` object)

In [17]:
# the column labels are returned as a pandas Index object
df.columns

Index(['Name', 'Age', 'GPA'], dtype='object')

`df.dtypes`: return the data type of each column.

In [18]:
# Name holds strings (object dtype); Age and GPA are floating point
df.dtypes

Name     object
Age     float64
GPA     float64
dtype: object

`df.empty`: return a Boolean indicating whether the DataFrame is empty; `True` means that it contains no elements.

In [19]:
# False here: df holds 7 rows of data
df.empty

False

`df.size`: return the number of elements in the DataFrame (size = rows x columns)

In [20]:
# 7 rows x 3 columns = 21 elements
df.size

21

In [21]:
# for comparison with df.size: the product of these two numbers is the size
df.shape

(7, 3)

`df.values`: return the actual data in the DataFrame as an ndarray.

In [22]:
# the columns mix strings and floats, so the resulting array has dtype=object
v = df.values
v

array([['John', 20.0, 3.3],
       ['Anna', nan, 3.6],
       ['Tom', 22.0, 3.5],
       ['Mary', 22.0, 3.7],
       ['Steve', 20.0, 3.0],
       ['Peter', 23.0, 2.8],
       ['Joe', 18.0, 2.95]], dtype=object)

In [23]:
# confirm that .values produced a NumPy array
type(v)

numpy.ndarray

In [24]:
# collapse the 2-D array into a 1-D array, row after row
v.flatten()

array(['John', 20.0, 3.3, 'Anna', nan, 3.6, 'Tom', 22.0, 3.5, 'Mary',
       22.0, 3.7, 'Steve', 20.0, 3.0, 'Peter', 23.0, 2.8, 'Joe', 18.0,
       2.95], dtype=object)

`value_counts()`: counts the number of occurrences of each distinct value in a column. By default, missing values are not counted.

In [25]:
# occurrences of each distinct Age; the missing value (NaN) is left out
df.Age.value_counts()

20.0    2
22.0    2
23.0    1
18.0    1
Name: Age, dtype: int64

In [26]:
# considering missing values: dropna=False adds a NaN row to the counts
df.Age.value_counts(dropna=False)

20.0    2
22.0    2
NaN     1
23.0    1
18.0    1
Name: Age, dtype: int64

## Indexing and selecting data

The Python and NumPy indexing operators **`[ ]`** and attribute operator **`.`** provide quick and easy access to Pandas data structures across a wide range of use cases. However, since the type of the data to be accessed isn’t known in advance, directly using standard operators has some optimization limits. 

The `iloc` indexer locates rows by their integer position, regardless of what their labels are. In contrast, the `loc` indexer selects data based on the labels in the DataFrame.

`iloc`: purely integer-position-based indexing. Remember that positions are 0-based. It selects by position using an integer, a list of integers, or an integer slice (the START point is INCLUDED, the END point is EXCLUDED).

In [27]:
# put the letter labels back (they were discarded by reset_index(drop=True) earlier)
# so that label-based and position-based selection can be told apart
df.index = ['a','b','c','d','e','f','g']
display(df)

Unnamed: 0,Name,Age,GPA
a,John,20.0,3.3
b,Anna,,3.6
c,Tom,22.0,3.5
d,Mary,22.0,3.7
e,Steve,20.0,3.0
f,Peter,23.0,2.8
g,Joe,18.0,2.95


In [28]:
# Select a column: a single column name returns a Series
df['Age']

a    20.0
b     NaN
c    22.0
d    22.0
e    20.0
f    23.0
g    18.0
Name: Age, dtype: float64

In [29]:
# Selecting more than one column: a list of column names returns a DataFrame
df[['Age','GPA']]

Unnamed: 0,Age,GPA
a,20.0,3.3
b,,3.6
c,22.0,3.5
d,22.0,3.7
e,20.0,3.0
f,23.0,2.8
g,18.0,2.95


In [30]:
# Using iloc: ':' keeps every row, [1,2] picks the columns at positions 1 and 2 (Age, GPA)
df.iloc[:,[1,2]]

Unnamed: 0,Age,GPA
a,20.0,3.3
b,,3.6
c,22.0,3.5
d,22.0,3.7
e,20.0,3.0
f,23.0,2.8
g,18.0,2.95


In [31]:
# Selecting a few rows: a label slice; the end label 'd' is included in the result
df[:'d']

Unnamed: 0,Name,Age,GPA
a,John,20.0,3.3
b,Anna,,3.6
c,Tom,22.0,3.5
d,Mary,22.0,3.7


In [32]:
# Selecting a few rows using loc: label-based slice, end label 'd' included
df.loc[:'d']

Unnamed: 0,Name,Age,GPA
a,John,20.0,3.3
b,Anna,,3.6
c,Tom,22.0,3.5
d,Mary,22.0,3.7


In [33]:
# Selecting a few rows using iloc: positions 0-3 (end position 4 is excluded)
df.iloc[:4]

Unnamed: 0,Name,Age,GPA
a,John,20.0,3.3
b,Anna,,3.6
c,Tom,22.0,3.5
d,Mary,22.0,3.7


In [34]:
# Selecting a few rows and columns: first slice the first 3 rows by position,
# then pick the two columns from that result
df[:3][['Age','GPA']]

Unnamed: 0,Age,GPA
a,20.0,3.3
b,,3.6
c,22.0,3.5


In [35]:
# Selecting a few rows and columns using loc: rows and columns are given
# together in one .loc[rows, columns] call, which avoids chained indexing
# (indexing the intermediate result a second time)
df.loc[:'c', ['Age','GPA']]

Unnamed: 0,Age,GPA
a,20.0,3.3
b,,3.6
c,22.0,3.5


In [36]:
# Selecting a few rows and columns using iloc: rows at positions 0-2,
# columns at positions 1 and 2, in a single indexing call
df.iloc[:3,[1,2]]

Unnamed: 0,Age,GPA
a,20.0,3.3
b,,3.6
c,22.0,3.5


Reference:
- VanderPlas, J. (2017) Python Data Science Handbook: Essential Tools for Working with Data. USA: O’Reilly Media, Inc. chapter 3