# DataFrame Basic Functionalities

In [1]:
# Optional: uncomment the lines below to install the dependencies if they are missing.
# Prefer `%pip install ...` over `!pip install ...` so the packages are installed
# into the environment of the running kernel.
# !pip install numpy
# !pip install pandas

In [2]:
import numpy as np
import pandas as pd

In [3]:
dict = {'Name':pd.Series(['John','Anna','Tom','Mary','Steve','Peter','Joe']),
        'Age' :pd.Series([20,np.nan,22,22,20,23,18]),
        'GPA' :pd.Series([3.3,3.6,3.5,3.7,3.0,2.8,2.95])}

In [4]:
# Create a DataFrame
df = pd.DataFrame(dict)
display(df.head(3))

Unnamed: 0,Name,Age,GPA
0,John,20.0,3.3
1,Anna,,3.6
2,Tom,22.0,3.5


In [5]:
# last 3 rows of the DataFrame
df.tail(3)

Unnamed: 0,Name,Age,GPA
4,Steve,20.0,3.0
5,Peter,23.0,2.8
6,Joe,18.0,2.95


In [6]:
# getting 3 random rows; a fixed random_state makes the selection reproducible
# when the notebook is re-run (the rows picked may differ from an unseeded run)
df.sample(3, random_state=42)

Unnamed: 0,Name,Age,GPA
2,Tom,22.0,3.5
3,Mary,22.0,3.7
1,Anna,,3.6


In [7]:
# (number of rows, number of columns)
df.shape

(7, 3)

`df.T`: transpose rows and columns

In [8]:
# transposed view: the column names become row labels and the row labels become columns
df.T

Unnamed: 0,0,1,2,3,4,5,6
Name,John,Anna,Tom,Mary,Steve,Peter,Joe
Age,20.0,,22.0,22.0,20.0,23.0,18.0
GPA,3.3,3.6,3.5,3.7,3.0,2.8,2.95


In [9]:
# the original DataFrame is unchanged after `df.T`
df

Unnamed: 0,Name,Age,GPA
0,John,20.0,3.3
1,Anna,,3.6
2,Tom,22.0,3.5
3,Mary,22.0,3.7
4,Steve,20.0,3.0
5,Peter,23.0,2.8
6,Joe,18.0,2.95


`df.axes`: return a list with the row axis labels and column axis labels as the only members.

In [10]:
# [row index, column index]
df.axes

[RangeIndex(start=0, stop=7, step=1),
 Index(['Name', 'Age', 'GPA'], dtype='object')]

`df.index`: return the row index (the row labels) of the DataFrame

In [11]:
# default integer row labels: 0 up to (but not including) 7
df.index

RangeIndex(start=0, stop=7, step=1)

In [12]:
# Assigning to `df.index` replaces the row labels on `df` itself;
# one label per row is supplied ('a' through 'g' for the 7 rows).
df.index = list('abcdefg')
display(df)

Unnamed: 0,Name,Age,GPA
a,John,20.0,3.3
b,Anna,,3.6
c,Tom,22.0,3.5
d,Mary,22.0,3.7
e,Steve,20.0,3.0
f,Peter,23.0,2.8
g,Joe,18.0,2.95


`df.set_index()`: set an existing column or set of columns as row index

In [13]:
# Setting Age as index; the result is only displayed, `df` itself is not modified
df.set_index('Age')

Unnamed: 0_level_0,Name,GPA
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
20.0,John,3.3
,Anna,3.6
22.0,Tom,3.5
22.0,Mary,3.7
20.0,Steve,3.0
23.0,Peter,2.8
18.0,Joe,2.95


In [14]:
# Setting Name and Age together as a two-level index
df.set_index(['Name','Age'])

Unnamed: 0_level_0,Unnamed: 1_level_0,GPA
Name,Age,Unnamed: 2_level_1
John,20.0,3.3
Anna,,3.6
Tom,22.0,3.5
Mary,22.0,3.7
Steve,20.0,3.0
Peter,23.0,2.8
Joe,18.0,2.95


We did not use `inplace=True`, so `set_index()` returned a new DataFrame each time and `df` itself is unchanged.

In [15]:
# `df` still has the 'a'..'g' index and all three columns
df

Unnamed: 0,Name,Age,GPA
a,John,20.0,3.3
b,Anna,,3.6
c,Tom,22.0,3.5
d,Mary,22.0,3.7
e,Steve,20.0,3.0
f,Peter,23.0,2.8
g,Joe,18.0,2.95


`df.reset_index()`: reset the index of the DataFrame, and use the default one instead.

In [16]:
# the old row labels are kept as a new column named 'index'
df.reset_index()

Unnamed: 0,index,Name,Age,GPA
0,a,John,20.0,3.3
1,b,Anna,,3.6
2,c,Tom,22.0,3.5
3,d,Mary,22.0,3.7
4,e,Steve,20.0,3.0
5,f,Peter,23.0,2.8
6,g,Joe,18.0,2.95


In [17]:
# Discard the old row labels instead of keeping them as an 'index' column.
# The result is re-assigned to `df` (rather than using inplace=True) so the
# state change is explicit and the call stays chainable.
df = df.reset_index(drop=True)

In [18]:
# verifying: the row labels are 0..6 again and no 'index' column was added
df

Unnamed: 0,Name,Age,GPA
0,John,20.0,3.3
1,Anna,,3.6
2,Tom,22.0,3.5
3,Mary,22.0,3.7
4,Steve,20.0,3.0
5,Peter,23.0,2.8
6,Joe,18.0,2.95


`df.columns`: return the column labels of the DataFrame (as an `Index` object)

In [19]:
# column labels, returned as a pandas Index
df.columns

Index(['Name', 'Age', 'GPA'], dtype='object')

In [20]:
# getting the label (name) of the first column, not the column's data
df.columns[0]

'Name'

`df.dtypes`: return the data type of each column.

In [21]:
# one dtype per column
df.dtypes

Name     object
Age     float64
GPA     float64
dtype: object

`df.empty`: return a Boolean indicating whether the DataFrame is empty; `True` means that it is empty.

In [22]:
# `df` holds data, so this is False
df.empty

False

In [23]:
# a DataFrame constructed without any data is empty
df1 = pd.DataFrame()
df1.empty

True

`df.size`: return the number of elements in the DataFrame (size = rows x columns)

In [24]:
# total number of cells: 7 rows x 3 columns = 21
df.size

21

Notice that `df` has 7 rows and 3 columns (7 x 3 = 21)

In [25]:
# (rows, columns); their product equals df.size
df.shape

(7, 3)

df1 is an empty DataFrame. It has 0 rows and 0 columns.

In [26]:
# the empty DataFrame has no elements
df1.size

0

`df.values`: return the actual data in the DataFrame as an ndarray.

In [27]:
# The columns mix strings and floats, so the resulting array has dtype=object.
# NOTE(review): the pandas documentation recommends `df.to_numpy()` over `.values`.
v = df.values
v

array([['John', 20.0, 3.3],
       ['Anna', nan, 3.6],
       ['Tom', 22.0, 3.5],
       ['Mary', 22.0, 3.7],
       ['Steve', 20.0, 3.0],
       ['Peter', 23.0, 2.8],
       ['Joe', 18.0, 2.95]], dtype=object)

In [28]:
# `.values` gives a NumPy array, not a pandas object
type(v)

numpy.ndarray

In [29]:
# len() of the 2-D array counts its rows (7)
len(v)

7

In [30]:
# the first row of the DataFrame as a 1-D array
v[0]

array(['John', 20.0, 3.3], dtype=object)

In [31]:
# all values in a single 1-D array, row after row
v.flatten()

array(['John', 20.0, 3.3, 'Anna', nan, 3.6, 'Tom', 22.0, 3.5, 'Mary',
       22.0, 3.7, 'Steve', 20.0, 3.0, 'Peter', 23.0, 2.8, 'Joe', 18.0,
       2.95], dtype=object)

In [32]:
# 7 rows x 3 columns = 21 values, the same number as df.size
len(v.flatten())

21

`value_counts()`: it counts the number of occurrences of each distinct value in a column. By default, missing values are not counted.

In [33]:
# number of occurrences of each distinct Age (the missing value is not counted)
df['Age'].value_counts()

20.0    2
22.0    2
23.0    1
18.0    1
Name: Age, dtype: int64

In [34]:
# considering missing values: NaN gets its own row in the counts
df['Age'].value_counts(dropna=False)

20.0    2
22.0    2
NaN     1
23.0    1
18.0    1
Name: Age, dtype: int64

Reference:
- VanderPlas, J. (2017) Python Data Science Handbook: Essential Tools for Working with Data. USA: O’Reilly Media, Inc. chapter 3