# DataFrames

In [23]:
import numpy as np
import pandas as pd

In [24]:
# Build the example DataFrame from a column-name -> values mapping.
# The table contains two missing values (np.nan in Age and in Salary)
# and the first and last rows ("Alice") are identical.
data = dict(
    Name=["Alice", "Bob", "Charlie", "David", "Eve", "Alice"],
    Age=[25, 30, 35, np.nan, 29, 25],
    Department=["HR", "IT", "Finance", "IT", "HR", "HR"],
    Salary=[50000, 60000, 70000, 62000, np.nan, 50000],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0
2,Charlie,35.0,Finance,70000.0
3,David,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


# Read data from a DataFrame

In [25]:
# df.head(n) returns the first n rows; n defaults to 5 when omitted.
# Here n = 2, so only the rows with index 0 and 1 are shown.
print("First two rows:")
df.head(2)

First two rows:


Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0


In [26]:
# df.tail(n) returns the last n rows; n defaults to 5 when omitted.
# Here n = 3, so the rows with index 3, 4 and 5 are shown.
print("Display last 3 rows:")
df.tail(3)

Display last 3 rows:


Unnamed: 0,Name,Age,Department,Salary
3,David,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


# Indexing using iloc

In [27]:
# First row in the DataFrame, selected by integer position with iloc.
# A single integer returns that row as a Series whose index is the column names.
print("First row:")
df.iloc[0]

First row:


Name            Alice
Age              25.0
Department         HR
Salary        50000.0
Name: 0, dtype: object

In [28]:
# Positional slice: start position 2 is included, stop position 4 is excluded,
# so the rows at positions 2 and 3 are returned.
print("From row 2 to 4 (last index is not included):")
df.iloc[2:4]

From row 2 to 4 (last index is not included):


Unnamed: 0,Name,Age,Department,Salary
2,Charlie,35.0,Finance,70000.0
3,David,,IT,62000.0


In [29]:
# Reverse the row order of the DataFrame.
# A slice with step -1 walks the rows from last to first; each row keeps its
# original index label, so the index reads 5, 4, ..., 0.
print("Reverse Dataframe:")
df.iloc[::-1]

Reverse Dataframe:


Unnamed: 0,Name,Age,Department,Salary
5,Alice,25.0,HR,50000.0
4,Eve,29.0,HR,
3,David,,IT,62000.0
2,Charlie,35.0,Finance,70000.0
1,Bob,30.0,IT,60000.0
0,Alice,25.0,HR,50000.0


In [33]:
# iloc accepts a row selector and a column selector, both positional:
# rows at positions 1 and 2, and the first two columns (Name, Age).
print("Rows and Columns:")
df.iloc[1:3, :2]  # [row slice, column slice] -- stop positions are excluded

Rows and Columns:


Unnamed: 0,Name,Age
1,Bob,30.0
2,Charlie,35.0


# Indexing using loc

In [30]:
# First row, selected by index LABEL with loc (iloc selects by position).
# This DataFrame has the default 0..5 index, so label 0 is also the first row.
print("First row using loc")
df.loc[0]

First row using loc


Name            Alice
Age              25.0
Department         HR
Salary        50000.0
Name: 0, dtype: object

In [31]:
# From index label 3 to 5. Unlike iloc, a loc slice is label-based and
# INCLUDES the end label, so the rows labelled 3, 4 and 5 are all returned.
print("From index 3 to 5:")
df.loc[3:5]

From index 3 to 5:


Unnamed: 0,Name,Age,Department,Salary
3,David,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [32]:
# Fetch index labels 3 to 5 (end label included) and display only the
# Name and Salary columns, passed as a list of column labels.
print("Fetch index 3 to 5 and display only `Name` and `salary`:")
df.loc[3:5, ["Name", "Salary"]]

Fetch index 3 to 5 and display only `Name` and `salary`:


Unnamed: 0,Name,Salary
3,David,62000.0
4,Eve,
5,Alice,50000.0


# Selecting columns without iloc and loc

In [34]:
# Square brackets with a single column name return that column as a Series.
print("Access just one column:")
df["Name"]

Access just one column:


0      Alice
1        Bob
2    Charlie
3      David
4        Eve
5      Alice
Name: Name, dtype: object

In [35]:
# A list of column names inside the brackets (note the double brackets)
# returns a DataFrame containing only those columns.
print("Access multiple columns:")
df[["Name", "Salary"]]

Access multiple columns:


Unnamed: 0,Name,Salary
0,Alice,50000.0
1,Bob,60000.0
2,Charlie,70000.0
3,David,62000.0
4,Eve,
5,Alice,50000.0


# Drop Columns

In [37]:
# Drop the Age column

# NOTE: axis=0 refers to rows
#       axis=1 refers to columns
# Age is a column, so we must pass axis=1 to drop it ==> df.drop(column_name, axis, inplace)
# `inplace` controls whether the original DataFrame is modified. Its default value is `False`: drop() then returns a new DataFrame and leaves `df` unchanged, which is why Age still appears in the cells that follow.

print("Drop Age column temporarily:")
df.drop("Age", axis=1)

Drop Age column temporarily:


Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000.0
1,Bob,IT,60000.0
2,Charlie,Finance,70000.0
3,David,IT,62000.0
4,Eve,HR,
5,Alice,HR,50000.0


# Other important functions and attributes

In [38]:
# shape is an attribute (no parentheses): a (number of rows, number of columns) tuple.
print("Shape of the DataSet:", df.shape)

Shape of the DataSet: (6, 4)


In [39]:
# info() prints a summary of the DataFrame: the index range, each column's
# non-null count and dtype, and the memory usage.
# The non-null counts reveal the missing values (5 of 6 for Age and Salary).
print("Information about the DataFrame:")
df.info()

Information about the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        6 non-null      object 
 1   Age         5 non-null      float64
 2   Department  6 non-null      object 
 3   Salary      5 non-null      float64
dtypes: float64(2), object(2)
memory usage: 324.0+ bytes


In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns Age and Salary. Missing values are not counted: count is 5, not 6.
df.describe()

Unnamed: 0,Age,Salary
count,5.0,5.0
mean,28.8,58400.0
std,4.147288,8532.291603
min,25.0,50000.0
25%,25.0,50000.0
50%,29.0,60000.0
75%,30.0,62000.0
max,35.0,70000.0
