In [9]:
import pandas as pd
import numpy as np

In [10]:
# Read the BRICS dataset from GitHub, using the first CSV column as the row index.
# `index_col` also accepts column label(s) instead of a position, e.g. index_col=["col0_name"].
BRICS_CSV_URL = "https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/refs/heads/main/Data/brics.csv"
brics = pd.read_csv(BRICS_CSV_URL, index_col=0)
brics

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.516,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Delhi,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


# Data Manipulations With Pandas

In [11]:
# df.head() returns the first few rows (5 by default)
print(brics.head(1), '\n')

# df.tail() returns the last few rows (5 by default)
print(brics.tail(1), '\n')

# df.info() shows information on each of the columns, such as the data type and the non-null count.
# It writes its report to stdout itself and returns None, so it is called directly:
# wrapping it in print() would append a stray "None" line after the report.
brics.info()
print()

# df.shape returns the number of rows and columns of the DataFrame as a tuple
print(brics.shape, '\n')

# df.describe() calculates a few summary statistics for each numerical column
print(brics.describe(), '\n')

# On a non-numerical column, describe() reports count, unique, top and freq instead
brics["country"].describe()

   country   capital   area  population
BR  Brazil  Brasilia  8.516       200.4 

         country   capital   area  population
SA  South Africa  Pretoria  1.221       52.98 

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, BR to SA
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     5 non-null      object 
 1   capital     5 non-null      object 
 2   area        5 non-null      float64
 3   population  5 non-null      float64
dtypes: float64(2), object(2)
memory usage: 200.0+ bytes
None 

(5, 4) 

            area   population
count   5.000000     5.000000
mean    7.944000   601.176000
std     6.200557   645.261454
min     1.221000    52.980000
25%     3.286000   143.500000
50%     8.516000   200.400000
75%     9.597000  1252.000000
max    17.100000  1357.000000 



count          5
unique         5
top       Brazil
freq           1
Name: country, dtype: object

In [12]:
# df.values: A two-dimensional NumPy array of values
print(brics.values, '\n')
# Equivalent to: `print(brics.to_numpy(), '\n')` → (recommended over `brics.values`)
# `.to_numpy(copy=True)` guarantees an array that is independent of the DataFrame's own data
# 	it also lets you request a specific `dtype` and fill missing values with `na_value`

# df.columns: column names
print(brics.columns, '\n')

# df.index: either row numbers or row names (row labels)
print(brics.index, '\n')

[['Brazil' 'Brasilia' 8.516 200.4]
 ['Russia' 'Moscow' 17.1 143.5]
 ['India' 'New Delhi' 3.286 1252.0]
 ['China' 'Beijing' 9.597 1357.0]
 ['South Africa' 'Pretoria' 1.221 52.98]] 

Index(['country', 'capital', 'area', 'population'], dtype='object') 

Index(['BR', 'RU', 'IN', 'CH', 'SA'], dtype='object') 



In [13]:
# A single column keeps its own dtype: float64 for `area`, object for the text column `country`
print(brics['area'].values.dtype)
print(brics['country'].values.dtype)
# In the whole-frame array each cell is an ordinary Python object (str for text, float for numbers)
print(type(brics.values[0, 0]))
print(type(brics.values[0, 3]))

brics.to_numpy() # OR: brics.values
# Mixed data types in a DataFrame result in an `object` array to accommodate all types.
# The `object` data type in NumPy allows the array to hold different types of data,
# but it gives up NumPy's compact typed storage and fast vectorized operations:
# each element is a reference to a separate Python object, which also costs more memory.

float64
object
<class 'str'>
<class 'float'>


array([['Brazil', 'Brasilia', 8.516, 200.4],
       ['Russia', 'Moscow', 17.1, 143.5],
       ['India', 'New Delhi', 3.286, 1252.0],
       ['China', 'Beijing', 9.597, 1357.0],
       ['South Africa', 'Pretoria', 1.221, 52.98]], dtype=object)

In [14]:
# Without an explicit dtype, NumPy picks one common type for all elements:
# here the floats are converted to strings (the result has dtype '<U32').
np.array(['Brazil', 'Brasilia', 8.516, 200.4])

array(['Brazil', 'Brasilia', '8.516', '200.4'], dtype='<U32')

In [15]:
np.array(['Brazil', 'Brasilia', 8.516, 200.4], dtype='object')
# With `dtype='object'` each element keeps its original Python type (str stays str, float stays float).
# This isn't preferred for numeric work: an object array holds references to Python objects,
# so it uses more memory and is slower than an array with a native numeric dtype.

array(['Brazil', 'Brasilia', 8.516, 200.4], dtype=object)

# Viewing Numerical & Non-numerical Columns

In [16]:
# Load the books dataset from GitHub; no index_col is given, so rows get the default integer index.
BOOKS_CSV_URL = "https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/refs/heads/main/Data/books.csv"
books = pd.read_csv(BOOKS_CSV_URL)
books.head()  # preview the first 5 rows

Unnamed: 0,name,author,rating,year,genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,2019,Childrens


In [17]:
# select_dtypes(include="number") keeps only the columns with a numeric dtype
# (here: rating and year) and leaves out all the others.
books.select_dtypes(include="number")

Unnamed: 0,rating,year
0,4.7,2016
1,4.6,2011
2,4.7,2018
3,4.7,2017
4,4.8,2019
...,...,...
345,4.4,2012
346,4.5,2012
347,4.2,2010
348,4.8,2013


In [18]:
# To view only the non-numerical columns, exclude the numeric dtypes rather than including "object".
# select_dtypes("object") only matches object-dtype columns, so it would miss other non-numeric
# dtypes (e.g. category, datetime, bool, or a dedicated string dtype), whereas exclude="number"
# returns exactly the columns that select_dtypes("number") leaves out.
books.select_dtypes(exclude="number")

Unnamed: 0,name,author,genre
0,10-Day Green Smoothie Cleanse,JJ Smith,Non Fiction
1,11/22/63: A Novel,Stephen King,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,Non Fiction
3,1984 (Signet Classics),George Orwell,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,Childrens
...,...,...,...
345,Wild: From Lost to Found on the Pacific Crest ...,Cheryl Strayed,Non Fiction
346,Winter of the World: Book Two of the Century T...,Ken Follett,Fiction
347,Women Food and God: An Unexpected Path to Almo...,Geneen Roth,Non Fiction
348,Wonder,R. J. Palacio,Fiction


# Number of Unique Values

In [19]:
# .unique() lists the distinct genres; .nunique() counts how many distinct genres there are
genres = books["genre"]
print(genres.unique())
print(genres.nunique())

['Non Fiction' 'Fiction' 'Childrens']
3


### `.pct_change()` calculates `(new - old) / old` between each element and the previous one in the Series

In [20]:
# Each result compares an element with the one before it; the first element
# has no predecessor, so its result is NaN.
s = pd.Series(data=[1, 2, 3, 5, 4])
s.pct_change()  # e.g., 1 -> 2 gives 1.0 (+100%), 5 -> 4 gives -0.2 (-20%)

0         NaN
1    1.000000
2    0.500000
3    0.666667
4   -0.200000
dtype: float64