# Pandas

<img src="https://raw.githubusercontent.com/fralfaro/DS-Cheat-Sheets/main/docs/examples/pandas/pandas.png" alt="pandas logo" width="300">

[Pandas](https://pandas.pydata.org/) is built on NumPy and provides easy-to-use
data structures and data analysis tools for the Python
programming language.

## Install and import Pandas

```bash
$ pip install pandas
```

In [None]:
# Import Pandas convention
import pandas as pd

## Pandas Data Structures

**Series**

<img src="https://raw.githubusercontent.com/fralfaro/DS-Cheat-Sheets/main/docs/examples/pandas/serie.png" alt="pandas Series" >

A **one-dimensional** labeled array capable of holding any data type.

In [None]:
# Import pandas
import pandas as pd

# Monthly sales figures keyed by month label; the dict keys become the
# Series index and keep their insertion order
monthly_sales = {
    'jan': 1500, 'feb': 1200, 'mar': 1800, 'apr': 1600, 'may': 1300,
    'jun': 1700, 'jul': 1400, 'aug': 1500, 'sep': 1600, 'oct': 1800,
}
sales_data = pd.Series(monthly_sales)

# Show a heading followed by the Series itself
print("Monthly Sales Data:", sales_data, sep="\n")

**DataFrame**

<img src="https://raw.githubusercontent.com/fralfaro/DS-Cheat-Sheets/main/docs/examples/pandas/df.png" alt="pandas DataFrame" >

A **two-dimensional** labeled data structure with columns of potentially different types.

In [None]:
# Build a ten-row example DataFrame, one named list per column
column_order = ['country', 'capital', 'population', 'GDP']

countries = ['United States', 'China', 'Japan', 'Germany', 'United Kingdom', 'India', 'France', 'Italy', 'Brazil', 'Canada']
capitals = ['Washington, D.C.', 'Beijing', 'Tokyo', 'Berlin', 'London', 'New Delhi', 'Paris', 'Rome', 'Brasília', 'Ottawa']
populations = [331449281, 1393000000, 126476461, 83783945, 67886011, 1303171035, 67186600, 60277900, 211050000, 37742154]
gdp_values = [21.44, 14.34, 5.07, 4.01, 2.99, 3.11, 2.78, 2.15, 1.77, 1.73]

# Map each column label to its values, then fix the column order explicitly
data = dict(zip(column_order, (countries, capitals, populations, gdp_values)))
df = pd.DataFrame(data, columns=column_order)

# Print a heading, then leave `df` as the last expression so the notebook
# renders it as the cell output
print("\ndf:")
df

## Getting Elements


In [None]:
# Get one element from a Series by its index label
sales_data['jan']

# Another way to do it: attribute-style access. This only works when the
# label is a valid Python identifier that does not clash with an existing
# Series attribute or method.
sales_data.jan

In [None]:
# Get a subset of a DataFrame: a plain slice selects rows by position,
# so this returns every row except the first one
df[1:]

## Selecting, Boolean Indexing & Setting


In [None]:
# Select a single value by integer row position and column position.
# Scalar indexers return the value itself; list indexers such as
# df.iloc[[0], [0]] would return a 1x1 DataFrame instead.
df.iloc[0, 0]
# Output: 'United States'

In [None]:
# Select a single value by row label and column label.
# Scalar labels return the value itself; list labels such as
# df.loc[[0], ['country']] would return a 1x1 DataFrame instead.
df.loc[0, 'country']
# Output: 'United States'

In [None]:
# Select a single row by its index label (the row comes back as a Series
# whose index is the column names)
df.loc[2]
# print(type(df.loc[2]))
# Output:
# country           Japan
# capital           Tokyo
# population    126476461
# GDP                5.07

In [None]:
# Select a single column by label, keeping every row (returns a Series)
df.loc[:,'capital']
# Equivalent shorthand: df['capital'] (column labels are case-sensitive)

# Output (first rows):
# 0    Washington, D.C.
# 1             Beijing
# 2               Tokyo

In [None]:
# Boolean indexing - entries of sales_data whose value is NOT > 1
# (`~` inverts the boolean mask; every sales figure here exceeds 1,
# so the result is an empty Series)
sales_data[~(sales_data > 1)]

In [None]:
# Boolean indexing - entries of sales_data where value is < 1300 or > 1700.
# Masks are combined with `|` (element-wise OR); each comparison needs its
# own parentheses because `|` binds tighter than `<` and `>`.
sales_data[(sales_data < 1300) | (sales_data > 1700)]

In [None]:
# Use a boolean filter to keep only the rows whose population exceeds
# 1.2 billion (column labels are case-sensitive: the column is 'population')
df[df['population'] > 1200000000]

In [None]:
# Set the value stored under index label 'may' to 60000
# (assignment modifies sales_data in place)
sales_data['may'] = 60000
sales_data

## Dropping


In [None]:
# Drop values from rows (axis=0): remove the entries labelled 'may' and 'mar'.
# The result is not assigned, so sales_data itself is left unchanged.
sales_data.drop(['may', 'mar'])

In [None]:
# Drop values from columns (axis=1): remove the 'country' column.
# The result is not assigned, so df itself is left unchanged.
df.drop('country', axis=1)

## Sort & Rank


In [None]:
# Sort by labels along an axis (here: the row index, ascending by default)
df.sort_index()

In [None]:
# Sort by the values along an axis: order the rows by the 'country' column
df.sort_values(by='country')

In [None]:
# Assign ranks to entries (with the defaults, ranks are computed
# separately within each column)
df.rank()

## Applying Functions


In [None]:
# Define a function
def f(x):
    """Return ``x`` doubled (``x * 2``).

    Works for anything that supports ``* 2``: numbers are doubled, while
    strings and other sequences are repeated twice.
    """
    return x * 2

In [None]:
# Apply function to DataFrame: with the default axis, `f` receives each
# column as a whole Series
df.apply(f)

In [None]:
# Apply function element-wise.
# DataFrame.applymap has been deprecated since pandas 2.1 in favour of
# DataFrame.map, so use `map` when this pandas version provides it and
# fall back to `applymap` on older versions.
elementwise = df.map if hasattr(df, "map") else df.applymap
elementwise(f)

In [None]:

# Apply a function to a single column: upper-case every country name and
# write the result back into the same column (this modifies df)
df["country"] = df["country"].apply(lambda x: x.upper())

df

## TQDM with pandas

In [None]:
import time
def placeholder_function(x, delay=0.5):
    """Return ``x`` upper-cased after pausing for ``delay`` seconds.

    The pause stands in for slow per-item work so that a progress bar has
    something visible to report.

    Args:
        x: Value to transform; it must provide an ``upper()`` method.
        delay: Seconds to sleep before returning. Defaults to 0.5.

    Returns:
        The result of ``x.upper()``.
    """
    time.sleep(delay)
    return x.upper()

In [None]:
from tqdm import tqdm
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

# progress_apply is used like apply, with a progress bar shown while
# placeholder_function is called on each country value
df["country"] = df["country"].progress_apply(placeholder_function)

df

In [None]:
# Even better progress bar: tqdm.auto is intended to pick the notebook
# widget bar when one is available and the plain console bar otherwise
from tqdm.auto import tqdm
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

# Same call as in the previous cell, now driven by the tqdm.auto progress bar
df["country"] = df["country"].progress_apply(placeholder_function)

df

## Basic Information


In [None]:
# Get the shape as a (rows, columns) tuple
df.shape

In [None]:
# Describe index: the row labels of the DataFrame
df.index

In [None]:
# Describe DataFrame columns: the column labels
df.columns

In [None]:
# Info on DataFrame: prints a summary including each column's dtype and
# non-null count
df.info()

In [None]:
# Number of non-NA values in each column
df.count()

In [None]:
# Count how many times each distinct value occurs in the 'country' column
df["country"].value_counts()

## Summary

In [None]:
# All summaries below are computed on the 'population' column.
# Column labels are case-sensitive, so the label must match the one used
# when the DataFrame was created.
population = df['population']

# Sum of values
sum_values = population.sum()

# Cumulative sum of values
cumulative_sum_values = population.cumsum()

# Minimum/maximum values
min_values = population.min()
max_values = population.max()

# Index of minimum/maximum values (the row labels, not the values themselves)
idx_min_values = population.idxmin()
idx_max_values = population.idxmax()

# Summary statistics
summary_stats = population.describe()

# Mean of values
mean_values = population.mean()

# Median of values
median_values = population.median()

print("Example DataFrame:")
print(df)

print("\nSum of values:")
print(sum_values)

print("\nCumulative sum of values:")
print(cumulative_sum_values)

print("\nMinimum values:")
print(min_values)

print("\nMaximum values:")
print(max_values)

print("\nIndex of minimum values:")
print(idx_min_values)

print("\nIndex of maximum values:")
print(idx_max_values)

print("\nSummary statistics:")
print(summary_stats)

print("\nMean values:")
print(mean_values)

print("\nMedian values:")
print(median_values)


In [None]:
# Easier way to get the summaries: describe() on the whole DataFrame
# (by default it reports on the numeric columns)
df.describe()
# Transposed variant, one row per column:
# df.describe().T

## Introduction to data profiling

In [None]:
from ydata_profiling import ProfileReport
# Build a profiling report object for df with the given title
profile = ProfileReport(df, title="Profiling Report",explorative=True)
# Uncomment one of the lines below to display or export the report
# (presumably: interactive widgets, inline iframe, or an HTML file)
# profile.to_widgets()
# profile.to_notebook_iframe()
# profile.to_file("your_report.html")

## Internal Data Alignment


In [None]:
# Create two Series that share the same index labels
import pandas as pd

# First pandas Series representing monthly sales data
sales_data_1 = pd.Series(
    [1500, 1200, 1800, 1600, 1300, 1700, 1400, 1500, 1600, 1800],
    index=['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct']
)

# Print the first Series
print("Monthly Sales Data:")
print(sales_data_1)

# Second pandas Series: same month labels, different values
sales_data_2 = pd.Series(
    [500, 300, 700, 800, 600, 900, 1000, 1100, 1200, 1300],
    index=['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct']
)

# Print the second Series
print("Monthly Sales Data:")
print(sales_data_2)

In [None]:
# Add two Series: values are matched by index label rather than by position.
# A label present in only one of the two Series would produce NaN in the result.
result = sales_data_1 + sales_data_2
result

## Arithmetic Operations with Fill Methods

In [None]:
# Two Series whose indices only partly overlap:
# the first has no 'may' entry, the second has no 'sep' entry
sales_data_1 = pd.Series({
    'jan': 1500, 'feb': 1200, 'mar': 1800, 'apr': 1600, 'jun': 1300,
    'jul': 1400, 'aug': 1500, 'sep': 1600, 'oct': 1800,
})
sales_data_2 = pd.Series({
    'jan': 500, 'feb': 300, 'mar': 700, 'apr': 800, 'may': 600,
    'jun': 900, 'jul': 1000, 'aug': 1100, 'oct': 1300,
})

# Arithmetic methods with fill_value: a label missing from one Series is
# replaced by fill_value before the operation instead of producing NaN
result_add = sales_data_1.add(sales_data_2, fill_value=0)
result_sub = sales_data_1.sub(sales_data_2, fill_value=2)
result_div = sales_data_1.div(sales_data_2, fill_value=4)
result_mul = sales_data_1.mul(sales_data_2, fill_value=3)

# Print each heading followed by its result, one item per line
print(
    "result_add:", result_add,
    "\nresult_sub:", result_sub,
    "\nresult_div:", result_div,
    "\nresult_mul:", result_mul,
    sep="\n",
)

## Asking For Help

In [None]:
# Display help for a function or object: here, the built-in documentation
# of the Series.loc indexer
help(pd.Series.loc)