In [10]:
import pandas as pd

In [None]:
## Indexing & Slicing
# Builds a small demo frame and shows label-based (`loc`) and position-based
# (`iloc`) selection. `df` keeps its default 0, 1, 2 index for the whole cell;
# the name-indexed variant lives in `df_by_name`, so nothing is mutated in place
# and re-running the cell always starts from the same state.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Score': [85, 90, 78],
    'City': ['Cairo', 'London', 'Paris']
}

df = pd.DataFrame(data)

# Setting an Index
df.set_index('Name')  # returns a NEW frame with 'Name' as the index; `df` itself is unchanged
df.reset_index(drop=True)  # reset index to 0, 1, 2; `drop=True` discards the old index instead of keeping it as a column
df.reset_index().drop('index', axis=1)  # same result in two steps: keep the old index as an 'index' column, then drop that column
# axis=1 is necessary to tell pandas: "drop the COLUMN called 'index', not a row with this label."


# Slicing Rows (loc and iloc)
df_by_name = df.set_index('Name')  # label-indexed frame used by the `loc` examples below
df_by_name.loc['Alice':'Bob']  # if 'Name' is the index we can slice by label (both endpoints are included)
df.iloc[0:2]  # positional slice: rows 0 and 1 (end position is excluded)


# Slicing Columns
df.loc[:, ['Name', 'Score']]
df.iloc[:, 0:2]  # with `iloc`, columns are addressed by position (0, 1)


# Selecting Rows & Columns Together
df.iloc[0:2, 0:2]  # rows:columns
df_by_name.loc['Alice', 'Score']  # 85, 'Alice' here is an index label



In [None]:
## Handling Missing Data in Pandas

# Demo frame: Bob's score is the only missing value.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Score=[85, None, 78],
    City=['Cairo', 'London', 'Paris'],
)
df = pd.DataFrame(data)

# Detecting Missing Values
df.isna()        # True where a cell is missing, False otherwise
df.notna()       # the inverse mask: True where a cell holds a value
df.isna().sum()  # number of missing values in each column

# Dropping Missing Data (dropna())
#   how='any'  -> drop the row/column if it contains at least one null (default)
#   how='all'  -> drop the row/column only if every value is null
#   thresh=n   -> keep rows that have at least n non-null values
df.dropna(axis=0, how='any')  # drop rows that contain a missing value
df.dropna(axis='columns')     # drop columns that contain a missing value

# Filling Missing Data (fillna()) -- left commented out so `df` keeps its gap
# df.fillna(9)  # replace every missing value with 9
# df.ffill()    # carry the previous value in the column forward ("85")
# df.bfill()    # pull the next value in the column backward ("78")

# Replace Missing Values with Mean/Median
# mean = df['Score'].mean()
# df['Score'] = df['Score'].fillna(mean)


# Check for Any Missing Values (single boolean for the whole frame)
df.isna().to_numpy().any()


np.True_

In [13]:
## Aggregation & Grouping (groupby)

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob'],
    'City': ['Cairo', 'London', 'Cairo', 'London', 'London'],
    'Score': [85, 90, 78, 95, 88]
}

df = pd.DataFrame(data)

# Simple GroupBy + Aggregation
score_by_city = df.groupby('City')['Score']
score_by_city.mean()  # average 'Score' per 'City'

# Using .agg() for Multiple Aggregations
score_by_city.agg(['mean', 'min', 'max', 'std', 'median'])  # several statistics in one call

# Grouping by Multiple Columns
# Selecting a single column ('Score') yields a Series whose index is the
# (City, Name) MultiIndex.
mean_by_city_name = df.groupby(['City', 'Name'])['Score'].mean()
mean_by_city_name
# reset_index() turns the MultiIndex back into ordinary columns with a 0, 1, 2 index.
mean_by_city_name.reset_index()

# Sorting GroupBy Results
mean_by_city_name.sort_values(ascending=False)  # Series (single brackets ['Score']): no `by` needed
# Double brackets [['Score']] keep a DataFrame, so sort_values needs the column name.
(
    df.groupby(['City', 'Name'])[['Score']]
    .mean()
    .reset_index()
    .sort_values(by='Score', ascending=False)
)


Unnamed: 0,City,Name,Score
2,London,Alice,95.0
3,London,Bob,89.0
0,Cairo,Alice,85.0
1,Cairo,Charlie,78.0


In [14]:
## Merging & Joining DataFrames

# pd.merge() — SQL-style Join
# Merging adds columns horizontally → it's a horizontal combination.
# Example: Customers Table
customers = pd.DataFrame({
    'CustomerID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie']
})

# Example: Orders Table
orders = pd.DataFrame({
    'OrderID': [101, 102, 103],
    'CustomerID': [1, 2, 2],
    'Amount': [250, 150, 300]
})
pd.merge(customers, orders, on='CustomerID', how='inner')
# merge types -> inner (default), outer, left, right

# Use left_on / right_on when the key columns have DIFFERENT names in the two
# frames. The customer key is renamed to 'ID' here so the example really needs them.
customers_renamed = customers.rename(columns={'CustomerID': 'ID'})
pd.merge(customers_renamed, orders, left_on='ID', right_on='CustomerID')

# Merging on indexes instead of columns:
#   left_index=True  -> use the index of the left DataFrame as the join key
#   right_index=True -> use the index of the right DataFrame as the join key
# Both must be True to join index-to-index.
pd.merge(customers.set_index('CustomerID'), orders.set_index('CustomerID'), left_index=True, right_index=True)  # index-based merging


# Concatenation (pd.concat()) — Stack Vertically or Horizontally
# - pd.concat() = stacking:
#   - axis=0 → vertical (rows added), the default
#   - axis=1 → horizontal (columns added)
df1 = pd.DataFrame({'Name': ['Alice', 'Bob']}, index=['a', 'b'])
df2 = pd.DataFrame({'Name': ['Charlie', 'David']}, index=['a', 'b'])
# ignore_index=True discards the source indexes and numbers the result 0, 1, 2, ... n.
pd.concat([df1, df2], ignore_index=True)

pd.concat([df1, df2], axis=1)  # stack horizontally; rows are aligned on the shared index labels


Unnamed: 0,Name,Name.1
a,Alice,Charlie
b,Bob,David


In [15]:
## Working with Dates & Times

# Creating Date Columns & Converting to datetime
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Date': ['2023-05-01', '2023-05-02', '2023-05-03'],
    'Score': [85, 90, 78]
}

df = pd.DataFrame(data)
df.dtypes
df['Date'] = pd.to_datetime(df['Date'])  # convert date strings to pandas' datetime type so the `.dt` accessors work on this column
df.dtypes

# Extracting Date Parts (Year, Month, Day, etc.)
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['weekDay'] = df['Date'].dt.day_name()
# df['Hour'] = df['Date'].dt.hour   # no time provided so result = 0
# df['Minute'] = df['Date'].dt.minute  # no time provided so result = 0
df['monthEndCheck'] = df['Date'].dt.is_month_end  # True/False if end of month

# Filtering by Date Range
# Date literals are written in zero-padded ISO form (YYYY-MM-DD) so they match
# the data above and do not depend on lenient date-string parsing.
df[df['Date'] >= '2023-05-02']

# Setting Date as Index (Time Series)
df = df.set_index('Date')  # useful for date slicing
df.loc['2023-05-01':'2023-05-02']

# Creating Date Ranges (Useful for Empty Time Series)

# pd.date_range() creates a DatetimeIndex (a sequence of dates)
# freq controls the spacing of the dates generated (daily, monthly, yearly, etc.)
# Result is not a DataFrame, but can be used to create one.
dates = pd.date_range(start='2023-01-01', end='2023-01-02', freq='D')
df = pd.DataFrame({'Date': dates, 'Value': [100, 200]})  # 2 days and 2 values; both sequences must have the same length


# NOTE: the 'ME' / 'QE' aliases need pandas >= 2.2; older versions spell them 'M' / 'Q'.
# pd.date_range(start='2023-01-01', periods=5, freq='ME')  # 5 month-ends: ['2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30', '2023-05-31']
# pd.date_range(start='2023-01-01', periods=4, freq='QE')  # 4 quarter-ends: ['2023-03-31', '2023-06-30', '2023-09-30', '2023-12-31']

In [16]:
## Resampling & Time Series

# Converting to Datetime:
# To work with time series, you must convert your column to datetime

df = pd.DataFrame({
    'Date': ['2022-01-01', '2022-01-02', '2022-01-03'],
    'Value': [10, 20, 30]
})
df['Date'] = pd.to_datetime(df['Date'])  # This makes sure Pandas recognizes it as date/time not just plain text
# .resample() requires the DataFrame to have a DatetimeIndex.
# Use .set_index('Date') before resampling.
df = df.set_index('Date')

# Resampling (Downsampling / Upsampling):
# 👉 Resampling means changing the frequency of your data:
    # Downsampling → From daily → monthly/yearly (aggregate)
    # Upsampling → From monthly → daily (fill gaps)

"""
| Term             | Meaning                                       | Example                                |
| ---------------- | --------------------------------------------- | -------------------------------------- |
| **Downsampling** | Going from **high frequency → low frequency** | From **daily → monthly** (`'D' → 'M'`) |
| **Upsampling**   | Going from **low frequency → high frequency** | From **monthly → daily** (`'M' → 'D'`) |

✅ This is exactly like zooming out (downsampling) or in (upsampling) on a timeline.


"""

# Downsampling:
    # You reduce the number of rows: from daily → monthly.
    # You need an aggregation function (mean(), sum(), max(), etc.) because you’re combining multiple days into one row.

# 1. Resample by Month End → Monthly mean (at end of month)
df.resample('ME').mean()
# 2. Resample by Month Start → Monthly mean (at start of month)
df.resample('MS').mean()
# 3. Resample by Year Start → Yearly max values (at start of year)
df.resample('YS').max()
# 4. Resample by Year End → Yearly max values (at end of year)
df.resample('YE').max()


# Upsampling → Expanding to Higher Frequency
df = pd.DataFrame({
    'Date': ['2022', '2023', '2024'],
    'Value': [10, 20, 30]
})
df['Date'] = pd.to_datetime(df['Date'])  # This makes sure Pandas recognizes it as date/time not just plain text
# .resample() requires the DataFrame to have a DatetimeIndex.
# Use .set_index('Date') before resampling.
df = df.set_index('Date')
df_upsampled = df.resample('D').ffill()
df_upsampled


df.reset_index(inplace=True)
df['rolling_average'] = df['Value'].rolling(window=3).mean()  # compute 3-day moving average 
df

Unnamed: 0,Date,Value,rolling_average
0,2022-01-01,10,
1,2023-01-01,20,
2,2024-01-01,30,20.0


In [None]:
## Pivot Tables & Crosstab in Pandas
# A pivot table groups, aggregates and reshapes data in a single call.
# SQL analogy: `SELECT City, AVG(Score) FROM Table GROUP BY City;`

# Basic Pivot Table
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Alice', 'Bob'],
    City=['Cairo', 'London', 'Cairo', 'London', 'London'],
    Score=[85, 90, 78, 95, 88],
)
df = pd.DataFrame(data)

# Average score per city; reset_index() turns 'City' back into a column (index 0, 1, ..).
pivot = df.pivot_table(index='City', values='Score', aggfunc='mean').reset_index()

# Multiple Aggregations in Pivot Table
stats = ['mean', 'median', 'min', 'max']
pivot = df.pivot_table(index='City', values='Score', aggfunc=stats)
pivot

# Pivot with an Index and Columns
# Cities become rows, names become columns, each cell holds the mean score of
# that name in that city; combinations with no data are filled with 0.
pivot = df.pivot_table(index='City', columns='Name', values='Score', aggfunc='mean', fill_value=0)

# Same layout, but with every aggregate in `stats` applied per (city, name) pair.
pivot = df.pivot_table(index='City', columns='Name', values='Score', aggfunc=stats, fill_value=0)


# Crosstab → Frequency Table (like COUNT in SQL)
pd.crosstab(index=df['City'], columns=df['Name'])  # how many times each Name appears in each City



Name,Alice,Bob,Charlie
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cairo,1,0,1
London,1,2,0
