In [91]:
import pandas as pd

In [92]:
## MultiIndex & Hierarchical Data

# What is MultiIndex?
# MultiIndex = Index with multiple levels (rows or columns).
# It lets you work with nested groups.

arrays = [
    ['A', 'A', 'B', 'B'],
    ['cat', 'dog', 'cat', 'dog']
]

index = pd.MultiIndex.from_arrays(arrays, names=('Letter', 'Animal'))
df = pd.DataFrame({'Value': [5, 6, 7, 8]}, index=index)  # Now you have two index levels: Letter and Animal.
# df = pd.DataFrame({'Value': [5, 6, 7, 8], 'sec_value': [1, 2, 3, 4]}, index=index)
df

# Accessing MultiIndex Data
df.loc['A']
df.loc[('A', 'dog')]  # MultiIndex uses tuples when specifying multiple levels.

# resetting multi index
# df_reset = df.reset_index()
# df_reset   # Converts MultiIndex back into flat DataFrame, (letter, animal) will be regular columns

# Stacking & Unstacking
# “pivot” is Changing the shape of the table → making rows become columns or columns become rows.

# Unstack → Rows → Columns
df_unstacked = df.unstack()
# The second level of the index (Animal) has been pivoted into columns:
# cat → new column
# dog → new column
# 👉 The table is now wide format.
df_unstacked
df_unstacked[('Value', 'cat')]

# Stack → Columns → Rows (Reverse)
df_stacked = df_unstacked.stack(future_stack=True)  # Back to long format.
df_stacked


# Turning a flat DataFrame into a MultiIndex one
rows = [
    ('Cairo', 'Alice', 85),
    ('London', 'Bob', 90),
    ('Cairo', 'Charlie', 78),
]
df = pd.DataFrame(rows, columns=['City', 'Name', 'Score'])
# At this point the frame is flat: a single default index (0, 1, 2).

# Passing a list of columns to set_index builds one index level per column.
df2 = df.set_index(['City', 'Name'])

# df2 has a (City, Name) MultiIndex, so stack/unstack can now be applied to it.


- Unstack → moves part of the row index → into columns (wide format)
- Stack → moves part of the columns → back into row index (long format)

👉 It’s like reshaping the data horizontally or vertically without changing the data itself.

| Function     | What it Does                               | Result |
| ------------ | ------------------------------------------ | ------ |
| `.unstack()` | Moves one row index level into **columns** | Wide   |
| `.stack()`   | Moves one column level into **rows**       | Long   |


In [93]:
## Apply Functions in Pandas

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Score': [85, 90, 78]
})

# map() → Works on Series (one column)
df['Name'] = df['Name'].map(str.upper)
df['Score'] = df['Score'].map(lambda x: x * 2)

# map() on DataFrame (Cell by Cell)
df = df.map(lambda x: str(x).lower() if isinstance(x, str) else x)  # This applies the function to every single cell of the DataFrame

def add_symbol(x):
    return x + 10 if isinstance(x, (int, float)) else x
df = df.map(add_symbol)


# apply() → Works on Series OR DataFrame
# Apply on Series
df['Score'] = df['Score'].apply(lambda x: x * 2)

# Apply on DataFrame row
df['Score'] = df.apply(lambda row: row['Score'] + 50, axis=1)  # i have to use axis=1, to indicate that applying on rows

# Apply on Columns (One Function per Column) → axis=0 (default)
df.apply(lambda col: col[0])
df.apply(lambda col: col.mean() if col.dtype != 'O' else None)
df

Unnamed: 0,Name,Score
0,alice,410
1,bob,430
2,charlie,382


In [94]:
## Advanced GroupBy (Nested Grouping + Custom Aggregations)

data = {
    'Region': ['North', 'North', 'South', 'South', 'South'],
    'City': ['Cairo', 'Cairo', 'Aswan', 'Aswan', 'Luxor'],
    'Sales': [100, 150, 200, 250, 300]
}

df = pd.DataFrame(data)

# Grouping by two keys and aggregating one column gives a Series whose index is
# a MultiIndex (level 1 = Region, level 2 = City): one value per combination.
sales_by_region_city = df.groupby(['Region', 'City'])['Sales']
df_grouped = sales_by_region_city.sum()

# reset_index() turns the group keys back into ordinary columns.
df_grouped_index_resetted = df_grouped.reset_index()
df_grouped_index_resetted

# Several aggregations at once: pass a list of names to agg(), one column each.
sales_by_region_city.agg(['sum', 'min', 'max']).reset_index()


# A custom aggregation is any function that reduces a group's Series to a scalar.
def my_range(series):
    """Return the spread (largest value minus smallest value) of ``series``."""
    highest, lowest = series.max(), series.min()
    return highest - lowest


sales_by_region = df.groupby('Region')['Sales']

# Built-in names and custom functions can be mixed; the custom column is named
# after the function ('my_range').
sales_by_region.agg(['sum', 'mean', my_range])

# Named aggregation: each keyword becomes the output column name.
result = sales_by_region.agg(sum='sum', avg='mean', range=my_range)



In [95]:
## Working with Text Data (.str functions)
# The .str accessor exposes vectorised string methods on a Series of text.
# Missing values (None/NaN) pass through these methods and stay missing.

df = pd.DataFrame({
    'Name': [' Alice ', 'BOB', 'CharLie', None],
    'City': ['Cairo', 'London', 'new york', 'Paris']
})

# Cleaning: the calls run left to right, so the last one decides the final form.
df['Name'] = (
    df['Name']
    .str.strip()    # remove leading/trailing spaces
    .str.lower()    # to lowercase
    .str.upper()    # to uppercase (what ends up stored)
)
df['City'] = (
    df['City']
    .str.lower()    # convert to lowercase
    .str.title()    # then capitalise each word
)

# String searching & matching
# Each call returns one True/False per row. `name` is reassigned, so only the
# startswith() result is kept in it.
name = df['Name'].str.contains('ali', case=False, na=False)  # case-insensitive; missing -> False
name = df['Name'].str.startswith('A')
city = df['City'].str.endswith('n')
# Using such a mask inside [] / .loc returns the matching rows themselves.
df.loc[df['City'].str.startswith('C')]

# String replace & extract

# regex=False: the pattern is a literal string, so only that exact text is replaced
# (regex=False is the default).
df['City'] = df['City'].str.replace(
    'new york', 'NEW YORK', case=False, regex=False
)

# regex=True: the pattern is a regular expression. 'new.*' matches 'new' followed
# by any characters, so 'new york', 'new jersey', 'new mexico', ... would all be replaced.
df['City'] = df['City'].str.replace(
    'new.*', 'NEW YORK', case=False, regex=True
)

# .str.extract() pulls the regex capture group out of each string.
# With a single group, expand=False returns a Series instead of a one-column DataFrame.
df['initial'] = df['Name'].str.extract(r'(^[A-Z])', expand=False)

# Extracting several substrings at once
df = pd.DataFrame({
    'Name': ['John Smith', 'Alice Johnson', 'Bob Davis', 'John Cena'],
    'City': ['Cairo', 'London', 'new york', 'Paris']
})

# First and last name via two capture groups:
#   ([A-Z][a-z]+)  first name: one uppercase letter followed by lowercase letters
#   \s+            one or more whitespace characters between the names
#   ([A-Z][a-z]+)  last name, same shape as the first name
# expand=True returns a DataFrame with one column per capture group (labelled 0, 1).
# A string that does not match the pattern yields NaN in every group.
name_parts = df['Name'].str.extract(r'([A-Z][a-z]+)\s+([A-Z][a-z]+)', expand=True)
df['First Name'] = name_parts[0]
df['Last Name'] = name_parts[1]

# split() with no arguments splits on whitespace; each cell becomes a Python list.
df['Name_split'] = df['Name'].str.split()
# .str.get(0) (same as .str[0]) takes the first item of each list -> a Series of first words.
df['Name'].str.split().str.get(0)

# Joining text: element-wise concatenation, equivalent to
# df['Name'] + ' from ' + df['City'].
df['Full'] = df['Name'].str.cat(df['City'], sep=' from ')
df


Unnamed: 0,Name,City,First Name,Last Name,Name_split,Full
0,John Smith,Cairo,John,Smith,"[John, Smith]",John Smith from Cairo
1,Alice Johnson,London,Alice,Johnson,"[Alice, Johnson]",Alice Johnson from London
2,Bob Davis,new york,Bob,Davis,"[Bob, Davis]",Bob Davis from new york
3,John Cena,Paris,John,Cena,"[John, Cena]",John Cena from Paris


In [96]:
## Categorical Data

# Goal: optimize memory and speed when working with repeated string values.
#
# What is categorical data?
#     Regular object/string columns in pandas take more memory and are slower
#     when they hold many repeated values.
#
# Converting them to the `category` dtype can:
#   ✅ Save memory
#   ✅ Speed up filtering & grouping

# Converting to Category
df = pd.DataFrame({
    'City': ['Cairo', 'London', 'Cairo', 'Paris', 'Cairo', 'London']
})

# Before: a plain text column (object dtype on pandas 2.x)
df.dtypes
# Measure BEFORE converting so there is something to compare against.
memory_before = df.memory_usage(deep=True)

# convert to category
df['City'] = df['City'].astype('category')

# After: category dtype
df.dtypes

# Memory Comparison
# df.memory_usage() returns a Series with the number of bytes used by the index
# and by each column.
# deep=True additionally inspects object-dtype columns and counts the Python
# objects they hold (e.g. the strings themselves). Without it, a text column is
# under-reported because only the references to those objects are counted.
memory_after = df.memory_usage(deep=True)
# Compare memory_before['City'] with memory_after['City']. The saving grows with
# the amount of repetition; on a frame this small the difference may be modest.

# Working with Categorical Columns
df['City'].cat.categories   # See unique categories

# Category codes
#
# - What are categorical codes?
#   Every row stores a small integer instead of the text. The integer is the
#   position of the row's value inside `.cat.categories`, starting from 0.
#   The codes are NOT assigned in order of first appearance: astype('category')
#   uses the unique values SORTED (falling back to order of appearance only
#   when the values cannot be sorted). Missing values get the code -1.
#
#   For example, a 'City' column whose unique values are
#       'New York', 'Chicago', 'San Francisco'
#   gets the categories ['Chicago', 'New York', 'San Francisco'], i.e.
#       'Chicago': 0
#       'New York': 1
#       'San Francisco': 2
#   so df['City'].cat.codes == 1 selects all rows where 'City' is 'New York'.
#
#   Things you can do with the codes:
#       Filtering: df[df['City'].cat.codes == 0]
#       Grouping:  df.groupby(df['City'].cat.codes).size()
#                  (.sum() etc. also work when the frame has numeric columns)
#   Merging works on the categorical column itself, not on the codes:
#       pd.merge(df, other_df, on='City', how='left')
#
# - Why are categorical codes used?
#       Reduce memory usage: integers are usually smaller than the repeated strings.
#       Improve performance: integer codes can be faster to process than strings
#       in operations such as grouping and aggregating.
#       Enable categorical operations: e.g. renaming or ordering the categories.
#
# - How are categorical codes created?
#       They are created when a column is converted with astype('category').
city_codes = df['City'].cat.codes   # Cairo -> 0, London -> 1, Paris -> 2


# rename categories (the codes stay the same; only the labels change)
df['City'] = df['City'].cat.rename_categories({'Cairo': 'CAI', 'London': 'LDN', 'Paris': 'PAR'})

In [97]:
## Window Functions & Rolling
"""
Perform calculations over a moving window across your data (instead of over the entire dataset).

Instead of calculating one global value (like .mean()), you calculate it locally over rolling chunks of the data.

Useful for:

    Moving averages

    Running totals

    Rolling statistics (min, max, std, etc.)
"""

# Ten consecutive days of sales, indexed by date.
df = pd.DataFrame({
    'Date': pd.date_range(start='2024-01-01', periods=10, freq='D'),
    'Sales': [10, 20, 15, 30, 45, 40, 35, 50, 55, 60]
}).set_index('Date')

df = df.assign(
    # Strict 3-row window: current value + the previous 2. The first two rows
    # are NaN because a full window is not available yet.
    rolling_average=df['Sales'].rolling(3).mean(),
    # Expanding window: every row averages all rows up to and including itself
    # (a cumulative average).
    expanding_mean=df['Sales'].expanding().mean(),
)

"""
When you set window=3 and min_periods=2, you're telling pandas to:

- Calculate the rolling average using a window of 3 numbers (i.e., the current number and the 2 preceding numbers).
- But, if there are only 2 numbers available (i.e., the current number and only 1 preceding number), still calculate the average using those 2 numbers.

In other words, min_periods=2 overrides the window=3 requirement, allowing the calculation to proceed even if there aren't enough numbers to fill the entire window.

By doing so, you'll get a result for the rolling average even when there are only 2 numbers available, instead of getting NaN (Not a Number).
"""
# This REPLACES the strict column computed above, so in the displayed frame only
# the very first row of 'rolling_average' is NaN.
df['rolling_average'] = df['Sales'].rolling(3, min_periods=2).mean()

df


Unnamed: 0_level_0,Sales,rolling_average,expanding_mean
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,10,,10.0
2024-01-02,20,15.0,15.0
2024-01-03,15,15.0,15.0
2024-01-04,30,21.666667,18.75
2024-01-05,45,30.0,24.0
2024-01-06,40,38.333333,26.666667
2024-01-07,35,40.0,27.857143
2024-01-08,50,41.666667,30.625
2024-01-09,55,46.666667,33.333333
2024-01-10,60,55.0,36.0


In [None]:
## Performance & Optimization
# Write faster, more memory-efficient pandas code, especially when dealing with big DataFrames.
#
# NOTE: the explanations below are written as comments rather than bare
# triple-quoted strings. A bare string at the end of a cell is the cell's last
# expression, so Jupyter would display that string instead of the query result.

# Memory Optimization

# 1. Use category for repeated text: cuts memory when the same string appears often.
#
#    "Cuts memory" means the categorical dtype needs less memory to store the
#    same data. In a regular text column every row holds its own string value,
#    which adds up quickly when values repeat. After converting to category:
#
#      - Unique values are stored separately: the distinct values live once in
#        a small array (the categories).
#      - Integer codes represent the values: each distinct value gets an integer code.
#      - Codes are stored in the column: the rows hold those codes instead of
#        the original text.
#
#    df['City'] = df['City'].astype('category')


# 2. Downcast numeric types
#
#    Downcasting converts a column to a smaller, more memory-efficient dtype
#    that can still hold all of its values. Useful on large datasets.
#
#    df['Sales'].dtypes  # int64
#    df['Sales'] = pd.to_numeric(df['Sales'], downcast='integer')  # int64 -> int8 / int16 ... (smallest that fits)
#    df.dtypes   # int8


# 3. Check memory usage
#
#    df.info(memory_usage='deep')
#        Prints a concise summary of the DataFrame including its memory usage.
#        memory_usage='deep' also counts the values held by object columns,
#        not just the metadata (e.g. column names, data types).
#
#    df.memory_usage(deep=True)
#        Returns a Series with the memory used by each column. deep=True
#        likewise includes the values themselves, not just the metadata.


# Speed Optimization

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'UnitPrice': [85, 90, 78],
    'Quantity': [5, 10, 10]
})

# 1. Vectorized operations
#
#    Always prefer:
#        df['Price'] = df['Quantity'] * df['UnitPrice']  # fast -> no Python loop
#    instead of:
#        for i in df.index:
#            df.loc[i, 'Price'] = df.loc[i, 'Quantity'] * df.loc[i, 'UnitPrice']
#    Both produce the same result.

# 2. Avoid .apply() unless really needed
#
#    You can use:
#        df['Quantity'] = df['Quantity'] + 10
#    instead of:
#        df['Quantity'] = df['Quantity'].apply(lambda x: x + 10)
#    Both produce the same result.

# 3. Use query() for readable filtering
#    Often easier to read than chaining several boolean masks with []. It can
#    also be faster on large frames, but on small ones like this the expression
#    parsing overhead usually outweighs any gain.
filtered = df.query('Quantity > 5 and UnitPrice > 30')


# Use chunking for very large files
#
#    for chunk in pd.read_csv('large_file.csv', chunksize=10000):
#        process(chunk)


# Parallelism with Dask (optional)
#
#    Syntax is nearly identical to pandas but runs in parallel.
#
#    import dask.dataframe as dd
#    ddf = dd.read_csv('very_large.csv')

# Last expression of the cell -> this is what the notebook displays.
filtered

Unnamed: 0,Name,UnitPrice,Quantity
1,Bob,90,10
2,Charlie,78,10
