In [None]:
import pandas as pd

df = pd.DataFrame({
    'Department': ['Sales', 'Sales', 'HR', 'HR', 'IT'],
    'Month': ['Jan', 'Feb', 'Jan', 'Feb', 'Jan'],
    'Revenue': [200, 220, 180, 210, 190]
})

pivot = df.pivot_table(index='Department', columns='Month', values='Revenue', aggfunc='sum')
pivot


Month,Feb,Jan
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
HR,210.0,180.0
IT,,190.0
Sales,220.0,200.0


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import sqlite3

In [5]:
# ===== 1. PIVOT TABLES AND CROSS-TABULATION =====

# Sample sales data
sales_data = {
    'Date': ['2024-01-01', '2024-01-01', '2024-01-02', '2024-01-02', '2024-01-03'],
    'Product': ['A', 'B', 'A', 'B', 'A'],
    'Region': ['North', 'South', 'North', 'South', 'North'],
    'Sales': [100, 150, 120, 180, 110],
    'Quantity': [10, 15, 12, 18, 11]
}
df_sales = pd.DataFrame(sales_data)

# Pivot table - summarizing sales by product and region
pivot_sales = df_sales.pivot_table(
    values='Sales', 
    index='Product', 
    columns='Region', 
    aggfunc='sum',
    fill_value=0
)
print("Pivot Table - Sales by Product and Region:")
print(pivot_sales)
print()

# Cross-tabulation - frequency counts
crosstab_result = pd.crosstab(df_sales['Product'], df_sales['Region'], margins=True)
print("Cross-tabulation with margins:")
print(crosstab_result)

Pivot Table - Sales by Product and Region:
Region   North  South
Product              
A          330      0
B            0    330

Cross-tabulation with margins:
Region   North  South  All
Product                   
A            3      0    3
B            0      2    2
All          3      2    5


In [7]:
# ===== 2. MULTI-INDEXING AND HIERARCHICAL INDEXING =====

# Creating MultiIndex DataFrame
arrays = [
    ['A', 'A', 'B', 'B'],
    ['X', 'Y', 'X', 'Y']
]
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
df_multi = pd.DataFrame(np.random.randn(4, 3), index=index, columns=['col1', 'col2', 'col3'])

print("MultiIndex DataFrame:")
print(df_multi)
print()

# Accessing multi-index data
print("Access level 'A':")
print(df_multi.loc['A'])
print()

# Stack and unstack operations
stacked = df_multi.stack()
print("Stacked DataFrame:")
print(stacked)

MultiIndex DataFrame:
                  col1      col2      col3
first second                              
A     X       0.461300 -0.295523 -0.670975
      Y       1.192192 -0.182910  0.492700
B     X      -0.940511  0.976762 -0.747362
      Y      -0.970677 -1.177551 -0.641383

Access level 'A':
            col1      col2      col3
second                              
X       0.461300 -0.295523 -0.670975
Y       1.192192 -0.182910  0.492700

Stacked DataFrame:
first  second      
A      X       col1    0.461300
               col2   -0.295523
               col3   -0.670975
       Y       col1    1.192192
               col2   -0.182910
               col3    0.492700
B      X       col1   -0.940511
               col2    0.976762
               col3   -0.747362
       Y       col1   -0.970677
               col2   -1.177551
               col3   -0.641383
dtype: float64


In [11]:
# ===== 3. APPLYING CUSTOM FUNCTIONS =====

# Sample dataset
df_students = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Math': [85, 78, 92, 88],
    'Science': [90, 85, 88, 95],
    'English': [78, 88, 85, 92]
})

# Using apply() for row-wise operations
def calculate_grade(row):
    avg = (row['Math'] + row['Science'] + row['English']) / 3
    if avg >= 90:
        return 'A'
    elif avg >= 80:
        return 'B'
    else:
        return 'C'

df_students['Grade'] = df_students.apply(calculate_grade, axis=1)
print("Students with grades (using apply):")
print(df_students)
print()

# Using map() for Series transformations
grade_points = {'A': 4.0, 'B': 3.0, 'C': 2.0}
df_students['GPA'] = df_students['Grade'].map(grade_points)
print("Students with GPA (using map):")
print(df_students[['Name', 'Grade', 'GPA']])
print()

# Using apply() for element-wise operations on DataFrame
df_numeric = df_students[['Math', 'Science', 'English']]
df_scaled = df_numeric.apply(lambda x: x / 100)
print("Scaled scores (using apply):")
print(df_scaled)

Students with grades (using apply):
      Name  Math  Science  English Grade
0    Alice    85       90       78     B
1      Bob    78       85       88     B
2  Charlie    92       88       85     B
3    Diana    88       95       92     A

Students with GPA (using map):
      Name Grade  GPA
0    Alice     B  3.0
1      Bob     B  3.0
2  Charlie     B  3.0
3    Diana     A  4.0

Scaled scores (using apply):
   Math  Science  English
0  0.85     0.90     0.78
1  0.78     0.85     0.88
2  0.92     0.88     0.85
3  0.88     0.95     0.92


In [12]:

# ===== 4. EFFICIENT MERGING AND JOINING =====

# Sample datasets
df_customers = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'city': ['NYC', 'LA', 'Chicago', 'NYC', 'Boston']
})

df_orders = pd.DataFrame({
    'order_id': [101, 102, 103, 104, 105, 106],
    'customer_id': [1, 2, 2, 3, 1, 6],  # Note: customer_id 6 doesn't exist in customers
    'amount': [100, 150, 200, 120, 80, 300]
})

# Inner join (default)
inner_join = pd.merge(df_customers, df_orders, on='customer_id')
print("Inner join:")
print(inner_join)
print()

# Left join to keep all customers
left_join = pd.merge(df_customers, df_orders, on='customer_id', how='left')
print("Left join:")
print(left_join)
print()

# Handling duplicates with suffixes
df_orders_dup = df_orders.copy()
df_orders_dup['name'] = ['Order1', 'Order2', 'Order3', 'Order4', 'Order5', 'Order6']
merged_with_suffix = pd.merge(df_customers, df_orders_dup, on='customer_id', suffixes=('_customer', '_order'))
print("Merge with suffixes:")
print(merged_with_suffix.head())
print()


Inner join:
   customer_id     name     city  order_id  amount
0            1    Alice      NYC       101     100
1            1    Alice      NYC       105      80
2            2      Bob       LA       102     150
3            2      Bob       LA       103     200
4            3  Charlie  Chicago       104     120

Left join:
   customer_id     name     city  order_id  amount
0            1    Alice      NYC     101.0   100.0
1            1    Alice      NYC     105.0    80.0
2            2      Bob       LA     102.0   150.0
3            2      Bob       LA     103.0   200.0
4            3  Charlie  Chicago     104.0   120.0
5            4    Diana      NYC       NaN     NaN
6            5      Eve   Boston       NaN     NaN

Merge with suffixes:
   customer_id name_customer     city  order_id  amount name_order
0            1         Alice      NYC       101     100     Order1
1            1         Alice      NYC       105      80     Order5
2            2           Bob       LA  

In [16]:

# ===== 5. HANDLING MISSING DATA =====

# Create dataset with missing values
df_missing = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5, np.nan, 7],
    'B': [10, np.nan, 30, 40, np.nan, 60, 70],
    'C': [100, 200, 300, np.nan, 500, 600, 700]
})

print("Original data with missing values:")
print(df_missing)
print()

# Forward fill
df_ffill = df_missing.ffill()
print("Forward fill:")
print(df_ffill)
print()

# Backward fill
df_bfill = df_missing.bfill()
print("Backward fill:")
print(df_bfill)
print()

# Interpolation
df_interp = df_missing.interpolate()
print("Linear interpolation:")
print(df_interp)

Original data with missing values:
     A     B      C
0  1.0  10.0  100.0
1  2.0   NaN  200.0
2  NaN  30.0  300.0
3  4.0  40.0    NaN
4  5.0   NaN  500.0
5  NaN  60.0  600.0
6  7.0  70.0  700.0

Forward fill:
     A     B      C
0  1.0  10.0  100.0
1  2.0  10.0  200.0
2  2.0  30.0  300.0
3  4.0  40.0  300.0
4  5.0  40.0  500.0
5  5.0  60.0  600.0
6  7.0  70.0  700.0

Backward fill:
     A     B      C
0  1.0  10.0  100.0
1  2.0  30.0  200.0
2  4.0  30.0  300.0
3  4.0  40.0  500.0
4  5.0  60.0  500.0
5  7.0  60.0  600.0
6  7.0  70.0  700.0

Linear interpolation:
     A     B      C
0  1.0  10.0  100.0
1  2.0  20.0  200.0
2  3.0  30.0  300.0
3  4.0  40.0  400.0
4  5.0  50.0  500.0
5  6.0  60.0  600.0
6  7.0  70.0  700.0


In [24]:

# ===== 6. PIVOT AND MELT =====

# Wide format data
df_wide = pd.DataFrame({
    'ID': [1, 2, 3],
    'Jan': [100, 150, 200],
    'Feb': [110, 160, 210],
    'Mar': [120, 170, 220]
})

print(df_wide)
print()

# Melt to long format
df_long = pd.melt(df_wide, id_vars=['ID'], var_name='Month', value_name='Sales')
print("Melted data (wide to long):")
print(df_long)
print()

# Pivot back to wide format
df_pivot_back = df_long.pivot(index='ID', columns='Month', values='Sales')
print("Pivoted back (long to wide):")
print(df_pivot_back)

   ID  Jan  Feb  Mar
0   1  100  110  120
1   2  150  160  170
2   3  200  210  220

Melted data (wide to long):
   ID Month  Sales
0   1   Jan    100
1   2   Jan    150
2   3   Jan    200
3   1   Feb    110
4   2   Feb    160
5   3   Feb    210
6   1   Mar    120
7   2   Mar    170
8   3   Mar    220

Pivoted back (long to wide):
Month  Feb  Jan  Mar
ID                  
1      110  100  120
2      160  150  170
3      210  200  220


In [25]:

# ===== 7. DATA AGGREGATION =====

# Sample transaction data
df_transactions = pd.DataFrame({
    'customer_id': [1, 1, 2, 2, 3, 3, 1],
    'product': ['A', 'B', 'A', 'C', 'B', 'A', 'C'],
    'amount': [100, 150, 200, 120, 180, 90, 110],
    'quantity': [1, 2, 3, 1, 2, 1, 1]
})

# Custom aggregation functions
def custom_agg(group):
    return pd.Series({
        'total_amount': group['amount'].sum(),
        'avg_amount': group['amount'].mean(),
        'total_quantity': group['quantity'].sum(),
        'unique_products': group['product'].nunique()
    })

customer_summary = df_transactions.groupby('customer_id').apply(custom_agg)
print("Custom aggregation by customer:")
print(customer_summary)
print()

# Multiple aggregations
agg_multiple = df_transactions.groupby('customer_id').agg({
    'amount': ['sum', 'mean', 'count'],
    'quantity': 'sum',
    'product': 'nunique'
})
print("Multiple aggregations:")
print(agg_multiple)
print()


Custom aggregation by customer:
             total_amount  avg_amount  total_quantity  unique_products
customer_id                                                           
1                   360.0       120.0             4.0              3.0
2                   320.0       160.0             4.0              2.0
3                   270.0       135.0             3.0              2.0

Multiple aggregations:
            amount              quantity product
               sum   mean count      sum nunique
customer_id                                     
1              360  120.0     3        4       3
2              320  160.0     2        4       2
3              270  135.0     2        3       2



  customer_summary = df_transactions.groupby('customer_id').apply(custom_agg)


In [26]:

# ===== 8. TIME SERIES OPERATIONS =====

# Create time series data
dates = pd.date_range('2024-01-01', periods=100, freq='D')
df_ts = pd.DataFrame({
    'date': dates,
    'value': np.random.randn(100).cumsum() + 100
})
df_ts.set_index('date', inplace=True)

# Resampling - daily to weekly
weekly_data = df_ts.resample('W').mean()
print("Weekly resampled data:")
print(weekly_data.head())
print()

# Rolling window
df_ts['rolling_mean'] = df_ts['value'].rolling(window=7).mean()
print("Rolling 7-day mean:")
print(df_ts[['value', 'rolling_mean']].head(10))
print()

# Expanding window
df_ts['expanding_mean'] = df_ts['value'].expanding().mean()
print("Expanding mean:")
print(df_ts[['value', 'expanding_mean']].head(10))
print()

Weekly resampled data:
                 value
date                  
2024-01-07  100.477451
2024-01-14   99.525826
2024-01-21   96.796580
2024-01-28   94.857637
2024-02-04   94.654703

Rolling 7-day mean:
                 value  rolling_mean
date                                
2024-01-01  101.070912           NaN
2024-01-02   99.856025           NaN
2024-01-03  100.085406           NaN
2024-01-04  100.947886           NaN
2024-01-05  101.211319           NaN
2024-01-06   99.973992           NaN
2024-01-07  100.196613    100.477451
2024-01-08  100.225980    100.356746
2024-01-09   99.723389    100.337798
2024-01-10   99.278046    100.222461

Expanding mean:
                 value  expanding_mean
date                                  
2024-01-01  101.070912      101.070912
2024-01-02   99.856025      100.463469
2024-01-03  100.085406      100.337448
2024-01-04  100.947886      100.490057
2024-01-05  101.211319      100.634310
2024-01-06   99.973992      100.524257
2024-01-07  100.196613

In [27]:

# ===== 9. COMBINING DATAFRAMES =====

df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
df3 = pd.DataFrame({'C': [9, 10], 'D': [11, 12]})

# Concatenation
concat_result = pd.concat([df1, df2], ignore_index=True)
print("Concatenation (vertical):")
print(concat_result)
print()

# Concatenation (horizontal)
concat_horizontal = pd.concat([df1, df3], axis=1)
print("Concatenation (horizontal):")
print(concat_horizontal)
print()

Concatenation (vertical):
   A  B
0  1  3
1  2  4
2  5  7
3  6  8

Concatenation (horizontal):
   A  B   C   D
0  1  3   9  11
1  2  4  10  12



In [28]:

# ===== 10. PERFORMANCE OPTIMIZATION =====

# Efficient data types
df_optimize = pd.DataFrame({
    'int_col': [1, 2, 3, 4, 5],
    'float_col': [1.1, 2.2, 3.3, 4.4, 5.5],
    'str_col': ['A', 'B', 'C', 'D', 'E']
})

# Convert to more efficient types
df_optimize['int_col'] = df_optimize['int_col'].astype('int8')
df_optimize['str_col'] = df_optimize['str_col'].astype('category')

print("Memory usage comparison:")
print("Original types:", df_optimize.dtypes)
print("Memory usage:", df_optimize.memory_usage(deep=True))
print()

# Vectorized operations (faster than loops)
large_df = pd.DataFrame({'A': np.random.randn(1000000)})

# Vectorized operation
large_df['B'] = large_df['A'] * 2 + 1
print("Vectorized operation completed on 1M rows")
print()

Memory usage comparison:
Original types: int_col          int8
float_col     float64
str_col      category
dtype: object
Memory usage: Index        132
int_col        5
float_col     40
str_col      427
dtype: int64

Vectorized operation completed on 1M rows



In [None]:


# ===== 11. ENCODING FOR ML =====

# Sample categorical data
df_categorical = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'red', 'blue'],
    'size': ['small', 'medium', 'large', 'small', 'large'],
    'price': [10, 20, 30, 15, 25]
})

# One-hot encoding
df_onehot = pd.get_dummies(df_categorical, columns=['color', 'size'], prefix=['color', 'size'])
print("One-hot encoded data:")
print(df_onehot)
print()

# Label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_categorical['color_encoded'] = le.fit_transform(df_categorical['color'])
print("Label encoded data:")
print(df_categorical[['color', 'color_encoded']])
print()

# ===== 12. ADVANCED INDEXING WITH QUERY =====

# Sample data for querying
df_query = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'salary': [50000, 60000, 70000, 55000, 65000],
    'department': ['IT', 'HR', 'IT', 'Finance', 'IT']
})

# Using query for filtering
it_employees = df_query.query('department == "IT" and age > 30')
print("IT employees over 30:")
print(it_employees)
print()

# Complex query with variables
min_salary = 55000
high_earners = df_query.query('salary > @min_salary and age < 35')
print("High earners under 35:")
print(high_earners)
print()

# ===== 13. SQL INTEGRATION =====

# Create in-memory SQLite database
conn = sqlite3.connect(':memory:')

# Write DataFrame to SQL
df_customers.to_sql('customers', conn, index=False, if_exists='replace')
df_orders.to_sql('orders', conn, index=False, if_exists='replace')

# Read from SQL with custom query
sql_query = """
SELECT c.name, c.city, SUM(o.amount) as total_spent
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.name, c.city
"""

df_from_sql = pd.read_sql_query(sql_query, conn)
print("Data from SQL query:")
print(df_from_sql)
print()

conn.close()

# ===== 14. WINDOW FUNCTIONS =====

# Sample stock data
df_stock = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=10),
    'price': [100, 102, 98, 105, 108, 106, 110, 107, 112, 115]
})

# Expanding window for cumulative statistics
df_stock['cumulative_max'] = df_stock['price'].expanding().max()
df_stock['cumulative_min'] = df_stock['price'].expanding().min()

# Exponentially weighted moving average
df_stock['ewm'] = df_stock['price'].ewm(span=5).mean()

print("Stock data with window functions:")
print(df_stock)
print()

# ===== 15. SERIALIZATION =====

# Save DataFrame to pickle
df_students.to_pickle('students.pkl')
print("DataFrame saved to pickle")

# Load DataFrame from pickle
df_loaded = pd.read_pickle('students.pkl')
print("DataFrame loaded from pickle:")
print(df_loaded)
print()

# ===== 16. SETTINGWITHCOPYWARNING AND .loc[] =====

# Create sample data
df_copy = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50]
})

# WRONG way (may cause SettingWithCopyWarning)
# df_subset = df_copy[df_copy['A'] > 2]
# df_subset['C'] = df_subset['A'] * 2  # This might cause warning

# CORRECT way using .loc[]
df_copy.loc[df_copy['A'] > 2, 'C'] = df_copy.loc[df_copy['A'] > 2, 'A'] * 2

print("Properly modified DataFrame using .loc[]:")
print(df_copy)
print()

# ===== 17. STATISTICAL FUNCTIONS =====

# Sample dataset for statistics
df_stats = pd.DataFrame({
    'A': np.random.normal(100, 15, 1000),
    'B': np.random.normal(50, 10, 1000),
    'C': np.random.exponential(2, 1000)
})

# Comprehensive statistics
stats_summary = df_stats.describe()
print("Comprehensive statistics:")
print(stats_summary)
print()

# Custom percentiles
custom_percentiles = df_stats.describe(percentiles=[.1, .25, .5, .75, .9])
print("Custom percentiles:")
print(custom_percentiles)
print()

# Correlation matrix
correlation_matrix = df_stats.corr()
print("Correlation matrix:")
print(correlation_matrix)
print()

# Skewness and kurtosis
print("Skewness:")
print(df_stats.skew())
print()

print("Kurtosis:")
print(df_stats.kurtosis())
print()

print("=== ADVANCED PANDAS CONCEPTS DEMONSTRATED ===")
print("This comprehensive example covers all major advanced pandas concepts")
print("including pivot tables, multi-indexing, custom functions, merging,")
print("missing data handling, time series operations, and performance optimization.")

Pivot Table - Sales by Product and Region:
Region   North  South
Product              
A          330      0
B            0    330

Cross-tabulation with margins:
Region   North  South  All
Product                   
A            3      0    3
B            0      2    2
All          3      2    5

MultiIndex DataFrame:
                  col1      col2      col3
first second                              
A     X      -0.921914 -0.112575 -0.524333
      Y      -0.529446 -0.838279  0.248345
B     X      -0.456554  0.641452 -0.970105
      Y      -0.203389  0.401462 -0.620252

Access level 'A':
            col1      col2      col3
second                              
X      -0.921914 -0.112575 -0.524333
Y      -0.529446 -0.838279  0.248345

Stacked DataFrame:
first  second      
A      X       col1   -0.921914
               col2   -0.112575
               col3   -0.524333
       Y       col1   -0.529446
               col2   -0.838279
dtype: float64

Students with grades (using apply):
  

  df_scaled = df_numeric.applymap(lambda x: x / 100)
  df_ffill = df_missing.fillna(method='ffill')
  df_bfill = df_missing.fillna(method='bfill')
  customer_summary = df_transactions.groupby('customer_id').apply(custom_agg)


Label encoded data:
   color  color_encoded
0    red              2
1   blue              0
2  green              1
3    red              2
4   blue              0

IT employees over 30:
      name  age  salary department
2  Charlie   35   70000         IT
4      Eve   32   65000         IT

High earners under 35:
  name  age  salary department
1  Bob   30   60000         HR
4  Eve   32   65000         IT

Data from SQL query:
      name     city  total_spent
0    Alice      NYC        180.0
1      Bob       LA        350.0
2  Charlie  Chicago        120.0
3    Diana      NYC          NaN
4      Eve   Boston          NaN

Stock data with window functions:
        date  price  cumulative_max  cumulative_min         ewm
0 2024-01-01    100           100.0           100.0  100.000000
1 2024-01-02    102           102.0           100.0  101.200000
2 2024-01-03     98           102.0            98.0   99.684211
3 2024-01-04    105           105.0            98.0  101.892308
4 2024-01-05    