# Day 5 - Pandas: Data Manipulation II
Advanced reshaping, function application, mapping, and combining DataFrames.

## Load Superstore Data and Clean Columns

In [1]:
import pandas as pd

# Load the data
df = pd.read_csv('superstore.csv', encoding='ISO-8859-1')

# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace("-", "_")
df.head()

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


## Task 1: Pivot, Pivot Table, and Melt

In [2]:
# Pivot Table
pivot = df.pivot_table(values='sales', index='region', columns='category', aggfunc='sum')
print("Pivot Table - Sales by Region and Category:")
print(pivot)

# Melt
melted = df.melt(id_vars=['order_id'], value_vars=['sales', 'profit'], var_name='metric', value_name='amount')
print("Melted version:")
print(melted.head())

Pivot Table - Sales by Region and Category:
category    Furniture  Office Supplies  Technology
region                                            
Central   163797.1638       167026.415  170416.312
East      208291.2040       205516.055  264973.981
South     117298.6840       125651.313  148771.908
West      252612.7435       220853.249  251991.832
Melted version:
         order_id metric    amount
0  CA-2016-152156  sales  261.9600
1  CA-2016-152156  sales  731.9400
2  CA-2016-138688  sales   14.6200
3  US-2015-108966  sales  957.5775
4  US-2015-108966  sales   22.3680


## Task 2: Apply Custom Function

In [3]:
# Add profit margin column
df['profit_margin'] = df.apply(lambda row: row['profit'] / row['sales'] if row['sales'] != 0 else 0, axis=1)

# Classify margins
df['margin_category'] = df['profit_margin'].apply(lambda x: 'High' if x > 0.3 else 'Medium' if x > 0 else 'Low')
df[['sales', 'profit', 'profit_margin', 'margin_category']].head()

Unnamed: 0,sales,profit,profit_margin,margin_category
0,261.96,41.9136,0.16,Medium
1,731.94,219.582,0.3,Medium
2,14.62,6.8714,0.47,High
3,957.5775,-383.031,-0.4,Low
4,22.368,2.5164,0.1125,Medium


## Task 3: Replace and Map

In [4]:
# Replace and map
df['segment'] = df['segment'].replace('Consumer', 'Retail')
df['country_code'] = df['country'].map({'United States': 'US', 'Canada': 'CA'})
df[['country', 'country_code']].drop_duplicates().head()

Unnamed: 0,country,country_code
0,United States,US


## Task 4: Concatenate DataFrames

In [5]:
# Concatenate examples
df1 = df.head(3)
df2 = df.tail(3)

# Vertical
vertical_concat = pd.concat([df1, df2], ignore_index=True)

# Horizontal
horizontal_concat = pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)

print("Vertical Concat:")
print(vertical_concat)
print("Horizontal Concat:")
print(horizontal_concat)

Vertical Concat:
   row_id        order_id order_date   ship_date       ship_mode customer_id  \
0       1  CA-2016-152156  11/8/2016  11/11/2016    Second Class    CG-12520   
1       2  CA-2016-152156  11/8/2016  11/11/2016    Second Class    CG-12520   
2       3  CA-2016-138688  6/12/2016   6/16/2016    Second Class    DV-13045   
3    9992  CA-2017-121258  2/26/2017    3/3/2017  Standard Class    DB-13060   
4    9993  CA-2017-121258  2/26/2017    3/3/2017  Standard Class    DB-13060   
5    9994  CA-2017-119914   5/4/2017    5/9/2017    Second Class    CC-12220   

     customer_name    segment        country         city  ...  \
0      Claire Gute     Retail  United States    Henderson  ...   
1      Claire Gute     Retail  United States    Henderson  ...   
2  Darrin Van Huff  Corporate  United States  Los Angeles  ...   
3      Dave Brooks     Retail  United States   Costa Mesa  ...   
4      Dave Brooks     Retail  United States   Costa Mesa  ...   
5     Chris Cortes     Ret

## Task 5: Mini Pipeline Exercise

In [6]:
# Filter data
filtered = df[df['sales'] > 100]

# Pivot
summary = filtered.pivot_table(values='profit', index='category', columns='region', aggfunc='sum')

# Profit level
filtered['profit_level'] = filtered['profit'].apply(lambda x: 'High' if x > 100 else 'Low')

print("Summary Pivot Table:")
print(summary)
filtered[['sales', 'profit', 'profit_level']].head()

Summary Pivot Table:
region              Central        East       South        West
category                                                       
Furniture         -909.3711   1179.5586   5757.4516   9891.6939
Office Supplies  10143.6988  31779.9877  15827.9737  39374.6039
Technology       32560.1056  45808.8592  18936.2155  42701.2529


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['profit_level'] = filtered['profit'].apply(lambda x: 'High' if x > 100 else 'Low')


Unnamed: 0,sales,profit,profit_level
0,261.96,41.9136,Low
1,731.94,219.582,High
3,957.5775,-383.031,Low
7,907.152,90.7152,Low
9,114.9,34.47,Low
