# Day 4 - Pandas: Data Manipulation I (Superstore Dataset)
Working with real-world retail data to access, filter, clean, group, and merge.

## Import Libraries and Load Dataset

In [22]:
import pandas as pd

# Load the renamed Superstore CSV
df = pd.read_csv('superstore.csv', encoding='ISO-8859-1')

# Clean column names for consistency
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-', '_')

# Preview data
print(df.head())

   row_id        order_id  order_date   ship_date       ship_mode customer_id  \
0       1  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
1       2  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
2       3  CA-2016-138688   6/12/2016   6/16/2016    Second Class    DV-13045   
3       4  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   
4       5  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   

     customer_name    segment        country             city  ...  \
0      Claire Gute   Consumer  United States        Henderson  ...   
1      Claire Gute   Consumer  United States        Henderson  ...   
2  Darrin Van Huff  Corporate  United States      Los Angeles  ...   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   

  postal_code  region       product_id         category sub_category  \
0       42420   Sout

## Task 1: Indexing and Slicing

In [23]:
# Access using loc and iloc
print(df.loc[0, 'sales'])
print(df.iloc[0:3, 3:6])

# Select rows and columns
print(df[['region', 'sales', 'profit']])
print(df[df['sales'] > 500])


261.96
    ship_date     ship_mode customer_id
0  11/11/2016  Second Class    CG-12520
1  11/11/2016  Second Class    CG-12520
2   6/16/2016  Second Class    DV-13045
     region     sales    profit
0     South  261.9600   41.9136
1     South  731.9400  219.5820
2      West   14.6200    6.8714
3     South  957.5775 -383.0310
4     South   22.3680    2.5164
...     ...       ...       ...
9989  South   25.2480    4.1028
9990   West   91.9600   15.6332
9991   West  258.5760   19.3932
9992   West   29.6000   13.3200
9993   West  243.1600   72.9480

[9994 rows x 3 columns]
      row_id        order_id  order_date   ship_date       ship_mode  \
1          2  CA-2016-152156   11/8/2016  11/11/2016    Second Class   
3          4  US-2015-108966  10/11/2015  10/18/2015  Standard Class   
7          8  CA-2014-115812    6/9/2014   6/14/2014  Standard Class   
10        11  CA-2014-115812    6/9/2014   6/14/2014  Standard Class   
11        12  CA-2014-115812    6/9/2014   6/14/2014  Standard C

## Task 2: Filtering and Sorting

In [24]:
# Filter rows with sales > 500 and region is 'South'
filtered = df[(df['sales'] > 500) & (df['region'] == 'South')]
print(filtered[['order_id', 'sales', 'region']])

# Sort by Profit and then Category
df.sort_by_profit = df.sort_values('profit', ascending=False)
df.sort_by_category_sales = df.sort_values(by=['category', 'sales'])
print(df.sort_by_profit.head())
print(df.sort_by_category_sales.head())


            order_id      sales region
1     CA-2016-152156   731.9400  South
3     US-2015-108966   957.5775  South
72    US-2015-134026   831.9360  South
182   CA-2014-158274   503.9600  South
232   US-2017-100930   620.6145  South
...              ...        ...    ...
9639  CA-2015-116638  4297.6440  South
9689  US-2017-135986   503.9600  South
9732  CA-2014-114321   500.2400  South
9734  CA-2014-114321   896.9900  South
9816  CA-2015-162201   516.9600  South

[199 rows x 3 columns]
      row_id        order_id  order_date   ship_date       ship_mode  \
6826    6827  CA-2016-118689   10/2/2016   10/9/2016  Standard Class   
8153    8154  CA-2017-140151   3/23/2017   3/25/2017     First Class   
4190    4191  CA-2017-166709  11/17/2017  11/22/2017  Standard Class   
9039    9040  CA-2016-117121  12/17/2016  12/21/2016  Standard Class   
4098    4099  CA-2014-116904   9/23/2014   9/28/2014  Standard Class   

     customer_id  customer_name    segment        country         city  ...

  df.sort_by_profit = df.sort_values('profit', ascending=False)
  df.sort_by_category_sales = df.sort_values(by=['category', 'sales'])


## Task 3: Handling Missing Data

In [25]:
# Check for missing values
print(df.isnull().sum())

# Fill or drop missing values
df_filled = df.fillna(0)
df_dropped = df.dropna()

# Show filled data example
print(df_filled.head())


row_id           0
order_id         0
order_date       0
ship_date        0
ship_mode        0
customer_id      0
customer_name    0
segment          0
country          0
city             0
state            0
postal_code      0
region           0
product_id       0
category         0
sub_category     0
product_name     0
sales            0
quantity         0
discount         0
profit           0
dtype: int64
   row_id        order_id  order_date   ship_date       ship_mode customer_id  \
0       1  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
1       2  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
2       3  CA-2016-138688   6/12/2016   6/16/2016    Second Class    DV-13045   
3       4  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   
4       5  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   

     customer_name    segment        country             city  ...  \
0      Claire Gute   Consumer  

## Task 4: GroupBy Operations

In [26]:
# Average sales by category
print(df.groupby('category')['sales'].mean())

# Aggregation by region
region_stats = df.groupby('region')['sales'].agg(['sum', 'mean', 'count']).reset_index()
print(region_stats)


category
Furniture          349.834887
Office Supplies    119.324101
Technology         452.709276
Name: sales, dtype: float64
    region          sum        mean  count
0  Central  501239.8908  215.772661   2323
1     East  678781.2400  238.336110   2848
2    South  391721.9050  241.803645   1620
3     West  725457.8245  226.493233   3203


## Task 5: Merging Example

In [27]:
# Create dummy customer and orders tables
orders = df[['order_id', 'customer_id']]
customers = df[['customer_id', 'customer_name']].drop_duplicates()

# Merge the two
merged = pd.merge(orders, customers, on='customer_id', how='inner')
print(merged.head())


         order_id customer_id    customer_name
0  CA-2016-152156    CG-12520      Claire Gute
1  CA-2016-152156    CG-12520      Claire Gute
2  CA-2016-138688    DV-13045  Darrin Van Huff
3  US-2015-108966    SO-20335   Sean O'Donnell
4  US-2015-108966    SO-20335   Sean O'Donnell
