# Global Superstore 2016 - Data Cleaning
**Project:** Global Superstore Sales Analysis

**Objective:** Clean messy dataset and prepare for SQL, Tableau, and Power BI analysis

**Input:** superstore_dirty.csv (52,315 rows with quality issues)

In [48]:
from google.colab import drive
import pandas as pd
import numpy as np

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
# Load dirty dataset
df = pd.read_csv('/content/drive/MyDrive/Superstore Project/superstore_dirty.csv')

print("✅ Dirty data loaded!")
print(f"   Rows: {len(df):,}")
print(f"   Columns: {len(df.columns)}")

✅ Dirty data loaded!
   Rows: 52,315
   Columns: 24


## PHASE 1: DATA INSPECTION

In [50]:
# Data Inspection
print("="*80)
print("PHASE 1: DATA INSPECTION")
print("="*80)

# Main Info
print("Dataset Shape: " + str(df.shape))
print("Rows: " + str(df.shape[0]))
print("Columns: " + str(df.shape[1]))

# Column names
print()
for col in df.columns:
    print(f"   {col}")

PHASE 1: DATA INSPECTION
Dataset Shape: (52315, 24)
Rows: 52315
Columns: 24

   Row ID
   Order ID
   Order Date
   Ship Date
   Ship Mode
   Customer ID
   Customer Name
   Segment
   Postal Code
   City
   State
   Country
   Region
   Market
   Product ID
   Category
   Sub-Category
   Product Name
   Sales
   Quantity
   Discount
   Profit
   Shipping Cost
   Order Priority


In [51]:
# Data types
print()
print(df.dtypes)


Row ID              int64
Order ID           object
Order Date         object
Ship Date          object
Ship Mode          object
Customer ID        object
Customer Name      object
Segment            object
Postal Code       float64
City               object
State              object
Country            object
Region             object
Market             object
Product ID         object
Category           object
Sub-Category       object
Product Name       object
Sales              object
Quantity          float64
Discount           object
Profit             object
Shipping Cost      object
Order Priority     object
dtype: object


In [52]:
# First 10 rows
print()
df.head()




Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Postal Code,City,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
0,31595,CA-2014-JD15895140-41941,2014-10-29,2014-11-04,Standard Class,JD-158951406,Jonathan Doherty,Corporate,7109.0,Belleville,...,OFF-PA-6587,Office Supplies,Paper,Xerox 223,$32.40,5.0,0.0,$15.55,$2.25,Medium
1,46478,NI-2014-MM805595-41730,2014-04-01,2014-04-06,Standard Class,MM-805595,Michelle Moray,Consumer,,Jos,...,OFF-LA-5383,Office Supplies,Labels,"Novimex File Folder Labels, Adjustable",$1.84,1.0,70%,$-3.08,$1.17,High
2,11776,ES-2015-RP19855139-42063,2015-02-28,2015-03-02,Second Class,RP-19855139,Roy Phan,Corporate,,Cardiff,...,OFF-EN-4904,Office Supplies,Envelopes,"Jiffy Business Envelopes, Security-Tint",$31.92,2.0,0.0,$1.86,$4.26,High
3,7093,MX-2015-CA1205582-42276,2015-09-29,2015-10-05,Std Class,CA-1205582,Cathy Armstrong,Home-Office,,León,...,TEC-PH-3798,Technology,Phones,"Cisco Office Telephone, VoIP",$107.40,2.0,0.0,$44.00,$10.39,Low
4,13151,ES-2015-FH1435048-42152,2015-05-28,2015-05-30,Second Class,FH-1435048,Fred Harton,consumer,,Flensburg,...,OFF-AR-3488,Office Supplies,Art,"Binney & Smith Markers, Blue",$120.45,5.0,0.0,$22.80,$12.83,High


In [53]:
# Statistical summary
print()
df.describe()




Unnamed: 0,Row ID,Postal Code,Quantity
count,52315.0,10197.0,52144.0
mean,25658.868565,55155.37158,3.40392
std,14802.776422,32050.960458,2.380708
min,1.0,1040.0,-14.0
25%,12852.5,23223.0,2.0
50%,25670.0,55901.0,3.0
75%,38471.5,90008.0,5.0
max,51290.0,99301.0,14.0


In [54]:
for col in sorted(df.columns.unique()):
    unique_vals = sorted(df[col].dropna().unique())
    print(f"Unique values: {len(df[col].unique())}")
    print(f"{col}: {unique_vals}")
    print()

Unique values: 3
Category: ['Furniture', 'Office Supplies', 'Technology']

Unique values: 5344
City: ['  Aachen  ', '  Aalst  ', '  Aba  ', '  Abadan  ', '  Abeokuta  ', '  Abha  ', '  Abidjan  ', '  Acarigua  ', '  Acayucan  ', '  Accra  ', '  Acireale  ', '  Acuña  ', '  Acámbaro  ', '  Ad Diwaniyah  ', '  Ad Diwem  ', '  Adana  ', '  Adelaide  ', '  Adiyaman  ', '  Agadir  ', '  Agra  ', '  Ahvaz  ', '  Aix-en-Provence  ', '  Ajmer  ', '  Akhisar  ', '  Akron  ', '  Al Fayyum  ', '  Al Hufuf  ', '  Al Khums  ', '  Al Muharraq  ', '  Albany  ', '  Albertville  ', '  Albuquerque  ', '  Albury  ', '  Alexandria  ', '  Algiers  ', '  Alicante  ', '  Alice Springs  ', '  Allentown  ', '  Almaty  ', '  Alphen aan den Rijn  ', '  Altamura  ', '  Amadora  ', '  Amarillo  ', '  Amatitlán  ', '  Ambattur  ', '  Amman  ', '  Amol  ', '  Amravati  ', '  Amritsar  ', '  Amstelveen  ', '  Amsterdam  ', '  Anaheim  ', '  Anantapur  ', '  Andijon  ', '  Andradina  ', '  Angarsk  ', '  Angers  ', ' 

##PHASE 2: Date Types fixing

In [55]:
df_clean = df.copy()

In [56]:
# 1) Dates -> datetime
df_clean['Order Date'] = pd.to_datetime(df_clean['Order Date'], errors='coerce')
df_clean['Ship Date'] = pd.to_datetime(df_clean['Ship Date'], errors='coerce')

# 2) Numerics -> numeric
num_cols = ['Sales','Discount','Profit','Shipping Cost', 'Quantity']
for col in num_cols:
    df_clean[col] = df_clean[col].astype('string').str.strip().str.replace(r'[\$,%]', '', regex=True).str.replace(',', '')
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# 3) Text columns trimmen (nur die wirklich text sind)
text_cols = ['Customer ID', 'Customer Name', 'Segment', 'City', 'State',
             'Country', 'Region', 'Market', 'Product ID', 'Product Name',
             'Category', 'Sub-Category', 'Ship Mode', 'Order Priority']
for col in text_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].astype('string').str.strip()

# 4) Postal Code
df_clean['Postal Code'] = df_clean['Postal Code'].astype('string').str.replace(r'\.0$', '', regex=True).str.strip()

In [57]:
df_clean.dtypes

Unnamed: 0,0
Row ID,int64
Order ID,object
Order Date,datetime64[ns]
Ship Date,datetime64[ns]
Ship Mode,string[python]
Customer ID,string[python]
Customer Name,string[python]
Segment,string[python]
Postal Code,string[python]
City,string[python]


In [58]:
# Remove rows with missing Ship Mode (critical field)
print("\n📦 SHIP MODE VALIDATION:")
missing_ship_mode = df_clean['Ship Mode'].isna().sum()
print(f"   Rows with missing Ship Mode: {missing_ship_mode:,}")

if missing_ship_mode > 0:
    print(f"   ⚠️  Removing {missing_ship_mode:,} rows with missing Ship Mode (critical field)")
    df_clean = df_clean[df_clean['Ship Mode'].notna()].copy()
    print(f"   ✅ Dataset now has {len(df_clean):,} rows")
else:
    print(f"   ✅ All rows have Ship Mode")


📦 SHIP MODE VALIDATION:
   Rows with missing Ship Mode: 542
   ⚠️  Removing 542 rows with missing Ship Mode (critical field)
   ✅ Dataset now has 51,773 rows


##PHASE 3: Standardization of Names

In [59]:
import pandas as pd

# --- DISCOUNT: '70%' -> 0.70, '0.0' -> 0.0, '0%' -> 0.0 ---
df_clean['Discount'] = (
    df_clean['Discount'].astype('string').str.strip()
      .str.replace('%', '', regex=False)
)
df_clean['Discount'] = pd.to_numeric(df_clean['Discount'], errors='coerce')
# wenn Werte wie 70 statt 0.70 vorliegen -> auf 0-1 skalieren
df_clean.loc[df_clean['Discount'] > 1, 'Discount'] /= 100


# --- SEGMENT: Variationen vereinheitlichen ---
seg_map = {
    'consumer': 'Consumer',
    'consumers': 'Consumer',
    'corporate': 'Corporate',
    'corp': 'Corporate',
    'home office': 'Home Office',
    'home-office': 'Home Office',
    'homeoffice': 'Home Office',
}
df_clean['Segment'] = (
    df_clean['Segment'].astype('string').str.strip()
      .str.lower()
      .replace(seg_map)
      .str.title()
)
# "Home Office" wird durch .title() korrekt, aber falls "Home Office" mal kaputt geht:
df_clean['Segment'] = df_clean['Segment'].replace({'Home Office': 'Home Office'})


# --- SHIP MODE: Variationen vereinheitlichen ---
ship_map = {
    'first class': 'First Class',
    '1st class': 'First Class',
    'firstclass': 'First Class',

    'second class': 'Second Class',
    '2nd class': 'Second Class',
    'secondclass': 'Second Class',

    'same day': 'Same Day',
    'same-day': 'Same Day',
    'sameday': 'Same Day',

    'standard': 'Standard Class',
    'standard class': 'Standard Class',
    'std class': 'Standard Class',
}
df_clean['Ship Mode'] = (
    df_clean['Ship Mode'].astype('string').str.strip()
      .str.lower()
      .str.replace(r'\s+', ' ', regex=True)
      .replace(ship_map)
)

In [60]:
def show_uniques(df, col, sort=True):
    vals = df[col].dropna().unique()
    if sort:
        try:
            vals = sorted(vals)
        except TypeError:
            vals = sorted(map(str, vals))
    print(f"\nUnique values: {len(vals)}")
    print(f"{col}: {vals}")

show_uniques(df_clean, 'Discount')
show_uniques(df_clean, 'Segment')
show_uniques(df_clean, 'Ship Mode')



Unique values: 23
Discount: [np.float64(0.0), np.float64(0.07), np.float64(0.1), np.float64(0.15), np.float64(0.17), np.float64(0.2), np.float64(0.25), np.float64(0.27), np.float64(0.3), np.float64(0.32), np.float64(0.35), np.float64(0.37), np.float64(0.4), np.float64(0.45), np.float64(0.47), np.float64(0.5), np.float64(0.55), np.float64(0.57), np.float64(0.6), np.float64(0.65), np.float64(0.7), np.float64(0.8), np.float64(0.85)]

Unique values: 3
Segment: ['Consumer', 'Corporate', 'Home Office']

Unique values: 4
Ship Mode: ['First Class', 'Same Day', 'Second Class', 'Standard Class']


In [61]:
import pandas as pd

cols = ['Country', 'City', 'State']

def normalize_cap_first(s: pd.Series) -> pd.Series:
    return (s.astype('string')
             .str.replace(r'[\r\n\t]+', ' ', regex=True)
             .str.replace(r'\s+', ' ', regex=True)
             .str.strip()
             .str.strip("'\"")
             .str.lower()
             .str.capitalize())

# BEFORE: raw df (inkl. NA zählen)
before_counts = {c: df[c].astype('string').nunique(dropna=False) for c in cols}
print("BEFORE (df raw, incl NA):", before_counts)


# AFTER: nur Namensanpassung auf df_clean
for c in cols:
    df_clean[c] = normalize_cap_first(df_clean[c])

after_counts = {c: df_clean[c].astype('string').nunique(dropna=False) for c in cols}
print("AFTER (df_clean, nur Namen angepasst, incl NA):", after_counts)



BEFORE (df raw, incl NA): {'Country': 292, 'City': 5344, 'State': 1824}
AFTER (df_clean, nur Namen angepasst, incl NA): {'Country': 165, 'City': 3635, 'State': 1101}


In [62]:
s = df_clean["Country"].astype("string").str.strip().str.title()

# only lowercase small words when preceded by a space (i.e., not the first word)
small = r"(?<=\s)(Of|The|And|Or|In|On|At|To|For|From|By|De|Da|Di|Du|Del|La|Le|El)\b"
df_clean["Country"] = s.str.replace(small, lambda m: m.group(0).lower(), regex=True)


for country in sorted(df_clean["Country"].unique()):
  print(country)

Afghanistan
Albania
Algeria
Angola
Argentina
Armenia
Australia
Austria
Azerbaijan
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
Bulgaria
Burkina Faso
Burundi
Cambodia
Cameroon
Canada
Central African Republic
Chad
Chile
China
Colombia
Costa Rica
Cote D'Ivoire
Croatia
Cuba
Cyprus
Czech Republic
Democratic Republic of the Congo
Denmark
Djibouti
Dominican Republic
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Ethiopia
Finland
France
French Guiana
Gabon
Georgia
Germany
Ghana
Greece
Guadeloupe
Guatemala
Guinea
Guinea-Bissau
Guyana
Haiti
Honduras
Hong Kong
Hungary
India
Indonesia
Iran
Iraq
Ireland
Israel
Italy
Jamaica
Japan
Jordan
Kazakhstan
Kenya
Kuwait
Kyrgyzstan
Laos
Lebanon
Lesotho
Liberia
Libya
Lithuania
Luxembourg
Macedonia
Madagascar
Malawi
Malaysia
Mali
Martinique
Mauritania
Mexico
Moldova
Mongolia
Montenegro
Morocco
Mozambique
Myanmar (Burma)
Namibia
Nepal
Netherlands
New Zealand
Nicaragua
Niger
Nigeria
No

## PHASE 4: MISSING VALUES

In [63]:
print("\n" + "="*80)
print("PHASE 2: MISSING VALUES")
print("="*80)

# Check missing values
print("\n🔍 Missing values per column:")
missing = df_clean.isnull().sum()
if missing.sum() > 0:
    for col in missing[missing > 0].index:
        print(f"   {col}: {missing[col]:,} ({missing[col]/len(df_clean)*100:.2f}%)")
else:
    print("   ✓ No missing values found")

# Decision: Postal Code is OK to be missing (international data)


# Remove rows with missing CRITICAL fields
critical_fields = ['Order ID', 'Customer ID', 'Product ID', 'Order Date', 'Sales', 'Quantity']
before = len(df_clean)
df_clean = df_clean.dropna(subset=critical_fields)
removed = before - len(df_clean)

print("\nRows before: "+ str(before))
print(f"Removed {removed:,} rows with missing critical fields")
print(f"Remaining rows: {len(df_clean):,}")


PHASE 2: MISSING VALUES

🔍 Missing values per column:
   Customer Name: 2,576 (4.98%)
   Postal Code: 41,682 (80.51%)
   City: 1,534 (2.96%)
   State: 1,013 (1.96%)
   Sales: 232 (0.45%)
   Quantity: 171 (0.33%)

Rows before: 51773
Removed 403 rows with missing critical fields
Remaining rows: 51,370


## PHASE 5: DUPLICATES

In [64]:
print("\n" + "="*80)
print("PHASE 5: DUPLICATES")
print("="*80)

# Check for duplicate rows
duplicates = df_clean.duplicated().sum()
print(f"\n🔍 Total duplicate rows: {duplicates:,}")

if duplicates > 0:
    df_clean = df_clean.drop_duplicates()
    print(f"   📋 Removed {duplicates:,} duplicate rows")
else:
    print("   ✓ No duplicates found")

# Check for duplicate Row IDs
dup_ids = df_clean['Row ID'].duplicated().sum()
print(f"\n🔍 Duplicate Row IDs: {dup_ids:,}")
if dup_ids > 0:
    df_clean = df_clean.drop_duplicates(subset=['Row ID'], keep='first')
    print(f"   📋 Removed {dup_ids:,} duplicate Row IDs")
else:
    print("   ✓ All Row IDs are unique")

print(f"\n   Remaining rows: {len(df_clean):,}")


PHASE 5: DUPLICATES

🔍 Total duplicate rows: 945
   📋 Removed 945 duplicate rows

🔍 Duplicate Row IDs: 54
   📋 Removed 54 duplicate Row IDs

   Remaining rows: 50,371


## PHASE 6: DATA VALIDATION

In [65]:
df_clean.dtypes

Unnamed: 0,0
Row ID,int64
Order ID,object
Order Date,datetime64[ns]
Ship Date,datetime64[ns]
Ship Mode,string[python]
Customer ID,string[python]
Customer Name,string[python]
Segment,string[python]
Postal Code,string[python]
City,string[python]


In [66]:
print("\n" + "="*80)
print("PHASE 6: DATA VALIDATION")
print("="*80)

# Remove invalid values
print("\n🔍 Removing invalid values...")
before = len(df_clean)
print(df_clean.Sales.dtype)
df_clean = df_clean[(df_clean['Sales'] > 0) & (df_clean['Quantity'] > 0)]
print(f"   📋 Removed {before - len(df_clean):,} rows with negative/zero values")

# Fix date logic errors
print("\n📅 Validating dates...")
before = len(df_clean)
df_clean = df_clean[df_clean['Ship Date'] >= df_clean['Order Date']]
print(f"   📋 Removed {before - len(df_clean):,} rows with date errors")


PHASE 6: DATA VALIDATION

🔍 Removing invalid values...
Float64
   📋 Removed 763 rows with negative/zero values

📅 Validating dates...
   📋 Removed 242 rows with date errors


## PHASE 7: DERIVED COLUMNS

In [67]:
print("\n" + "="*80)
print("PHASE 7: DERIVED COLUMNS")
print("="*80)

print("\n📅 Adding derived date fields...")
df_clean['Year'] = df_clean['Order Date'].dt.year
df_clean['Month'] = df_clean['Order Date'].dt.month
df_clean['Month_Name'] = df_clean['Order Date'].dt.month_name()
df_clean['Quarter'] = df_clean['Order Date'].dt.quarter
df_clean['Day_of_Week'] = df_clean['Order Date'].dt.day_name()
df_clean['Week'] = df_clean['Order Date'].dt.isocalendar().week
df_clean['Delivery_Days'] = (df_clean['Ship Date'] - df_clean['Order Date']).dt.days
df_clean['Profit_Margin_Pct'] = (df_clean['Profit'] / df_clean['Sales']) * 100
print("   ✓ All derived columns added")


PHASE 7: DERIVED COLUMNS

📅 Adding derived date fields...
   ✓ All derived columns added


## PHASE 8: FINAL VALIDATION

In [68]:
print("\n" + "="*80)
print("PHASE 8: FINAL VALIDATION")
print("="*80)

print("\n📊 FINAL DATASET SUMMARY:")
print(f"   Original rows: {len(df):,}")
print(f"   Clean rows: {len(df_clean):,}")
print(f"   Rows removed: {len(df) - len(df_clean):,} ({(len(df) - len(df_clean))/len(df)*100:.2f}%)")
print(f"   Columns: {len(df_clean.columns)}")

print("\n📈 DATA QUALITY:")
print(f"   Revenue: ${df_clean['Sales'].sum():,.2f}")
print(f"   Profit: ${df_clean['Profit'].sum():,.2f}")
print(f"   Customers: {df_clean['Customer ID'].nunique():,}")
print(f"   Products: {df_clean['Product ID'].nunique():,}")

print("\n" + "="*80)
print("✅ DATA CLEANING COMPLETE!")
print("="*80)


PHASE 8: FINAL VALIDATION

📊 FINAL DATASET SUMMARY:
   Original rows: 52,315
   Clean rows: 49,366
   Rows removed: 2,949 (5.64%)
   Columns: 32

📈 DATA QUALITY:
   Revenue: $12,152,049.84
   Profit: $1,411,863.53
   Customers: 17,995
   Products: 3,784

✅ DATA CLEANING COMPLETE!


In [69]:
# Save clean dataset
df_clean.to_csv('/content/drive/MyDrive/Superstore Project/superstore_clean_FINAL.csv', index=False)
print("✅ Clean dataset saved: superstore_clean_FINAL.csv")

✅ Clean dataset saved: superstore_clean_FINAL.csv
