<h1 style="color:lightblue;"><b><u>Import Library</u></b></h1>

In [1]:
import pandas as pd
import os # Library untuk berinteraksi dengan sistem operasi (cek file)

# Lift the column cap so wide DataFrames are displayed with every column
pd.options.display.max_columns = None
print("Libraries berhasil di-import.")

Libraries berhasil di-import.


<h1 style="color:lightblue;"><b><u>Tentukan Path dan Muat Data</u></b></h1>

In [3]:
# Relative path from the 'notebooks' folder to the CSV file in the 'data' folder
data_path = "../data/SuperStore.csv"

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"ERROR: File tidak ditemukan di '{data_path}'")
    # Show the absolute path so a wrong working directory is easy to spot
    print(f"Pastikan file ada di: {os.path.abspath(data_path)}")
    # Re-raise instead of continuing: without the data, the next statement
    # would fail with an unrelated NameError on a fresh kernel, or silently
    # display a stale `df` left over from an earlier run.
    raise
else:
    # Only report success when the read actually succeeded
    print(f"Berhasil memuat data dari: {data_path}")
    print(f"Jumlah baris data awal: {len(df)}")

# Show the first 5 rows for inspection
df.head()

Berhasil memuat data dari: ../data/SuperStore.csv
Jumlah baris data awal: 9994


Unnamed: 0,Order_ID,Customer_ID,Postal_Code,Product_ID,Sales,Quantity,Discount,Profit,Category,Sub-Category,Product_Name,Order_Date,Ship_Date,Ship_Mode,Customer_Name,Segment,Country/Region,City,State,Region
0,CA-2019-152156,CG-12520,42420,FUR-BO-10001798,261.96,2,0.0,41.9136,Furniture,Bookcases,Bush Somerset Collection Bookcase,11/8/2019,11/11/2019,Second Class,Claire Gute,Consumer,United States,Henderson,Kentucky,South
1,CA-2019-152156,CG-12520,42420,FUR-CH-10000454,731.94,3,0.0,219.582,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",11/8/2019,11/11/2019,Second Class,Claire Gute,Consumer,United States,Henderson,Kentucky,South
2,CA-2019-138688,DV-13045,90036,OFF-LA-10000240,14.62,2,0.0,6.8714,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,6/12/2019,6/16/2019,Second Class,Darrin Van Huff,Corporate,United States,Los Angeles,California,West
3,US-2018-108966,SO-20335,33311,FUR-TA-10000577,957.5775,5,0.45,-383.031,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,10/11/2018,10/18/2018,Standard Class,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,South
4,US-2018-108966,SO-20335,33311,OFF-ST-10000760,22.368,2,0.2,2.5164,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,10/11/2018,10/18/2018,Standard Class,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,South


<h1 style="color:lightblue;"><b><u>Cek Info dan Missing Values</u></b></h1>

In [4]:
print("--- Info Tipe Data ---")
# DataFrame.info() prints each column's dtype and its non-null count
df.info()

print("\n--- Cek Missing Values ---")
# Per-column count of missing cells
missing_values = df.isna().sum()

# Show only the columns that actually contain missing cells (if any)
print(missing_values.loc[missing_values.gt(0)])

if missing_values.sum() != 0:
    print("-> Ditemukan data yang hilang. Perlu ditangani.")
else:
    print("-> Bagus! Tidak ada data yang hilang (missing values).")

--- Info Tipe Data ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Order_ID        9994 non-null   object 
 1   Customer_ID     9994 non-null   object 
 2   Postal_Code     9994 non-null   int64  
 3   Product_ID      9994 non-null   object 
 4   Sales           9994 non-null   float64
 5   Quantity        9994 non-null   int64  
 6   Discount        9994 non-null   float64
 7   Profit          9994 non-null   float64
 8   Category        9994 non-null   object 
 9   Sub-Category    9994 non-null   object 
 10  Product_Name    9994 non-null   object 
 11  Order_Date      9994 non-null   object 
 12  Ship_Date       9994 non-null   object 
 13  Ship_Mode       9994 non-null   object 
 14  Customer_Name   9994 non-null   object 
 15  Segment         9994 non-null   object 
 16  Country/Region  9994 non-null   object 
 17  City      

<h1 style="color:lightblue;"><b><u>Perbaiki Tipe Data Tanggal</u></b></h1>

<h3 style="color:lightblue;">Kita ubah Order_Date dan Ship_Date dari teks (object) menjadi tanggal (datetime).</h3>

In [6]:
print("Mengubah tipe data 'Order_Date' dan 'Ship_Date' menjadi datetime...")

# The preview rows show dates stored as month/day/year text (e.g. "11/8/2019").
# Passing the format explicitly stops pandas from guessing between month-first
# and day-first, and makes any row that does not match raise instead of being
# parsed differently from the rest.
date_format = "%m/%d/%Y"
for date_col in ("Order_Date", "Ship_Date"):
    df[date_col] = pd.to_datetime(df[date_col], format=date_format)

print("-> Tipe data berhasil diubah.")

# Check the result
print("\nTipe data baru:")
df.dtypes[['Order_Date', 'Ship_Date']]

Mengubah tipe data 'Order_Date' dan 'Ship_Date' menjadi datetime...
-> Tipe data berhasil diubah.

Tipe data baru:


Order_Date    datetime64[ns]
Ship_Date     datetime64[ns]
dtype: object

<h1 style="color:lightblue;"><b><u>Hapus Kolom Tidak Relevan</u></b></h1>

<h3 style="color:lightblue;">Kita hapus kolom Country/Region karena hanya punya satu nilai unik.</h3>

In [7]:
print("Menghapus kolom 'Country/Region'...")
if 'Country/Region' in df.columns:
    # drop() returns a new frame without the column; rebind df to it
    df = df.drop(columns=['Country/Region'])
    print("-> Kolom 'Country/Region' berhasil dihapus.")
else:
    # Reached when the column is absent, e.g. the cell was already run once
    print("-> Kolom 'Country/Region' sudah tidak ada atau salah nama.")

# Show the columns that remain
remaining_columns = df.columns.to_list()
print(f"\nKolom tersisa: {remaining_columns}")

Menghapus kolom 'Country/Region'...
-> Kolom 'Country/Region' berhasil dihapus.

Kolom tersisa: ['Order_ID', 'Customer_ID', 'Postal_Code', 'Product_ID', 'Sales', 'Quantity', 'Discount', 'Profit', 'Category', 'Sub-Category', 'Product_Name', 'Order_Date', 'Ship_Date', 'Ship_Mode', 'Customer_Name', 'Segment', 'City', 'State', 'Region']


<h1 style="color:lightblue;"><b><u>Cek Data Duplikat (Baris Penuh)</u></b></h1>

<h3 style="color:lightblue;">Mengecek apakah ada baris yang semua datanya sama persis.</h3>

In [8]:
# duplicated() flags each row that repeats an earlier row in every column;
# summing the flags gives the number of such rows
duplicate_rows = df.duplicated().sum()
print(f"Jumlah baris duplikat (identik): {duplicate_rows}")

if duplicate_rows == 0:
    print("-> Tidak ada baris duplikat yang identik.")
else:
    print("-> Menghapus baris duplikat...")
    # Keep the first occurrence of each repeated row
    df = df.drop_duplicates()
    print(f"-> Baris duplikat dihapus. Jumlah baris kini: {len(df)}")

Jumlah baris duplikat (identik): 1
-> Menghapus baris duplikat...
-> Baris duplikat dihapus. Jumlah baris kini: 9993


<h1 style="color:lightblue;"><b><u>Simpan Data Bersih</u></b></h1>

<h3 style="color:lightblue;">Terakhir, kita simpan data yang sudah bersih ke folder data agar rapi.</h3>

In [9]:
# Destination for the cleaned dataset: back inside the 'data' folder
cleaned_data_path = "../data/SuperStore_Cleaned.csv"

# index=False keeps the row index (0, 1, 2, ...) out of the written file
df.to_csv(cleaned_data_path, index=False)

print(
    "--- Proses Data Cleaning Selesai! ---",
    f"Data bersih telah disimpan di: {cleaned_data_path}",
    sep="\n",
)

# Show the first 5 rows of the cleaned data as confirmation
df.head()

--- Proses Data Cleaning Selesai! ---
Data bersih telah disimpan di: ../data/SuperStore_Cleaned.csv


Unnamed: 0,Order_ID,Customer_ID,Postal_Code,Product_ID,Sales,Quantity,Discount,Profit,Category,Sub-Category,Product_Name,Order_Date,Ship_Date,Ship_Mode,Customer_Name,Segment,City,State,Region
0,CA-2019-152156,CG-12520,42420,FUR-BO-10001798,261.96,2,0.0,41.9136,Furniture,Bookcases,Bush Somerset Collection Bookcase,2019-11-08,2019-11-11,Second Class,Claire Gute,Consumer,Henderson,Kentucky,South
1,CA-2019-152156,CG-12520,42420,FUR-CH-10000454,731.94,3,0.0,219.582,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",2019-11-08,2019-11-11,Second Class,Claire Gute,Consumer,Henderson,Kentucky,South
2,CA-2019-138688,DV-13045,90036,OFF-LA-10000240,14.62,2,0.0,6.8714,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,2019-06-12,2019-06-16,Second Class,Darrin Van Huff,Corporate,Los Angeles,California,West
3,US-2018-108966,SO-20335,33311,FUR-TA-10000577,957.5775,5,0.45,-383.031,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,2018-10-11,2018-10-18,Standard Class,Sean O'Donnell,Consumer,Fort Lauderdale,Florida,South
4,US-2018-108966,SO-20335,33311,OFF-ST-10000760,22.368,2,0.2,2.5164,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,2018-10-11,2018-10-18,Standard Class,Sean O'Donnell,Consumer,Fort Lauderdale,Florida,South
