In [7]:
# =========================================================
# Data Mining Ekspor-Import (Versi Stabil Tanpa Error)
# =========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# === 1. Baca Dataset ===
file_path = "/content/Ekspor import.xls"  # Ganti sesuai lokasi file kamu
df = pd.read_excel(file_path)

print("===== Informasi Awal Dataset =====")
print(df.info())
print("\n===== 5 Data Teratas =====")
print(df.head())

# === 2. Bersihkan Data ===
print("\n===== Cek Missing Values =====")
print(df.isnull().sum())

# Hapus baris kosong total
df = df.dropna(how='all')

# Coba ubah kolom yang bisa menjadi angka
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='ignore')

# === 3. Pastikan nama kolom string semua ===
df.columns = df.columns.map(str)

# === 4. Cek kolom numerik ===
numeric_df = df.select_dtypes(include=[np.number])
print("\nKolom numerik yang terdeteksi:", list(numeric_df.columns))

if numeric_df.shape[1] == 0:
    raise ValueError("⚠️ Tidak ada kolom numerik yang bisa dianalisis. Periksa format angka di Excel.")

# === 5. Analisis Deskriptif (EDA) ===
print("\n===== Statistik Deskriptif =====")
print(numeric_df.describe())

# Visualisasi distribusi numerik
if numeric_df.shape[1] > 1:
    numeric_df.hist(figsize=(10, 6), bins=15)
    plt.suptitle("Distribusi Data Numerik Ekspor & Impor")
    plt.show()
else:
    col = numeric_df.columns[0]
    plt.figure(figsize=(6, 4))
    sns.histplot(numeric_df[col], kde=True)
    plt.title(f"Distribusi Kolom: {col}")
    plt.show()

# Korelasi antar variabel (jika lebih dari 1 kolom)
if numeric_df.shape[1] > 1:
    plt.figure(figsize=(8, 6))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
    plt.title("Korelasi antar Variabel Numerik")
    plt.show()

# === 6. Normalisasi Data ===
scaler = StandardScaler()
scaled = scaler.fit_transform(numeric_df)
scaled_df = pd.DataFrame(scaled, columns=numeric_df.columns)

# === 7. Tentukan Jumlah Cluster (Metode Elbow) ===
inertias = []
K = range(1, 10)
for k in K:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(scaled_df)
    inertias.append(model.inertia_)

plt.figure(figsize=(8,5))
plt.plot(K, inertias, 'bo-')
plt.xlabel('Jumlah Cluster (k)')
plt.ylabel('Inertia')
plt.title('Metode Elbow untuk Menentukan k Optimal')
plt.show()

# Ambil k=3 sebagai contoh
kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_df)

# === 8. Visualisasi Cluster ===
if numeric_df.shape[1] >= 2:
    plt.figure(figsize=(8,6))
    sns.scatterplot(
        x=numeric_df.iloc[:, 0],
        y=numeric_df.iloc[:, 1],
        hue=df['Cluster'],
        palette='Set2'
    )
    plt.title("Visualisasi Cluster Berdasarkan Dua Fitur Pertama")
    plt.xlabel(numeric_df.columns[0])
    plt.ylabel(numeric_df.columns[1])
    plt.show()
else:
    print("\n⚠️ Hanya ada satu kolom numerik, visualisasi cluster 2D tidak bisa dibuat.")

# === 9. Simpan Hasil Akhir ===
output_file = "Hasil_Clustering_Ekspor_Impor.xlsx"
df.to_excel(output_file, index=False)
print(f"\n✅ Analisis selesai! Hasil disimpan ke file: {output_file}")


===== Informasi Awal Dataset =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 6 columns):
 #   Column                                                               Non-Null Count  Dtype 
---  ------                                                               --------------  ----- 
 0   Sumber : https://www.bps.go.id diakses pada 12-11-2023 17:11:13 WIB  13 non-null     object
 1   Unnamed: 1                                                           13 non-null     object
 2   Unnamed: 2                                                           13 non-null     object
 3   Unnamed: 3                                                           13 non-null     object
 4   Unnamed: 4                                                           13 non-null     object
 5   Unnamed: 5                                                           13 non-null     object
dtypes: object(6)
memory usage: 756.0+ bytes
None

===== 5 Data Teratas =====
  Sumber 

  df[col] = pd.to_numeric(df[col], errors='ignore')


ValueError: ⚠️ Tidak ada kolom numerik yang bisa dianalisis. Periksa format angka di Excel.