In [None]:
# TP2.ipynb — Exploratory Data Analysis (EDA)
# Analisis Eksplorasi Data untuk Dataset Student Scores

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot style applied to every figure produced by this notebook.
plt.style.use('seaborn-v0_8')

# === 1) Load dataset ===
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = "../data/student-scores.csv"
# Columns that the statistics, plotting and summary cells read from `df`.
REQUIRED_COLUMNS = ["weekly_self_study_hours", "math_score"]

df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message instead of a KeyError several cells later.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
assert not missing_columns, (
    f"{DATA_PATH} is missing required columns: {missing_columns}"
)

print("📊 EKSPLORASI DATA (EDA)")
print("=" * 50)

# First rows, to eyeball the raw values.
print("--- 5 Baris Pertama ---")
print(df.head())

print("\n--- Nama Kolom ---")
print(df.columns.tolist())

# Size and per-column dtypes of the loaded frame.
print("\n--- Informasi Dataset ---")
print(f"Jumlah data: {len(df)}")
print(f"Dimensi: {df.shape}")
print("Tipe data:")
print(df.dtypes)


📊 EKSPLORASI DATA (EDA)
--- 5 Baris Pertama ---
   id first_name last_name                                  email  gender  \
0   1       Paul     Casey         paul.casey.1@gslingacademy.com    male   
1   2   Danielle  Sandoval  danielle.sandoval.2@gslingacademy.com  female   
2   3       Tina   Andrews       tina.andrews.3@gslingacademy.com  female   
3   4       Tara     Clark         tara.clark.4@gslingacademy.com  female   
4   5    Anthony    Campos     anthony.campos.5@gslingacademy.com    male   

   part_time_job  absence_days  extracurricular_activities  \
0          False             3                       False   
1          False             2                       False   
2          False             9                        True   
3          False             5                       False   
4          False             5                       False   

   weekly_self_study_hours   career_aspiration  math_score  history_score  \
0                       27          

In [None]:
# === 2) Descriptive statistics ===
print("\n📈 STATISTIK DESKRIPTIF")
print("=" * 50)

print("Statistik untuk Weekly Self Study Hours:")
print(df['weekly_self_study_hours'].describe())

print("\nStatistik untuk Math Score:")
print(df['math_score'].describe())

# Correlation between weekly study hours and math score
# (Series.corr defaults to the Pearson coefficient).
correlation = df['weekly_self_study_hours'].corr(df['math_score'])
print(f"\n📊 Korelasi antara Jam Belajar dan Nilai Matematika: {correlation:.4f}")

# Lower bounds (exclusive) on |r| for each strength label, strongest first.
STRENGTH_LEVELS = [
    (0.7, "SANGAT KUAT"),
    (0.5, "KUAT"),
    (0.3, "SEDANG"),
    (0.1, "LEMAH"),
]

# Interpret the coefficient. `strength` is reused by the summary cell.
if pd.isna(correlation):
    # corr() can yield NaN (e.g. a constant column); every comparison against
    # NaN is False, so without this branch the result would be silently
    # reported as a very weak negative relationship.
    strength = "TIDAK TERDEFINISI"
    direction = "tidak terdefinisi"
else:
    # Strength is a property of the magnitude: r = -0.9 is as strong as
    # r = +0.9. The sign is reported separately as the direction.
    magnitude = abs(correlation)
    strength = next(
        (label for cutoff, label in STRENGTH_LEVELS if magnitude > cutoff),
        "SANGAT LEMAH",
    )
    if correlation > 0:
        direction = "positif"
    elif correlation < 0:
        direction = "negatif"
    else:
        # Exactly zero has no direction; do not label it negative.
        direction = "netral"

print(f"📈 Interpretasi: Hubungan {strength} ({direction})")



📈 STATISTIK DESKRIPTIF
Statistik untuk Weekly Self Study Hours:
count    2000.000000
mean       17.755500
std        12.129604
min         0.000000
25%         5.000000
50%        18.000000
75%        28.000000
max        50.000000
Name: weekly_self_study_hours, dtype: float64

Statistik untuk Math Score:
count    2000.000000
mean       83.452000
std        13.224906
min        40.000000
25%        77.000000
50%        87.000000
75%        93.000000
max       100.000000
Name: math_score, dtype: float64

📊 Korelasi antara Jam Belajar dan Nilai Matematika: 0.3936
📈 Interpretasi: Hubungan SEDANG (positif)


In [None]:
# === 3) Data visualisation ===
print("\n📊 VISUALISASI DATA")
print("=" * 50)

# Directory that receives the saved figures; created if it does not exist.
# (`os` is imported in the notebook's top import cell.)
STATIC_DIR = '../static'
os.makedirs(STATIC_DIR, exist_ok=True)

# --- Scatter plot with a fitted trend line ---
scatter_path = f'{STATIC_DIR}/regression_scatter.png'
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='weekly_self_study_hours', y='math_score',
                s=60, color='blue', alpha=0.7, ax=ax)
# scatter=False: draw only the regression line on top of the points above.
sns.regplot(data=df, x='weekly_self_study_hours', y='math_score',
            scatter=False, color='red', line_kws={'linewidth': 3}, ax=ax)
ax.set_title('Hubungan antara Jam Belajar dan Nilai Matematika',
             fontsize=16, fontweight='bold')
ax.set_xlabel('Weekly Self Study Hours (Jam Belajar per Minggu)', fontsize=14)
ax.set_ylabel('Math Score (Nilai Matematika)', fontsize=14)
ax.grid(True, alpha=0.3)

# Save to disk, then close the figure: this cell only writes files and
# does not render the plot inline.
fig.savefig(scatter_path, dpi=150, bbox_inches='tight')
plt.close(fig)

print(f"✅ Grafik tersimpan di {scatter_path}")

# --- Side-by-side histograms of both variables ---
distribution_path = f'{STATIC_DIR}/distribution_plots.png'
# One entry per panel: (column, bar colour, title, x-axis label).
HISTOGRAM_PANELS = [
    ('weekly_self_study_hours', 'skyblue',
     'Distribusi Jam Belajar per Minggu', 'Hours'),
    ('math_score', 'lightcoral',
     'Distribusi Nilai Matematika', 'Math Score'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (column, color, title, xlabel) in zip(axes, HISTOGRAM_PANELS):
    ax.hist(df[column], bins=20, alpha=0.7, color=color, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frekuensi')
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig(distribution_path, dpi=150, bbox_inches='tight')
plt.close(fig)

print(f"✅ Grafik distribusi tersimpan di {distribution_path}")



📊 VISUALISASI DATA
✅ Grafik tersimpan di ../static/regression_scatter.png
✅ Grafik distribusi tersimpan di ../static/distribution_plots.png


In [None]:
# === 4) EDA summary ===
# Recaps the dataset size, value ranges, means and the correlation result.
# Relies on `df`, `correlation` and `strength` defined by earlier cells.
study_hours = df['weekly_self_study_hours']
math_scores = df['math_score']

print("\n📊 RINGKASAN EKSPLORASI DATA")
print("=" * 50)

print("📈 Ringkasan Dataset:")
print(f"• Jumlah siswa: {len(df)}")
print(f"• Range jam belajar: {study_hours.min()} - {study_hours.max()} jam/minggu")
print(f"• Range nilai matematika: {math_scores.min()} - {math_scores.max()}")
print(f"• Rata-rata jam belajar: {study_hours.mean():.1f} jam/minggu")
print(f"• Rata-rata nilai matematika: {math_scores.mean():.1f}")
print(f"• Korelasi: {correlation:.4f} ({strength})")

print("\n✅ EDA selesai!")
print("📊 File grafik tersimpan di folder /static/")
print("📈 Siap untuk tahap training model!")



📊 RINGKASAN EKSPLORASI DATA
📈 Ringkasan Dataset:
• Jumlah siswa: 2000
• Range jam belajar: 0 - 50 jam/minggu
• Range nilai matematika: 40 - 100
• Rata-rata jam belajar: 17.8 jam/minggu
• Rata-rata nilai matematika: 83.5
• Korelasi: 0.3936 (SEDANG)

✅ EDA selesai!
📊 File grafik tersimpan di folder /static/
📈 Siap untuk tahap training model!
