In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set style untuk plotting
plt.style.use('default')
sns.set_palette("husl")

In [3]:
# Fungsi helper untuk basic info
def basic_info(df, filename):
    print(f"\nüìä {filename}")
    print("-" * 40)
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Data types:\n{df.dtypes}")
    print(f"Missing values:\n{df.isnull().sum()}")
    print(f"Memory usage: {df.memory_usage().sum() / 1024:.2f} KB")
    return df.head()

In [4]:
datasets = {}
file_names = [
    'MineToday Dataset/train/train_absensi.csv',
    'MineToday Dataset/train/train_mini_project.csv', 
    'MineToday Dataset/train/train_pendaftaran.csv',
    'MineToday Dataset/train/train_pretest_ml.csv',
    'MineToday Dataset/train/train_pretest_py.csv',
    'MineToday Dataset/train/train_pretest_st.csv',
    'MineToday Dataset/train/train_weekly_quiz.csv'
]

In [5]:
print("üìÇ LOADING DATASETS...")
for file_name in file_names:
    try:
        df = pd.read_csv(file_name)
        datasets[file_name] = df
        print(f"‚úÖ {file_name} loaded successfully - Shape: {df.shape}")
    except FileNotFoundError:
        print(f"‚ùå {file_name} not found")
    except Exception as e:
        print(f"‚ùå Error loading {file_name}: {str(e)}")

print(f"\nüìà Total datasets loaded: {len(datasets)}")

üìÇ LOADING DATASETS...
‚úÖ MineToday Dataset/train/train_absensi.csv loaded successfully - Shape: (11714, 12)
‚úÖ MineToday Dataset/train/train_mini_project.csv loaded successfully - Shape: (468, 5)
‚úÖ MineToday Dataset/train/train_pendaftaran.csv loaded successfully - Shape: (492, 9)
‚úÖ MineToday Dataset/train/train_pretest_ml.csv loaded successfully - Shape: (502, 14)
‚úÖ MineToday Dataset/train/train_pretest_py.csv loaded successfully - Shape: (544, 14)
‚úÖ MineToday Dataset/train/train_pretest_st.csv loaded successfully - Shape: (500, 19)
‚úÖ MineToday Dataset/train/train_weekly_quiz.csv loaded successfully - Shape: (487, 5)

üìà Total datasets loaded: 7


In [6]:
# Analisis detail setiap dataset
for file_name, df in datasets.items():
    print(f"\n{'='*60}")
    sample_data = basic_info(df, file_name)
    print(f"\nSample data (first 3 rows):")
    print(sample_data.head(3))
    
    # Cek unique values untuk kolom kategorikal
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"\nüè∑Ô∏è  Categorical columns unique values:")
        for col in categorical_cols[:5]:  # Limit to first 5 columns
            unique_vals = df[col].nunique()
            print(f"  {col}: {unique_vals} unique values")
            if unique_vals <= 10:
                print(f"    Values: {df[col].unique()[:10]}")



üìä MineToday Dataset/train/train_absensi.csv
----------------------------------------
Shape: (11714, 12)
Columns: ['id', 'Timestamp', 'Tanggal hari ini ', 'Pertemuan ke-', 'Kualitas materi ', 'Siapa trainer pada hari ini ?', 'Bagaimana menurut Trainer pada hari ini ?', 'Kualitas trainer ', 'Apakah ada saran secara keseluruhan ?', 'Kamu mengikuti Bootcamp Batch ?', 'Tanggal hari ini', 'Pertemuan ke']
Data types:
id                                            object
Timestamp                                     object
Tanggal hari ini                              object
Pertemuan ke-                                 object
Kualitas materi                                int64
Siapa trainer pada hari ini ?                 object
Bagaimana menurut Trainer pada hari ini ?     object
Kualitas trainer                               int64
Apakah ada saran secara keseluruhan ?         object
Kamu mengikuti Bootcamp Batch ?              float64
Tanggal hari ini                              objec

In [7]:
# Analisis ID peserta untuk join datasets
print(f"\n{'='*60}")
print("üîó PARTICIPANT ID ANALYSIS")
print("-" * 40)

# Cari kolom yang mungkin berisi ID peserta
id_candidates = []
for file_name, df in datasets.items():
    for col in df.columns:
        if any(keyword in col.lower() for keyword in ['id', 'email', 'nama', 'timestamp']):
            id_candidates.append((file_name, col, df[col].nunique()))

print("Potential ID columns:")
for file_name, col, unique_count in id_candidates:
    print(f"  {file_name}: {col} ({unique_count} unique values)")



üîó PARTICIPANT ID ANALYSIS
----------------------------------------
Potential ID columns:
  MineToday Dataset/train/train_absensi.csv: id (509 unique values)
  MineToday Dataset/train/train_absensi.csv: Timestamp (425 unique values)
  MineToday Dataset/train/train_mini_project.csv: id (468 unique values)
  MineToday Dataset/train/train_mini_project.csv: Timestamp (10 unique values)
  MineToday Dataset/train/train_mini_project.csv: Share link Google Slide atau Canva kamu disini (jangan lupa diberi akses view) (3 unique values)
  MineToday Dataset/train/train_pendaftaran.csv: id (492 unique values)
  MineToday Dataset/train/train_pendaftaran.csv: Timestamp (15 unique values)
  MineToday Dataset/train/train_pendaftaran.csv: Nama Kampus / Sekolah / Instansi (15 unique values)
  MineToday Dataset/train/train_pendaftaran.csv: Alasan Mengikuti Bootcamp Data Science di Intelligo ID (15 unique values)
  MineToday Dataset/train/train_pretest_ml.csv: id (494 unique values)
  MineToday Dataset/

In [8]:
# Cek timestamp patterns
print(f"\n‚è∞ TIMESTAMP ANALYSIS")
print("-" * 40)
for file_name, df in datasets.items():
    timestamp_cols = [col for col in df.columns if 'timestamp' in col.lower() or 'tanggal' in col.lower()]
    if timestamp_cols:
        print(f"\n{file_name}:")
        for col in timestamp_cols:
            print(f"  {col}: {df[col].dtype}")
            print(f"    Sample: {df[col].dropna().head(2).tolist()}")



‚è∞ TIMESTAMP ANALYSIS
----------------------------------------

MineToday Dataset/train/train_absensi.csv:
  Timestamp: object
    Sample: ['9/28/2023 21:20:43', '9/28/2023 21:28:13']
  Tanggal hari ini : object
    Sample: ['9/28/2023', '9/28/2023']
  Tanggal hari ini: object
    Sample: ['6/18/2024', '5/2/2024']

MineToday Dataset/train/train_mini_project.csv:
  Timestamp: object
    Sample: ['8/4/2024 17:07:07', '8/9/2024 11:14:41']

MineToday Dataset/train/train_pendaftaran.csv:
  Timestamp: object
    Sample: ['1/12/2024 19:44:23', '3/3/2024 12:25:26']

MineToday Dataset/train/train_pretest_ml.csv:
  Timestamp: object
    Sample: ['11/21/2023 19:49:17', '11/21/2023 19:51:03']

MineToday Dataset/train/train_pretest_py.csv:
  Timestamp: object
    Sample: ['10/10/2023 19:46:09', '1/15/2024 10:13:07']

MineToday Dataset/train/train_pretest_st.csv:
  Timestamp: object
    Sample: ['11/30/2023 20:08:17', '2/29/2024 15:20:25']

MineToday Dataset/train/train_weekly_quiz.csv:
  Timestam

In [9]:
# Summary statistics untuk numerical columns
print(f"\n{'='*60}")
print("üìä NUMERICAL COLUMNS SUMMARY")
print("-" * 40)

for file_name, df in datasets.items():
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    if len(numerical_cols) > 0:
        print(f"\n{file_name}:")
        print(df[numerical_cols].describe().round(2))


üìä NUMERICAL COLUMNS SUMMARY
----------------------------------------

MineToday Dataset/train/train_absensi.csv:
       Kualitas materi   Kualitas trainer   Kamu mengikuti Bootcamp Batch ?
count           11714.0           11714.00                         11688.00
mean                4.5               4.51                             6.06
std                 0.7               0.69                             0.52
min                 2.0               2.00                             6.00
25%                 4.0               4.00                             6.00
50%                 5.0               5.00                             6.00
75%                 5.0               5.00                             6.00
max                 5.0               5.00                            13.00

MineToday Dataset/train/train_mini_project.csv:
       Unnamed: 0
count      462.00
mean       245.73
std        143.45
min          0.00
25%        123.25
50%        243.50
75%        369.75
max   

In [11]:
# Pattern analysis untuk potential target creation
print(f"\n{'='*60}")
print("üéØ POTENTIAL TARGET PATTERNS")
print("-" * 40)

# Analisis completion patterns
completion_indicators = []
for file_name, df in datasets.items():
    if 'absensi' in file_name:
        print(f"\n{file_name}:")
        if 'Pertemuan ke' in df.columns or 'Pertemuan ke-' in df.columns:
            pertemuan_col = 'Pertemuan ke' if 'Pertemuan ke' in df.columns else 'Pertemuan ke-'
            print(f"  Column: {pertemuan_col}")
            print(f"  Data type: {df[pertemuan_col].dtype}")
            print(f"  Unique values: {sorted(df[pertemuan_col].dropna().unique())}")
            print(f"  Missing values: {df[pertemuan_col].isnull().sum()}")
            
            # Convert to numeric untuk cari max
            try:
                numeric_values = pd.to_numeric(df[pertemuan_col], errors='coerce')
                max_pertemuan = numeric_values.max()
                print(f"  Max pertemuan: {max_pertemuan}")
                print(f"  Min pertemuan: {numeric_values.min()}")
                print(f"  Total attendances: {len(df)}")
            except Exception as e:
                print(f"  Error processing pertemuan: {e}")
            
    elif 'quiz' in file_name:
        print(f"\n{file_name}:")
        print(f"  Total quiz records: {df.shape[0]}")
        # Cek jika ada kolom score/nilai
        score_cols = [col for col in df.columns if any(keyword in col.lower() for keyword in ['score', 'nilai', 'point'])]
        if score_cols:
            for col in score_cols[:3]:  # Limit to first 3 score columns
                print(f"  {col}: mean={df[col].mean():.2f}, std={df[col].std():.2f}")
        
    elif 'mini_project' in file_name:
        print(f"\n{file_name}:")
        print(f"  Total project submissions: {df.shape[0]}")
        # Cek jika ada link submissions
        link_cols = [col for col in df.columns if 'link' in col.lower() or 'url' in col.lower()]
        if link_cols:
            for col in link_cols:
                non_empty = df[col].dropna().shape[0]
                print(f"  {col}: {non_empty} non-empty submissions")
        
    elif 'pretest' in file_name:
        test_type = file_name.split('_')[-1].replace('.csv', '').upper()
        print(f"\n{file_name}:")
        print(f"  Pretest {test_type} participants: {df.shape[0]}")
        # Cek score columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            for col in numeric_cols[:3]:  # First 3 numeric columns
                print(f"  {col}: mean={df[col].mean():.2f}, range=[{df[col].min():.1f}, {df[col].max():.1f}]")
    
    elif 'pendaftaran' in file_name:
        print(f"\n{file_name}:")
        print(f"  Total registrations: {df.shape[0]}")
        if 'Status' in df.columns:
            status_counts = df['Status'].value_counts()
            print(f"  Status distribution: {dict(status_counts)}")


üéØ POTENTIAL TARGET PATTERNS
----------------------------------------

MineToday Dataset/train/train_absensi.csv:
  Column: Pertemuan ke
  Data type: object
  Unique values: ['Pertemuan 1', 'Pertemuan 10', 'Pertemuan 11', 'Pertemuan 12', 'Pertemuan 13', 'Pertemuan 14', 'Pertemuan 15', 'Pertemuan 16', 'Pertemuan 17', 'Pertemuan 18', 'Pertemuan 19', 'Pertemuan 2', 'Pertemuan 20', 'Pertemuan 21', 'Pertemuan 22', 'Pertemuan 23', 'Pertemuan 24', 'Pertemuan 25', 'Pertemuan 26', 'Pertemuan 27', 'Pertemuan 28', 'Pertemuan 29', 'Pertemuan 3', 'Pertemuan 30', 'Pertemuan 4', 'Pertemuan 5', 'Pertemuan 6', 'Pertemuan 7', 'Pertemuan 8', 'Pertemuan 9']
  Missing values: 249
  Max pertemuan: nan
  Min pertemuan: nan
  Total attendances: 11714

MineToday Dataset/train/train_mini_project.csv:
  Total project submissions: 468
  Share link Google Slide atau Canva kamu disini (jangan lupa diberi akses view): 468 non-empty submissions

MineToday Dataset/train/train_pendaftaran.csv:
  Total registrations:

In [12]:
print(f"\nüìà CROSS-DATASET ANALYSIS")
print("-" * 40)

# Coba identifikasi common participants
participant_counts = {}
for file_name, df in datasets.items():
    # Cari kolom yang mungkin identifier
    for col in df.columns:
        if 'email' in col.lower() or 'nama' in col.lower():
            unique_participants = df[col].nunique()
            participant_counts[f"{file_name}_{col}"] = unique_participants
            print(f"{file_name} - {col}: {unique_participants} unique participants")

if participant_counts:
    print(f"\nParticipant overlap analysis needed for joining datasets")


üìà CROSS-DATASET ANALYSIS
----------------------------------------
MineToday Dataset/train/train_pendaftaran.csv - Nama Kampus / Sekolah / Instansi: 15 unique participants

Participant overlap analysis needed for joining datasets
