In [13]:
# ======================================
# 1. IMPORT LIBRARY DAN LOAD DATASET
# ======================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [15]:
# Load dataset
df = pd.read_csv("data_clean_ready_for_model.csv")

print("Shape dataset:", df.shape)
display(df.head())

Shape dataset: (705, 13)


Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,-1.729596,-1.186915,0,2,10,0.224201,1,1,-0.327638,-0.20552,1,0.157063,0.98555
1,-1.724682,0.958662,1,0,39,-2.244892,7,0,0.560423,1.605628,2,-2.97679,-2.166958
2,-1.719768,-0.471723,0,2,102,0.861386,6,1,-1.65973,-1.111095,0,1.201681,1.616052
3,-1.714855,-1.902108,1,1,101,-1.528059,11,0,0.116393,0.700054,2,-1.932172,-1.536456
4,-1.709941,0.24347,1,0,18,-0.333336,0,1,-0.771668,-0.20552,1,-0.887554,0.355048


In [28]:
# ======================================
# DEFINISI TARGET KLASIFIKASI (0 & 1)
# ======================================

import pandas as pd

# Load dataset siap modeling
df = pd.read_csv("data_clean_ready_for_model.csv")

# Buat kolom ADDICTION_CLASS dari Addicted_Score (sesuai langkah sebelumnya)
# Ini memastikan kolom target tersedia di df yang baru dimuat
if "Addicted_Score" in df.columns:
    threshold = df["Addicted_Score"].median()
    df["ADDICTION_CLASS"] = (df["Addicted_Score"] >= threshold).astype(int)
else:
    print("Error: Kolom 'Addicted_Score' tidak ditemukan di dataset untuk membuat ADDICTION_CLASS.")

TARGET = "ADDICTION_CLASS"

# Distribusi kelas target
class_dist = df[TARGET].value_counts().sort_index()
class_prop = df[TARGET].value_counts(normalize=True).sort_index()

summary_target = pd.DataFrame({
    "Jumlah Data": class_dist,
    "Proporsi": class_prop.round(3)
})

display(summary_target)

print("""
Keterangan Kelas:
0 = Tidak mengalami kecanduan media sosial
1 = Mengalami kecanduan media sosial
""")

Unnamed: 0_level_0,Jumlah Data,Proporsi
ADDICTION_CLASS,Unnamed: 1_level_1,Unnamed: 2_level_1
0,297,0.421
1,408,0.579



Keterangan Kelas:
0 = Tidak mengalami kecanduan media sosial
1 = Mengalami kecanduan media sosial



In [16]:
# ======================================
# 2. CEK STRUKTUR DAN TIPE DATA
# ======================================

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Student_ID                    705 non-null    float64
 1   Age                           705 non-null    float64
 2   Gender                        705 non-null    int64  
 3   Academic_Level                705 non-null    int64  
 4   Country                       705 non-null    int64  
 5   Avg_Daily_Usage_Hours         705 non-null    float64
 6   Most_Used_Platform            705 non-null    int64  
 7   Affects_Academic_Performance  705 non-null    int64  
 8   Sleep_Hours_Per_Night         705 non-null    float64
 9   Mental_Health_Score           705 non-null    float64
 10  Relationship_Status           705 non-null    int64  
 11  Conflicts_Over_Social_Media   705 non-null    float64
 12  Addicted_Score                705 non-null    float64
dtypes: fl

In [17]:
# ======================================
# 3. DATA CLEANING & PREPROCESSING
# ======================================

# Salin dataset
df2 = df.copy()

# Tangani missing value numerik dengan median
num_cols = df2.select_dtypes(include=["int64", "float64"]).columns
df2[num_cols] = df2[num_cols].fillna(df2[num_cols].median())

# Encoding variabel kategorik
df2 = pd.get_dummies(df2, drop_first=True)

print("Shape setelah preprocessing:", df2.shape)


Shape setelah preprocessing: (705, 13)


In [19]:
# ======================================
# 4. MEMBENTUK TARGET KLASIFIKASI
# ======================================

# Pastikan kolom Addicted_Score ada
print(df2["Addicted_Score"].describe())

# Ubah Addicted_Score (kontinu) â†’ kelas biner
threshold = df2["Addicted_Score"].median()
df2["ADDICTION_CLASS"] = (df2["Addicted_Score"] >= threshold).astype(int)

print("Distribusi target:")
display(df2["ADDICTION_CLASS"].value_counts())

count    7.050000e+02
mean    -1.209434e-16
std      1.000710e+00
min     -2.797459e+00
25%     -9.059547e-01
50%      3.550484e-01
75%      9.855500e-01
max      1.616052e+00
Name: Addicted_Score, dtype: float64
Distribusi target:


Unnamed: 0_level_0,count
ADDICTION_CLASS,Unnamed: 1_level_1
1,408
0,297


In [21]:
# ======================================
# 5. MENENTUKAN FITUR DAN TARGET
# ======================================

TARGET = "ADDICTION_CLASS"

X = df2.drop(columns=["Addicted_Score", TARGET])
y = df2[TARGET]

print("Jumlah fitur:", X.shape[1])

Jumlah fitur: 12


In [22]:
# ======================================
# 6. SPLIT DATA & STANDARDISASI
# ======================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [23]:
# ======================================
# 7. MODEL BASELINE: LOGISTIC REGRESSION
# ======================================

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)

y_pred_lr = logreg.predict(X_test_scaled)

print("Akurasi Logistic Regression:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Akurasi Logistic Regression: 0.9716312056737588
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        59
           1       0.95      1.00      0.98        82

    accuracy                           0.97       141
   macro avg       0.98      0.97      0.97       141
weighted avg       0.97      0.97      0.97       141



In [24]:
# ======================================
# 8. MODEL FINAL: DECISION TREE
# ======================================

dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

print("Akurasi Decision Tree:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


Akurasi Decision Tree: 0.9716312056737588
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        59
           1       0.99      0.96      0.98        82

    accuracy                           0.97       141
   macro avg       0.97      0.97      0.97       141
weighted avg       0.97      0.97      0.97       141

