In [477]:
import numpy as np
import pandas as pd

In [478]:
# Read the raw survey CSV into a DataFrame.
dataset_csv = 'Students Social Media Addiction.csv'
data = pd.read_csv(dataset_csv)

In [479]:
from sklearn.model_selection import train_test_split

# Split off the target column, discard the two columns not used as features,
# then hold out 20% of the rows as a test set (seeded for reproducibility).
target_column = 'Addicted_Score'
X = data.drop(columns=[target_column, 'Country', 'Student_ID'])
y = data[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [480]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn import set_config

# Have scikit-learn transformers return pandas DataFrames instead of NumPy arrays,
# so column names survive each step and later steps can select columns by name.
set_config(transform_output="pandas")

# Transformer that caps outliers at the inter-quartile-range (IQR) fences, column by column.
class IQRClipper(BaseEstimator, TransformerMixin):
    """Clip every feature to ``[Q1 - factor * IQR, Q3 + factor * IQR]``.

    The quartiles are learned per column in :meth:`fit` and reused in
    :meth:`transform`, so new data is capped with the fences estimated on
    the data passed to ``fit``.

    Parameters
    ----------
    factor : float, default=1.5
        Multiple of the inter-quartile range added above Q3 and subtracted
        below Q1 to obtain the clipping bounds.

    Attributes
    ----------
    q1 : ndarray
        Per-column 25th percentile seen during ``fit``.
    q3 : ndarray
        Per-column 75th percentile seen during ``fit``.
    iqr : ndarray
        ``q3 - q1``.
    """

    def __init__(self, factor=1.5):
        # Stored under the same name as the argument so BaseEstimator's
        # get_params / clone can round-trip it.
        self.factor = factor

    def fit(self, X, y=None):
        """Learn the per-column quartiles of ``X``; ``y`` is ignored.

        Returns
        -------
        self
        """
        # NaN-aware percentiles: a missing value in a column would otherwise
        # make that column's quartiles (and therefore its bounds) NaN.
        self.q1 = np.nanpercentile(X, 25, axis=0)
        self.q3 = np.nanpercentile(X, 75, axis=0)
        self.iqr = self.q3 - self.q1
        return self

    def transform(self, X):
        """Return a clipped copy of ``X``; the input is not modified.

        A DataFrame input yields a DataFrame with the same index and columns;
        any other input is passed to ``np.clip``.
        """
        lower = self.q1 - self.factor * self.iqr
        upper = self.q3 + self.factor * self.iqr

        if isinstance(X, pd.DataFrame):
            # axis=1 aligns the i-th bound with the i-th column explicitly.
            return X.clip(lower=lower, upper=upper, axis=1)
        return np.clip(X, lower, upper)

In [481]:
# Stage 1: apply IQR clipping to the listed columns; every other column is
# passed through unchanged and the original column names are kept.
clipped_columns = [
    'Age',
    'Avg_Daily_Usage_Hours',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Conflicts_Over_Social_Media',
]
trfs1 = ColumnTransformer(
    transformers=[('Outliers Treatment', IQRClipper(), clipped_columns)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [482]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder: dense int32 output, first level of each feature dropped,
# categories unseen during fit are ignored rather than raising.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
    handle_unknown='ignore',
)
# Ordinal encoder: categories unseen during fit are encoded as -1.
oe = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# Element-wise numeric transforms that keep the input column names.
log_trf = FunctionTransformer(func=np.log1p, feature_names_out='one-to-one')
sqrt_trf = FunctionTransformer(func=np.sqrt, feature_names_out='one-to-one')

In [483]:
# Stage 2: encode the categorical columns and apply log1p / sqrt to the
# numeric ones. Columns not listed here are passed through unchanged.
one_hot_columns = ['Academic_Level', 'Most_Used_Platform', 'Relationship_Status']
ordinal_columns = ['Gender', 'Affects_Academic_Performance']
log_columns = ['Conflicts_Over_Social_Media', 'Avg_Daily_Usage_Hours', 'Sleep_Hours_Per_Night']
sqrt_columns = ['Mental_Health_Score', 'Age']

trfs2 = ColumnTransformer(
    transformers=[
        ('One Hot Encoding', ohe, one_hot_columns),
        ('Ordinal Encoding', oe, ordinal_columns),
        ('Log Transformation', log_trf, log_columns),
        ('Square Root Transformation', sqrt_trf, sqrt_columns),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [484]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) is a variance fraction, not a component count:
# keep as many principal components as needed to explain 99% of the variance.
trfs3 = PCA(n_components= 0.99)

In [485]:
from sklearn.linear_model import Lasso

# Final estimator: L1-regularised linear regression with a very small penalty.
trfs4 = Lasso(alpha=2e-4)

In [486]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# End-to-end regressor on X: outlier capping -> encoding / numeric transforms
# -> PCA -> Lasso. Despite the name, the last step is the estimator itself.
X_transformer = Pipeline(
    steps=[
        ('Outliers Treatment', trfs1),
        ('Feature Engineering', trfs2),
        ('PCA', trfs3),
        ('Model', trfs4),
    ]
)

In [487]:
# Transformation applied to y before fitting: min-max scaling only.
y_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

In [488]:
# Combine both sides: the regressor pipeline is fitted on the transformed
# target, and predictions are mapped back to the original target scale.
final_model = TransformedTargetRegressor(
    transformer=y_transformer,
    regressor=X_transformer,
)

In [489]:
# Train on the training split, then predict the held-out rows; predictions
# come back on the original target scale (the inverse transform is automatic).
y_pred = final_model.fit(X_train, y_train).predict(X_test)

In [490]:
# Evaluate
from sklearn.metrics import r2_score
# Coefficient of determination on the held-out test split.
test_r2 = r2_score(y_test, y_pred)
print(f"R² Score: {test_r2:.4f}")

R² Score: 0.9597


In [492]:
# One hypothetical student to score, as a single-row DataFrame.
student_features = {
    "Age": 22,
    "Gender": 'Male',
    "Academic_Level": 'Graduate',
    "Avg_Daily_Usage_Hours": 2.1,
    "Most_Used_Platform": 'Twitter',
    "Affects_Academic_Performance": 'No',
    "Sleep_Hours_Per_Night": 7.5,
    "Mental_Health_Score": 8,
    "Relationship_Status": 'Single',
    "Conflicts_Over_Social_Media": 0,
}
new_student = pd.DataFrame([student_features])

# Band the prediction is clamped to before it is rescaled to a percentage.
score_floor, score_ceiling = 2, 9

raw = final_model.predict(new_student)[0]
raw = min(max(raw, score_floor), score_ceiling)

print(raw)

# Linear rescale of the clamped score: floor -> 0%, ceiling -> 100%.
percent = (raw - score_floor) / (score_ceiling - score_floor) * 100
print(f"Addiction Score: {percent:.2f}%")

2
Addiction Score: 0.00%
