In [1]:
import numpy as np
import pandas as pd

In [2]:
# Importing the dataset.
# The CSV location can be overridden with the STUDENTS_ADDICTION_CSV environment
# variable so the notebook can run on machines other than the author's. When the
# variable is unset (or empty) the original absolute path is used unchanged.
import os

DEFAULT_DATA_PATH = r'C:\Users\kejri\OneDrive\Desktop\MLOps\Learning ML from Basics\Projects\Students Social Media Addiction\data\Students Social Media Addiction.csv'
data = pd.read_csv(os.environ.get('STUDENTS_ADDICTION_CSV') or DEFAULT_DATA_PATH)

In [3]:
from sklearn.model_selection import train_test_split

# Target: the addiction score. Features: every remaining column except the
# student identifier and the country.
y = data['Addicted_Score']
X = data.drop(columns=['Student_ID', 'Country', 'Addicted_Score'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from sklearn.compose import ColumnTransformer
from utils import IQRClipper
from sklearn import set_config

# Make every transformer return a pandas DataFrame instead of a NumPy array.
# This preserves the column names through each transformation, so later
# steps can still select columns by name.
set_config(transform_output="pandas")  


In [5]:
# Stage 1 of preprocessing: send the numeric columns through IQRClipper
# (imported from the project's `utils` module; presumably IQR-based outlier
# clipping, confirm against utils) and pass every other column through as-is.
# Output names are left unprefixed so the next stage can refer to the
# original column names.
trfs1 = ColumnTransformer(
    transformers=[
        (
            'Outliers Treatment',
            IQRClipper(),
            [
                'Age',
                'Avg_Daily_Usage_Hours',
                'Sleep_Hours_Per_Night',
                'Mental_Health_Score',
                'Conflicts_Over_Social_Media',
            ],
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [6]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Categorical encoders.
# One-hot: the first category of each feature is dropped, and the output is
# dense int32 (dense output is required because pandas output is enabled).
# Categories not seen during fit are ignored, i.e. encoded as all zeros.
ohe = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
    dtype=np.int32,
)
# Ordinal: categories not seen during fit are encoded as -1.
oe = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

# Element-wise numeric transforms; 'one-to-one' keeps the input column names.
sqrt_trf = FunctionTransformer(func=np.sqrt, feature_names_out='one-to-one')
log_trf = FunctionTransformer(func=np.log1p, feature_names_out='one-to-one')

In [7]:
# Stage 2 of preprocessing: encode the categorical columns and apply the
# log1p / square-root transforms to the numeric ones. Columns not listed here
# are passed through unchanged, and output names are left unprefixed.
trfs2 = ColumnTransformer(
    transformers=[
        (
            'One Hot Encoding',
            ohe,
            ['Academic_Level', 'Most_Used_Platform', 'Relationship_Status'],
        ),
        (
            'Ordinal Encoding',
            oe,
            ['Gender', 'Affects_Academic_Performance'],
        ),
        (
            'Log Transformation',
            log_trf,
            ['Conflicts_Over_Social_Media', 'Avg_Daily_Usage_Hours', 'Sleep_Hours_Per_Night'],
        ),
        (
            'Square Root Transformation',
            sqrt_trf,
            ['Mental_Health_Score', 'Age'],
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [8]:
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to explain 99% of the variance.
trfs3 = PCA(n_components= 0.99)

In [9]:
from sklearn.linear_model import Lasso

# Despite the `trfs` prefix this is the final estimator, not a transformer:
# an L1-regularised linear regression with a small penalty (alpha = 0.0002).
trfs4 = Lasso(alpha = 0.0002)

In [10]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# End-to-end regressor for the features: outlier treatment -> encoding and
# numeric transforms -> PCA -> Lasso. Note that, despite its name, this
# pipeline ends with the model itself, so it exposes fit/predict.
X_transformer = Pipeline(
    steps=[
        ('Outliers Treatment', trfs1),
        ('Feature Engineering', trfs2),
        ('PCA', trfs3),
        ('Model', trfs4),
    ]
)

In [11]:
# Transformations applied to the target (y): min-max scaling only.
# An IQRClipper step was tried here and left disabled; presumably because it
# has no inverse_transform, which the target wrapper needs (confirm in utils).
y_transformer = Pipeline(
    steps=[
        ('scaler', MinMaxScaler()),
    ]
)

scikit-learn's `TransformedTargetRegressor` trains the model on a transformed version of the target but automatically returns predictions on the original scale. It applies the transformation to the target during fitting and the inverse transformation during prediction, so the predictions stay human-readable.  
Note that it cannot be used with a custom transformer class that does not implement the `inverse_transform` method.

In [12]:
# Wrap everything together
final_model = TransformedTargetRegressor(
    regressor=X_transformer,
    transformer=y_transformer
)

In [13]:
# Train on the training split, then predict the held-out test split.
# fit() returns the fitted estimator, so the two calls can be chained; the
# predictions come back on the original target scale (inverse transform is automatic).
y_pred = final_model.fit(X_train, y_train).predict(X_test)

In [14]:
# Evaluate: R² of the predictions against the true scores of the held-out
# test split, printed to four decimal places.
from sklearn.metrics import r2_score
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")

R² Score: 0.9597


The model has an R² score of 0.9597 on the test set, meaning it explains about 96% of the variance in the addiction score for data it was not trained on.

In [15]:
# Checking with new custom data: a single hand-written student record with the
# same feature columns the model was trained on.
new_student = pd.DataFrame([{
    "Age": 22,
    "Gender": 'Male',
    "Academic_Level": 'Undergraduate',
    "Avg_Daily_Usage_Hours": 5.10,
    "Most_Used_Platform": 'Instagram',
    "Affects_Academic_Performance": 'Yes',
    "Sleep_Hours_Per_Night": 8,
    "Mental_Health_Score": 4.5,
    "Relationship_Status": 'Single',
    "Conflicts_Over_Social_Media": 2,
}])

# Bounds used to clamp the prediction and to rescale it to a percentage.
# NOTE(review): presumably the observed range of Addicted_Score in the
# dataset; confirm against the data.
SCORE_MIN, SCORE_MAX = 2, 9

# predict() returns an array with one value per row; take the only row, then
# clamp it to [SCORE_MIN, SCORE_MAX] so the percentage stays within 0-100.
raw = final_model.predict(new_student)[0]
raw = min(max(raw, SCORE_MIN), SCORE_MAX)

print(raw)

# Rescale linearly: SCORE_MIN -> 0%, SCORE_MAX -> 100%. The span is derived
# from the bounds rather than hard-coded, so the two cannot drift apart.
percent = (raw - SCORE_MIN) / (SCORE_MAX - SCORE_MIN) * 100
print(f"Addiction Score: {percent:.2f}%")

6.971212865762953
Addiction Score: 71.02%


In [16]:
import joblib

# Persist the fitted model (preprocessing, PCA, Lasso and target scaler) to
# addiction_model.pkl in the current working directory.
# NOTE(review): the pipeline contains IQRClipper from the local `utils` module,
# so code that loads this file will presumably need `utils` importable; confirm.
joblib.dump(final_model, "addiction_model.pkl")

['addiction_model.pkl']