In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy.stats import pearsonr, pointbiserialr
from sklearn import metrics
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

Read Data

In [None]:
# Load the dataset; the relative path is resolved against the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='mobile_addiction.csv')

Getting to know data

In [None]:
# Preview the first rows to check the column names and value formats.
data.head()

In [None]:
# (rows, columns) of the dataset as loaded.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()
# Reading of the output: the mean is almost equal to the median (50% row), so the data does not look skewed.

Data Preprocessing

In [None]:
# Drop duplicate rows, comparing every column except "Unnamed: 0".
# That column is discarded a few cells later as not useful for the model; if it is a
# per-row id (presumably the saved index -- TODO confirm) it makes every row unique,
# so a plain drop_duplicates() could never find anything.
dedup_columns = [name for name in data.columns if name != "Unnamed: 0"]
# `or None` falls back to "compare all columns" if nothing else is left.
data = data.drop_duplicates(subset=dedup_columns or None)

In [None]:
data.shape  # Author's observation: no duplicates detected (row count unchanged versus the shape printed before drop_duplicates())

In [None]:
# List the column names.
data.columns

In [None]:
# Remove the "Unnamed: 0" column (not useful for the model).
# Re-assigning instead of inplace=True, together with errors="ignore", keeps the cell
# re-runnable: the in-place drop raised KeyError on a second execution because the
# column was already gone.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Re-check (rows, columns) after the column removal above.
data.shape

In [None]:
# Preview the frame after the preprocessing steps so far.
data.head()

In [None]:
# Count the missing values in every column.
data.isnull().sum() #No null values detected

In [None]:
# Class balance of the target. dropna=False lists missing labels as their own row
# (value_counts() hides them by default), so this output can actually confirm both
# claims: no null values in the target, and no heavy bias towards one class.
data['addicted'].value_counts(dropna=False)

In [None]:
import plotly.express as px

# One box plot per numeric column, to make outliers visible.
numeric_columns = data.select_dtypes(include=['number']).columns
for column in numeric_columns:
    fig = px.box(data, x=column, title=f'Box plot for {column}')
    fig.show()

In [None]:
# Summary statistics of the numeric columns at this stage of preprocessing.
data.describe()

In [None]:
#Categorize "daily_screen_time" column to show correlation with the target column "addicted"
def daily_screen_time_levels(col):
    """Bucket a single daily screen-time value into a usage level.

    Args:
        col: One numeric value taken from the "daily_screen_time" column.

    Returns:
        "Low" for values <= 4, "Intermediate" for values in (4, 8], "High" for
        values above 8, and NaN when the input is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing value
    # would fall through both checks and be labelled "High".
    if pd.isna(col):
        return np.nan
    if col <= 4:
        return "Low"
    if col <= 8:
        return "Intermediate"
    return "High"

# Label every row with its screen-time level (element-wise call of the function above).
data["daily_screen_time_levels"] = data["daily_screen_time"].map(daily_screen_time_levels)

In [None]:
#count plot daily_screen_time_levels
import seaborn as sns
import matplotlib.pyplot as plt

# Fix the bar order explicitly: without `order`, seaborn lays string categories out in
# the order they first occur in the data, so Low/Intermediate/High could appear shuffled.
sns.countplot(x='daily_screen_time_levels', data=data, order=['Low', 'Intermediate', 'High'])
plt.title('Row Count by Daily Screen Time Level')
plt.show()

In [None]:
#add the addicted hue
# seaborn/matplotlib are already imported by an earlier cell, so they are not re-imported here.
# `order` pins the bars to Low -> Intermediate -> High instead of first-appearance order.
sns.countplot(x='daily_screen_time_levels', hue='addicted', data=data,
              order=['Low', 'Intermediate', 'High'])
plt.title('Addiction Status Count by Daily Screen Time Level')
plt.show()

In [None]:
#show the percent of addicted people in each category to be more expressive

# Count rows per (screen time level, addiction status) pair
grouped = data.groupby(['daily_screen_time_levels', 'addicted'], observed=True).size().reset_index(name='count')

# Convert the counts to percentages within each screen time level
total_per_bin = grouped.groupby('daily_screen_time_levels', observed=True)['count'].transform('sum')
grouped['percentage'] = grouped['count'] / total_per_bin * 100

# One row per level, one column per status; a missing combination counts as 0%
pivot = grouped.pivot(index='daily_screen_time_levels', columns='addicted', values='percentage').fillna(0)

# pivot() leaves the levels in alphabetical order (High, Intermediate, Low);
# put them back in their natural order, skipping any level that has no rows.
level_order = [level for level in ('Low', 'Intermediate', 'High') if level in pivot.index]
pivot = pivot.loc[level_order]

# Readable legend labels. The target presumably still holds its raw string labels here
# (a later cell maps 'not addicted'/'addicted' to 0/1), so a 0/1-only mapping never
# matched; both encodings are handled.
column_map = {0: 'Not Addicted', 1: 'Addicted',
              'not addicted': 'Not Addicted', 'addicted': 'Addicted'}
pivot.columns = [column_map.get(c, str(c)) for c in pivot.columns]

# Plot
pivot.plot(kind='bar', stacked=True, figsize=(8, 6), colormap='Set2')
plt.title('Addiction Percentage by Daily Screen Time Level')
plt.xlabel('Daily Screen Time Level')
plt.ylabel('Percentage')
plt.legend(title='Addiction Status')
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()


From the chart above, we conclude that there are almost no non-addicted people among those who use their mobile 8 to 12 hours daily (the "High" level).

In [None]:
#Categorize "social_media_usage" column to show correlation with the target column "addicted"
def social_media_usage_levels(col):
    """Bucket a single social-media usage value into a usage level.

    Args:
        col: One numeric value taken from the "social_media_usage" column.

    Returns:
        "Low" for values <= 1, "Intermediate" for values in (1, 4], "High" for
        values above 4, and NaN when the input is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing value
    # would fall through both checks and be labelled "High".
    if pd.isna(col):
        return np.nan
    if col <= 1:
        return "Low"
    if col <= 4:
        return "Intermediate"
    return "High"

# Label every row with its social-media usage level (element-wise call of the function above).
data["social_media_usage_levels"] = data["social_media_usage"].map(social_media_usage_levels)

In [None]:
# Row count per social-media usage level.
# seaborn/matplotlib are already imported by an earlier cell, so they are not re-imported here.
# `order` pins the bars to Low -> Intermediate -> High instead of first-appearance order.
sns.countplot(x='social_media_usage_levels', data=data, order=['Low', 'Intermediate', 'High'])
plt.title('Row Count by Social Media Usage Level')
plt.show()

In [None]:
# Row count per social-media usage level, split by addiction status.
# seaborn/matplotlib are already imported by an earlier cell, so they are not re-imported here.
# `order` pins the bars to Low -> Intermediate -> High instead of first-appearance order.
sns.countplot(x='social_media_usage_levels', hue='addicted', data=data,
              order=['Low', 'Intermediate', 'High'])
plt.title('Addiction Status Count by Social Media Usage Level')
plt.show()

In [None]:
# Share of each addiction status within every social-media usage level

# Count rows per (usage level, addiction status) pair
grouped = data.groupby(['social_media_usage_levels', 'addicted'], observed=True).size().reset_index(name='count')

# Convert the counts to percentages within each usage level
total_per_bin = grouped.groupby('social_media_usage_levels', observed=True)['count'].transform('sum')
grouped['percentage'] = grouped['count'] / total_per_bin * 100

# One row per level, one column per status; a missing combination counts as 0%
pivot = grouped.pivot(index='social_media_usage_levels', columns='addicted', values='percentage').fillna(0)

# pivot() leaves the levels in alphabetical order (High, Intermediate, Low);
# put them back in their natural order, skipping any level that has no rows.
level_order = [level for level in ('Low', 'Intermediate', 'High') if level in pivot.index]
pivot = pivot.loc[level_order]

# Readable legend labels. The target presumably still holds its raw string labels here
# (a later cell maps 'not addicted'/'addicted' to 0/1), so a 0/1-only mapping never
# matched; both encodings are handled.
column_map = {0: 'Not Addicted', 1: 'Addicted',
              'not addicted': 'Not Addicted', 'addicted': 'Addicted'}
pivot.columns = [column_map.get(c, str(c)) for c in pivot.columns]

pivot.plot(kind='bar', stacked=True, figsize=(8, 6), colormap='Set2')
plt.title('Addiction Percentage by Social Media Usage Levels')
# Axis label spelled out in full so it matches the title and the column being plotted
plt.xlabel('Social Media Usage Level')
plt.ylabel('Percentage')
plt.legend(title='Addiction Status')
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()


The more people use social media, the more likely they are to be addicted to phone usage.

In [None]:
#Categorize "night_usage" column to show correlation with the target column "addicted"
def night_usage_levels(col):
    """Bucket a single night-usage value into a usage level.

    Args:
        col: One numeric value taken from the "night_usage" column.

    Returns:
        "Low" for values <= 1, "Intermediate" for values in (1, 4], "High" for
        values above 4, and NaN when the input is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing value
    # would fall through both checks and be labelled "High".
    if pd.isna(col):
        return np.nan
    if col <= 1:
        return "Low"
    if col <= 4:
        return "Intermediate"
    return "High"

# Label every row with its night-usage level (element-wise call of the function above).
data["night_usage_levels"] = data["night_usage"].map(night_usage_levels)

In [None]:
# Share of each addiction status within every night-usage level

# Count rows per (night usage level, addiction status) pair
grouped = data.groupby(['night_usage_levels', 'addicted'], observed=True).size().reset_index(name='count')

# Convert the counts to percentages within each night usage level
total_per_bin = grouped.groupby('night_usage_levels', observed=True)['count'].transform('sum')
grouped['percentage'] = grouped['count'] / total_per_bin * 100

# One row per level, one column per status; a missing combination counts as 0%
pivot = grouped.pivot(index='night_usage_levels', columns='addicted', values='percentage').fillna(0)

# pivot() leaves the levels in alphabetical order (High, Intermediate, Low);
# put them back in their natural order, skipping any level that has no rows.
level_order = [level for level in ('Low', 'Intermediate', 'High') if level in pivot.index]
pivot = pivot.loc[level_order]

# Readable legend labels. The target presumably still holds its raw string labels here
# (a later cell maps 'not addicted'/'addicted' to 0/1), so a 0/1-only mapping never
# matched; both encodings are handled.
column_map = {0: 'Not Addicted', 1: 'Addicted',
              'not addicted': 'Not Addicted', 'addicted': 'Addicted'}
pivot.columns = [column_map.get(c, str(c)) for c in pivot.columns]

# Plot
pivot.plot(kind='bar', stacked=True, figsize=(8, 6), colormap='Set2')
plt.title('Addiction Percentage by Night Usage Level')
plt.xlabel('Night Usage Level')
plt.ylabel('Percentage')
plt.legend(title='Addiction Status')
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()


Since "night_usage" is the only column with 100% addiction in the high range, this indicates that night usage has a great impact on addiction.

In [None]:
#Categorize "work_study_hours" column to show correlation with the target column "addicted"
def work_study_hours_levels(col):
    """Bucket a single work/study-hours value into a level.

    Args:
        col: One numeric value taken from the "work_study_hours" column.

    Returns:
        "Low" for values <= 4, "Intermediate" for values in (4, 9], "High" for
        values above 9, and NaN when the input is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing value
    # would fall through both checks and be labelled "High".
    if pd.isna(col):
        return np.nan
    if col <= 4:
        return "Low"
    if col <= 9:
        return "Intermediate"
    return "High"

# Label every row with its work/study-hours level (element-wise call of the function above).
data["work_study_hours_levels"] = data["work_study_hours"].map(work_study_hours_levels)

In [None]:
# Share of each addiction status within every work/study-hours level

# Count rows per (work/study level, addiction status) pair
grouped = data.groupby(['work_study_hours_levels', 'addicted'], observed=True).size().reset_index(name='count')

# Convert the counts to percentages within each work/study level
total_per_bin = grouped.groupby('work_study_hours_levels', observed=True)['count'].transform('sum')
grouped['percentage'] = grouped['count'] / total_per_bin * 100

# One row per level, one column per status; a missing combination counts as 0%
pivot = grouped.pivot(index='work_study_hours_levels', columns='addicted', values='percentage').fillna(0)

# pivot() leaves the levels in alphabetical order (High, Intermediate, Low);
# put them back in their natural order, skipping any level that has no rows.
level_order = [level for level in ('Low', 'Intermediate', 'High') if level in pivot.index]
pivot = pivot.loc[level_order]

# Readable legend labels. The target presumably still holds its raw string labels here
# (a later cell maps 'not addicted'/'addicted' to 0/1), so a 0/1-only mapping never
# matched; both encodings are handled.
column_map = {0: 'Not Addicted', 1: 'Addicted',
              'not addicted': 'Not Addicted', 'addicted': 'Addicted'}
pivot.columns = [column_map.get(c, str(c)) for c in pivot.columns]

pivot.plot(kind='bar', stacked=True, figsize=(8, 6), colormap='Set2')
plt.title('Addiction Percentage by Work_Study Hours Levels')
plt.xlabel('Work_Study Hours Level')
plt.ylabel('Percentage')
plt.legend(title='Addiction Status')
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()


The more hours a person works or studies, the less likely they are to be addicted to phone usage, and vice versa.

In [None]:
def age_categories(col):
    """Assign a single age value to its generation bucket.

    Args:
        col: One numeric value taken from the "age" column.

    Returns:
        "Gen-Z(15-25)" for ages <= 25, "Gen-Y(26-40)" for ages in (25, 40],
        "Gen-X(41-55)" for ages above 40, and NaN when the input is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing age
    # would fall through both checks and be labelled "Gen-X(41-55)".
    if pd.isna(col):
        return np.nan
    if col <= 25:
        return "Gen-Z(15-25)"
    # Inclusive upper bound: the label says 26-40, so age 40 belongs here.
    # The previous `col < 40` pushed 40-year-olds into "Gen-X(41-55)".
    if col <= 40:
        return "Gen-Y(26-40)"
    return "Gen-X(41-55)"

# Label every row with its age category (element-wise call of the function above).
data["age_categories"] = data["age"].map(age_categories)

In [None]:
# Row count per age category, split by addiction status.
# seaborn/matplotlib are already imported by an earlier cell, so they are not re-imported here.
# `order` pins the bars youngest -> oldest instead of first-appearance order.
sns.countplot(x='age_categories', hue='addicted', data=data,
              order=['Gen-Z(15-25)', 'Gen-Y(26-40)', 'Gen-X(41-55)'])
plt.title('Addiction Status Count by Age Category')
plt.show()

In [None]:
# Share of each addiction status within every age category

# Count rows per (age category, addiction status) pair
grouped = data.groupby(['age_categories', 'addicted'], observed=True).size().reset_index(name='count')

# Convert the counts to percentages within each age category
total_per_bin = grouped.groupby('age_categories', observed=True)['count'].transform('sum')
grouped['percentage'] = grouped['count'] / total_per_bin * 100

# One row per category, one column per status; a missing combination counts as 0%
pivot = grouped.pivot(index='age_categories', columns='addicted', values='percentage').fillna(0)

# pivot() leaves the categories in alphabetical order (Gen-X, Gen-Y, Gen-Z);
# show them youngest -> oldest instead, skipping any category that has no rows.
category_order = [cat for cat in ('Gen-Z(15-25)', 'Gen-Y(26-40)', 'Gen-X(41-55)') if cat in pivot.index]
pivot = pivot.loc[category_order]

# Readable legend labels. The target presumably still holds its raw string labels here
# (a later cell maps 'not addicted'/'addicted' to 0/1), so a 0/1-only mapping never
# matched; both encodings are handled.
column_map = {0: 'Not Addicted', 1: 'Addicted',
              'not addicted': 'Not Addicted', 'addicted': 'Addicted'}
pivot.columns = [column_map.get(c, str(c)) for c in pivot.columns]

pivot.plot(kind='bar', stacked=True, figsize=(8, 6), colormap='Set2')
plt.title('Addiction Percentage by Age Categories')
plt.xlabel('Age Category')
plt.ylabel('Percentage')
plt.legend(title='Addiction Status')
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()

From the chart above, the percentage of addicted people is significantly higher in Gen-Z, which makes sense.

In [None]:
# Density of gaming_time for each addiction class.
# common_norm=False normalises each class's density independently, so the two curves
# can be compared by shape even when the classes differ in size.
sns.kdeplot(data=data, x='gaming_time', hue='addicted', common_norm=False)
plt.title('Gaming Time Distribution by Addiction Status')
# Explicit show() renders the figure without echoing the Axes repr as cell output.
plt.show()

From the KDE plot above, it seems that the more time a person spends gaming, the more likely they are to be addicted.

In [None]:
# Pairwise correlations between the numeric columns (non-numeric columns are left out)
numeric_data = data.select_dtypes(include='number')
corr_matrix = numeric_data.corr()

# Heatmap styling: annotated cells with two decimals, diverging palette centred on zero
heatmap_style = dict(
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_style)
plt.title('Correlation Matrix of Numeric Features')
plt.tight_layout()
plt.show()

In [None]:
# Compare the daily_screen_time distribution between the addiction classes
ax = sns.boxplot(data=data, x='addicted', y='daily_screen_time')
ax.set_title('Daily Screen Time by Addiction Status')
plt.show()

Addicted people spend more time on their phones, which makes sense.

In [None]:
# Column dtypes before the target is converted to numeric in the next cell.
data.info()

In [None]:
from scipy.stats import pointbiserialr

# Map 'addicted' column to numeric binary.
# Guarded so the cell can be re-run: pushing an already-numeric column through the
# string dictionary would turn every value into NaN and empty all correlations below.
if not pd.api.types.is_numeric_dtype(data['addicted']):
    data['addicted'] = data['addicted'].map({'not addicted': 0, 'addicted': 1})

# Identify numeric columns safely (excluding 'addicted')
numeric_cols = [col for col in data.select_dtypes(include=['number']).columns
                if col != 'addicted']

# Dictionary to store correlation results
correlations = {}

# Compute point-biserial correlation between the binary target and each numeric column
for col in numeric_cols:
    # A constant column has no variance, so its correlation is undefined
    if data[col].nunique() > 1:
        subset = data[['addicted', col]].dropna()
        if len(subset) > 2:  # At least 3 valid rows needed
            corr, _ = pointbiserialr(subset['addicted'], subset[col])
            correlations[col] = corr

# Display correlations, strongest (by absolute value) first
for col, val in sorted(correlations.items(), key=lambda x: abs(x[1]), reverse=True):
    print(f"{col}: {val:.2f}")


In [None]:
# Check the dtypes again; 'addicted' was mapped to 0/1 in the previous cell.
data.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Weight of each feature in the addiction score (negative = lowers the score).
# NOTE(review): these look like the rounded point-biserial correlations printed by the
# earlier correlation cell -- confirm, and keep them in sync if the data changes.
score_weights = {
    'notifications': 0.79,
    'app_sessions': 0.67,
    'stress_level': 0.54,
    'age': -0.37,
}

# Min-max scale the score features first so the weights act on comparable ranges
scaler = MinMaxScaler()
cols = list(score_weights)
data[cols] = scaler.fit_transform(data[cols])

# Addiction score = weighted sum of the scaled features, accumulated in the order listed above
data['Addiction_Score'] = sum(weight * data[name] for name, weight in score_weights.items())

In [None]:
# Preview the frame including the new 'Addiction_Score' column.
data.head()

In [None]:
# Summary statistics after min-max scaling the score columns (their min/max should now be 0 and 1).
data.describe()

In [None]:
# Features that were not used to build the addiction score
non_selected_features = ['social_media_usage', 'daily_screen_time', 'night_usage', 'work_study_hours','gaming_time','apps_installed']

# Scale them with a dedicated scaler: calling fit_transform on the shared `scaler` again
# would overwrite the min/max it learned for the score columns.
data[non_selected_features] = MinMaxScaler().fit_transform(data[non_selected_features])

In [None]:
# Summary statistics after scaling the remaining feature columns.
data.describe()

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the Addiction Score and every feature left out of it
correlations = {}
for col in non_selected_features:
    subset = data[['Addiction_Score', col]].dropna()
    if subset[col].nunique() <= 1:
        continue  # constant column: correlation is undefined, skip it
    corr, _ = pearsonr(subset['Addiction_Score'], subset[col])
    correlations[col] = corr

# Print the results, strongest (by absolute value) first
ranked = sorted(correlations.items(), key=lambda item: abs(item[1]), reverse=True)
for col, val in ranked:
    print(f"{col}: {val:.2f}")


The addiction score column makes sense because it shows a positive correlation with the "social_media_usage", "daily_screen_time", "night_usage", "gaming_time", and "apps_installed" columns and a negative correlation with the "work_study_hours" column. We can therefore consider the "Addiction_Score" column a good indicator of addiction.

In [None]:
# Final look at the frame before the feature/target column lists are defined.
data.head()

In [None]:
# Column names (not data) used as model inputs.
# NOTE(review): 'Addiction_Score' is computed as a fixed weighted sum of notifications,
# app_sessions, stress_level and age, and all four are listed here, so a linear model can
# presumably reproduce the target almost exactly -- confirm this is intended. The original
# label 'addicted' is also used as an input.
X = [
    'daily_screen_time',
    'app_sessions',
    'social_media_usage',
    'gaming_time',
    'notifications',
    'night_usage',
    'age',
    'work_study_hours',
    'stress_level',
    'apps_installed',
    'addicted',
]

# Target column name, kept as a one-element list so data[y] is a single-column DataFrame
y = ['Addiction_Score']

In [None]:
# Hold out 25% of the rows as the test set; the fixed random_state keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data[X],
    data[y],
    random_state=101,
    test_size=0.25,
)

In [None]:
# Fit an ordinary least-squares linear regression (multiple features) on the training split
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predicting the Test set results
# y_pred holds the model's Addiction_Score estimates for the rows in X_test.
y_pred = regressor.predict(X_test)

In [None]:
from sklearn import metrics

# scikit-learn metrics take (y_true, y_pred). MSE and MAE are symmetric, but R^2 is not:
# it is normalised by the variance of its first argument, so the swapped call
# r2_score(y_pred, y_test) could report a different score than the real one.
mse = metrics.mean_squared_error(y_test, y_pred)

# Header added so the output reads like the Ridge and Lasso reports below it
print("Linear Regression:")
print("MSE:", mse)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mse))
print("r2_score:", metrics.r2_score(y_test, y_pred))

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression; alpha is the L2 regularization strength
ridge_reg = Ridge(alpha=1.0).fit(X_train, y_train)

# Predict on the held-out rows
y_pred_ridge = ridge_reg.predict(X_test)

# Evaluate: compute the MSE once and derive the RMSE from it
ridge_mse = metrics.mean_squared_error(y_test, y_pred_ridge)
ridge_scores = {
    "MSE": ridge_mse,
    "MAE": metrics.mean_absolute_error(y_test, y_pred_ridge),
    "RMSE": np.sqrt(ridge_mse),
    "r2_score": metrics.r2_score(y_test, y_pred_ridge),
}

print("Ridge Regression:")
for label, value in ridge_scores.items():
    print(f"{label}:", value)


In [None]:
from sklearn.linear_model import Lasso

# Lasso regression; alpha is the L1 regularization strength
lasso_reg = Lasso(alpha=0.005).fit(X_train, y_train)

# Predict on the held-out rows
y_pred_lasso = lasso_reg.predict(X_test)

# Evaluate: compute the MSE once and derive the RMSE from it
lasso_mse = metrics.mean_squared_error(y_test, y_pred_lasso)
lasso_scores = {
    "MSE": lasso_mse,
    "MAE": metrics.mean_absolute_error(y_test, y_pred_lasso),
    "RMSE": np.sqrt(lasso_mse),
    "r2_score": metrics.r2_score(y_test, y_pred_lasso),
}

print("\nLasso Regression:")
for label, value in lasso_scores.items():
    print(f"{label}:", value)
