# Imports

In [1]:
import pandas as pd
import numpy as np

# Load Dataset

In [2]:
# Read the mobile-addiction CSV into the working frame `df`.
df = pd.read_csv(filepath_or_buffer='dataset/mobile_addiction_data.csv')

# Preview the first five rows (rendered as the cell output).
df.head(n=5)

Unnamed: 0,User_ID,Country,Age,Gender,Occupation,Education_Level,Income_USD,Daily_Screen_Time_Hours,Phone_Unlocks_Per_Day,Social_Media_Usage_Hours,...,Online_Shopping_Hours,Internet_Connection_Type,Primary_Device_Brand,Has_Screen_Time_Management_App,Self_Reported_Addiction_Level,Monthly_Data_Usage_GB,Has_Night_Mode_On,Age_First_Phone,Push_Notifications_Per_Day,Tech_Savviness_Score
0,1,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,...,1.85,5G,Other,No,Low,16.43,Yes,16,106,39.36
1,2,UK,26,Male,Artist,Master's,41868.19,9.05,61,3.13,...,0.66,4G,Samsung,Yes,Severe,32.87,No,12,111,9.45
2,3,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,...,-0.14,WiFi,Samsung,Yes,Severe,27.45,No,18,90,50.27
3,4,UK,44,Female,Engineer,,39022.07,6.71,80,1.6,...,0.17,3G,Apple,Yes,Moderate,30.85,No,17,60,30.82
4,5,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,...,0.58,3G,Xiaomi,No,High,10.38,Yes,18,127,21.7


In [3]:
# Report the frame's dimensions and the dtype pandas inferred for each column.
row_count, column_count = df.shape
print("Number of rows::", row_count)
print("Number of columns::", column_count)
print("Column Names &  Data Types::\n", df.dtypes)

Number of rows:: 3000
Number of columns:: 34
Column Names &  Data Types::
 User_ID                             int64
Country                            object
Age                                 int64
Gender                             object
Occupation                         object
Education_Level                    object
Income_USD                        float64
Daily_Screen_Time_Hours           float64
Phone_Unlocks_Per_Day               int64
Social_Media_Usage_Hours          float64
Gaming_Usage_Hours                float64
Streaming_Usage_Hours             float64
Messaging_Usage_Hours             float64
Work_Related_Usage_Hours          float64
Sleep_Hours                       float64
Physical_Activity_Hours           float64
Mental_Health_Score               float64
Depression_Score                  float64
Anxiety_Score                     float64
Stress_Level                      float64
Relationship_Status                object
Has_Children                       object
U

In [4]:
# Locate missing data: which columns contain it, how many rows are affected,
# and the positions of the first few affected rows.
null_mask = df.isnull()
row_has_null = null_mask.any(axis=1)

columns_with_nulls = df.columns[null_mask.any()].tolist()
print("Columns with Missing Values::", columns_with_nulls)
print("Number of rows with Missing Values::", row_has_null.sum())

# nonzero() yields the 0-based positions of the True entries; show only the first ten.
null_row_positions = row_has_null.to_numpy().nonzero()[0].tolist()
print("Sample Indices with missing data::", null_row_positions[0:10])

Columns with Missing Values:: ['Education_Level']
Number of rows with Missing Values:: 612
Sample Indices with missing data:: [3, 4, 8, 16, 18, 40, 44, 45, 55, 66]


In [5]:
# Normalise column names: trim surrounding whitespace, lower-case,
# and turn spaces into underscores.
trimmed_names = df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df.columns = lowered_names.str.replace(' ', '_')


In [6]:
# Structural overview: dtype and non-null count for every column.
print("General Statistics::\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Descriptive statistics for all columns, numeric and non-numeric alike.
print("Summary Statistics::\n")
print(df.describe(include='all'))

General Statistics::

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   user_id                         3000 non-null   int64  
 1   country                         3000 non-null   object 
 2   age                             3000 non-null   int64  
 3   gender                          3000 non-null   object 
 4   occupation                      3000 non-null   object 
 5   education_level                 2388 non-null   object 
 6   income_usd                      3000 non-null   float64
 7   daily_screen_time_hours         3000 non-null   float64
 8   phone_unlocks_per_day           3000 non-null   int64  
 9   social_media_usage_hours        3000 non-null   float64
 10  gaming_usage_hours              3000 non-null   float64
 11  streaming_usage_hours           3000 non-null   float64
 12  messaging_us

In [7]:
# Drop the columns considered unnecessary for the rest of the notebook.
columns_to_drop = [
    'user_id',
    'primary_device_brand',
    'internet_connection_type',
    'has_screen_time_management_app',
    'has_night_mode_on',
]

# errors='ignore' skips any listed column that is not present, so re-running
# this cell after the columns are already gone does not raise.
df = df.drop(columns=columns_to_drop, errors='ignore')

df.head()

Unnamed: 0,country,age,gender,occupation,education_level,income_usd,daily_screen_time_hours,phone_unlocks_per_day,social_media_usage_hours,gaming_usage_hours,...,relationship_status,has_children,urban_or_rural,time_spent_with_family_hours,online_shopping_hours,self_reported_addiction_level,monthly_data_usage_gb,age_first_phone,push_notifications_per_day,tech_savviness_score
0,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,-1.55,...,Single,No,Rural,1.7,1.85,Low,16.43,16,106,39.36
1,UK,26,Male,Artist,Master's,41868.19,9.05,61,3.13,2.5,...,In Relationship,No,Rural,0.9,0.66,Severe,32.87,12,111,9.45
2,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,1.96,...,Single,Yes,Urban,1.69,-0.14,Severe,27.45,18,90,50.27
3,UK,44,Female,Engineer,,39022.07,6.71,80,1.6,2.7,...,In Relationship,No,Urban,1.11,0.17,Moderate,30.85,17,60,30.82
4,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,1.73,...,Divorced,No,Urban,1.06,0.58,High,10.38,18,127,21.7


In [8]:
# Count the negative entries in every numeric column.
numeric_df = df.select_dtypes(include=[np.number])
negatives_mask = numeric_df.lt(0)        # True wherever a value is below zero
negatives_count = negatives_mask.sum()   # number of negatives per column

print("=== Negative Number Summary ===")
for col, count in negatives_count.items():
    summary = f"{col}: {count} negative values" if count > 0 else f"{col}: No negative values"
    print(summary)

=== Negative Number Summary ===
age: No negative values
income_usd: 81 negative values
daily_screen_time_hours: 9 negative values
phone_unlocks_per_day: 1 negative values
social_media_usage_hours: 70 negative values
gaming_usage_hours: 213 negative values
streaming_usage_hours: 67 negative values
messaging_usage_hours: 2 negative values
work_related_usage_hours: 53 negative values
sleep_hours: No negative values
physical_activity_hours: 80 negative values
mental_health_score: No negative values
depression_score: No negative values
anxiety_score: No negative values
stress_level: No negative values
time_spent_with_family_hours: 96 negative values
online_shopping_hours: 181 negative values
monthly_data_usage_gb: 16 negative values
age_first_phone: No negative values
push_notifications_per_day: 3 negative values
tech_savviness_score: No negative values


In [9]:
# Keep only the rows in which at least one numeric column holds a negative value.
has_negative = negatives_mask.any(axis=1)
rows_with_negatives = df.loc[has_negative]

rows_with_negatives.head()

Unnamed: 0,country,age,gender,occupation,education_level,income_usd,daily_screen_time_hours,phone_unlocks_per_day,social_media_usage_hours,gaming_usage_hours,...,relationship_status,has_children,urban_or_rural,time_spent_with_family_hours,online_shopping_hours,self_reported_addiction_level,monthly_data_usage_gb,age_first_phone,push_notifications_per_day,tech_savviness_score
0,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,-1.55,...,Single,No,Rural,1.7,1.85,Low,16.43,16,106,39.36
2,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,1.96,...,Single,Yes,Urban,1.69,-0.14,Severe,27.45,18,90,50.27
4,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,1.73,...,Divorced,No,Urban,1.06,0.58,High,10.38,18,127,21.7
12,USA,18,Female,Engineer,Bachelor's,16430.69,5.01,75,1.12,0.69,...,Divorced,Yes,Urban,2.16,-0.23,Severe,29.76,8,112,37.84
17,Mexico,18,Male,Doctor,PhD,40513.1,6.06,68,1.38,-0.52,...,In Relationship,No,Urban,2.33,0.9,Severe,31.2,13,12,47.86


In [10]:
# Floor negative values at 0 in the listed columns (usage hours, per-day
# counts and monthly data volume).
# NOTE(review): income_usd also contains negative values (see the summary
# above) but is not in this list, so it is left untouched here.
time_columns = [
    "daily_screen_time_hours", "phone_unlocks_per_day",
    "social_media_usage_hours", "gaming_usage_hours",
    "streaming_usage_hours", "messaging_usage_hours",
    "work_related_usage_hours", "sleep_hours",
    "physical_activity_hours", "time_spent_with_family_hours",
    "online_shopping_hours", "monthly_data_usage_gb",
    "push_notifications_per_day",
]

clipped = df[time_columns].clip(lower=0)
df[time_columns] = clipped

# Re-select with the mask that was computed before clipping, so the same rows
# are displayed again, now with their floored values.
rows_with_negatives = df.loc[negatives_mask.any(axis=1)]

rows_with_negatives.head()

Unnamed: 0,country,age,gender,occupation,education_level,income_usd,daily_screen_time_hours,phone_unlocks_per_day,social_media_usage_hours,gaming_usage_hours,...,relationship_status,has_children,urban_or_rural,time_spent_with_family_hours,online_shopping_hours,self_reported_addiction_level,monthly_data_usage_gb,age_first_phone,push_notifications_per_day,tech_savviness_score
0,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,0.0,...,Single,No,Rural,1.7,1.85,Low,16.43,16,106,39.36
2,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,1.96,...,Single,Yes,Urban,1.69,0.0,Severe,27.45,18,90,50.27
4,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,1.73,...,Divorced,No,Urban,1.06,0.58,High,10.38,18,127,21.7
12,USA,18,Female,Engineer,Bachelor's,16430.69,5.01,75,1.12,0.69,...,Divorced,Yes,Urban,2.16,0.0,Severe,29.76,8,112,37.84
17,Mexico,18,Male,Doctor,PhD,40513.1,6.06,68,1.38,0.0,...,In Relationship,No,Urban,2.33,0.9,Severe,31.2,13,12,47.86


In [11]:
# Collect every row that has at least one missing (NaN) entry.
row_has_nan = df.isna().any(axis=1)
nan_rows = df.loc[row_has_nan]

print("=== Rows containing NaN values ===")
nan_rows.head()

=== Rows containing NaN values ===


Unnamed: 0,country,age,gender,occupation,education_level,income_usd,daily_screen_time_hours,phone_unlocks_per_day,social_media_usage_hours,gaming_usage_hours,...,relationship_status,has_children,urban_or_rural,time_spent_with_family_hours,online_shopping_hours,self_reported_addiction_level,monthly_data_usage_gb,age_first_phone,push_notifications_per_day,tech_savviness_score
3,UK,44,Female,Engineer,,39022.07,6.71,80,1.6,2.7,...,In Relationship,No,Urban,1.11,0.17,Moderate,30.85,17,60,30.82
4,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,1.73,...,Divorced,No,Urban,1.06,0.58,High,10.38,18,127,21.7
8,Nigeria,33,Other,Salesperson,,26629.11,7.45,50,2.02,0.49,...,Divorced,No,Urban,0.77,0.1,High,24.49,8,85,31.52
16,USA,51,Other,Manager,,41764.15,8.19,47,1.29,1.72,...,In Relationship,Yes,Rural,1.64,0.37,Moderate,23.55,12,149,10.65
18,USA,33,Male,Manager,,6521.03,5.79,47,0.0,0.34,...,Married,Yes,Rural,0.0,0.77,Severe,41.45,11,99,14.92


In [12]:
# Detect rows that repeat an earlier row in full, and drop them if any exist.
duplicate_rows = df.loc[df.duplicated()]
duplicate_total = duplicate_rows.shape[0]

print(f"Number of duplicate rows found: {duplicate_total}")

if duplicate_total == 0:
    print(" No duplicate rows found.")
else:
    df = df.drop_duplicates()
    print(" Duplicate rows removed successfully.")

print(f"New dataset shape: {df.shape}")

Number of duplicate rows found: 0
 No duplicate rows found.
New dataset shape: (3000, 29)


In [13]:
# Convert Yes/No columns to numeric flags, then store has_children as a boolean.

def _encode_yes_no(column):
    """Map 'Yes'/'No' to 1/0 when every non-null value is 'Yes' or 'No'.

    Columns containing any other non-null value are returned unchanged.
    """
    non_null = column.dropna()
    if non_null.isin(['Yes', 'No']).all():
        return column.map({'Yes': 1, 'No': 0})
    return column


df = df.apply(_encode_yes_no)

df['has_children'] = df['has_children'].astype(bool)

df.head()

Unnamed: 0,country,age,gender,occupation,education_level,income_usd,daily_screen_time_hours,phone_unlocks_per_day,social_media_usage_hours,gaming_usage_hours,...,relationship_status,has_children,urban_or_rural,time_spent_with_family_hours,online_shopping_hours,self_reported_addiction_level,monthly_data_usage_gb,age_first_phone,push_notifications_per_day,tech_savviness_score
0,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,0.0,...,Single,False,Rural,1.7,1.85,Low,16.43,16,106,39.36
1,UK,26,Male,Artist,Master's,41868.19,9.05,61,3.13,2.5,...,In Relationship,False,Rural,0.9,0.66,Severe,32.87,12,111,9.45
2,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,1.96,...,Single,True,Urban,1.69,0.0,Severe,27.45,18,90,50.27
3,UK,44,Female,Engineer,,39022.07,6.71,80,1.6,2.7,...,In Relationship,False,Urban,1.11,0.17,Moderate,30.85,17,60,30.82
4,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,1.73,...,Divorced,False,Urban,1.06,0.58,High,10.38,18,127,21.7


In [14]:
# Encode education level and relationship status as integer codes.
# This is label encoding (one integer per category), not one-hot encoding;
# missing values (NaN) are given the sentinel code -1.

education_type = { 'High School': 1, 'Bachelor': 2, 'Master': 3, 'PhD': 4 , np.nan: -1 }
relationship_status_type = { 'Single': 1, 'In Relationship': 2, 'Married': 3, 'Divorced': 4, np.nan: -1 }

# The raw data spells two levels with a possessive suffix ("Bachelor's" and
# "Master's") while the mapping keys do not. Both spellings must be normalised:
# Series.map turns any value without a matching key into NaN, so leaving
# "Bachelor's" as-is would silently leave those rows without a code.
df['education_level'] = df['education_level'].replace({
    "Master's": "Master",
    "Bachelor's": "Bachelor",
})
df['education_level_encoded'] = df['education_level'].map(education_type)
df['relationship_status_encoded'] = df['relationship_status'].map(relationship_status_type)

df.head()

Unnamed: 0,country,age,gender,occupation,education_level,income_usd,daily_screen_time_hours,phone_unlocks_per_day,social_media_usage_hours,gaming_usage_hours,...,urban_or_rural,time_spent_with_family_hours,online_shopping_hours,self_reported_addiction_level,monthly_data_usage_gb,age_first_phone,push_notifications_per_day,tech_savviness_score,education_level_encoded,relationship_status_encoded
0,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,0.0,...,Rural,1.7,1.85,Low,16.43,16,106,39.36,1.0,1
1,UK,26,Male,Artist,Master,41868.19,9.05,61,3.13,2.5,...,Rural,0.9,0.66,Severe,32.87,12,111,9.45,3.0,2
2,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,1.96,...,Urban,1.69,0.0,Severe,27.45,18,90,50.27,1.0,1
3,UK,44,Female,Engineer,,39022.07,6.71,80,1.6,2.7,...,Urban,1.11,0.17,Moderate,30.85,17,60,30.82,-1.0,2
4,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,1.73,...,Urban,1.06,0.58,High,10.38,18,127,21.7,-1.0,4


In [16]:
# Integer-code the remaining categorical columns. Each mapping assigns one
# integer per category and uses -1 for missing values (NaN).
urban_or_rural_type = {'Urban': 1, 'Rural': 2, np.nan: -1}

self_reported_addiction_level_type = {'Low': 1, 'Moderate': 2, 'High': 3, 'Severe': 4, np.nan: -1}

gender_type = {'Male': 1, 'Female': 2, 'Other': 3, np.nan: -1}

# Add one "<column>_encoded" column per source column, in this order.
for source_column, code_map in (
    ('urban_or_rural', urban_or_rural_type),
    ('self_reported_addiction_level', self_reported_addiction_level_type),
    ('gender', gender_type),
):
    df[source_column + '_encoded'] = df[source_column].map(code_map)

df.head()

Unnamed: 0,country,age,gender,occupation,education_level,income_usd,daily_screen_time_hours,phone_unlocks_per_day,social_media_usage_hours,gaming_usage_hours,...,self_reported_addiction_level,monthly_data_usage_gb,age_first_phone,push_notifications_per_day,tech_savviness_score,education_level_encoded,relationship_status_encoded,urban_or_rural_encoded,self_reported_addiction_level_encoded,gender_encoded
0,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,0.0,...,Low,16.43,16,106,39.36,1.0,1,2,1,1
1,UK,26,Male,Artist,Master,41868.19,9.05,61,3.13,2.5,...,Severe,32.87,12,111,9.45,3.0,2,2,4,1
2,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,1.96,...,Severe,27.45,18,90,50.27,1.0,1,1,4,3
3,UK,44,Female,Engineer,,39022.07,6.71,80,1.6,2.7,...,Moderate,30.85,17,60,30.82,-1.0,2,1,2,2
4,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,1.73,...,High,10.38,18,127,21.7,-1.0,4,1,3,3
