# Imports

In [1]:
import pandas as pd
import numpy as np

# Load Dataset

In [2]:
# Read the CSV file into a DataFrame and preview its first rows.
csv_path = 'dataset/mobile_addiction_data.csv'
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,User_ID,Country,Age,Gender,Occupation,Education_Level,Income_USD,Daily_Screen_Time_Hours,Phone_Unlocks_Per_Day,Social_Media_Usage_Hours,...,Online_Shopping_Hours,Internet_Connection_Type,Primary_Device_Brand,Has_Screen_Time_Management_App,Self_Reported_Addiction_Level,Monthly_Data_Usage_GB,Has_Night_Mode_On,Age_First_Phone,Push_Notifications_Per_Day,Tech_Savviness_Score
0,1,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,...,1.85,5G,Other,No,Low,16.43,Yes,16,106,39.36
1,2,UK,26,Male,Artist,Master's,41868.19,9.05,61,3.13,...,0.66,4G,Samsung,Yes,Severe,32.87,No,12,111,9.45
2,3,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,...,-0.14,WiFi,Samsung,Yes,Severe,27.45,No,18,90,50.27
3,4,UK,44,Female,Engineer,,39022.07,6.71,80,1.6,...,0.17,3G,Apple,Yes,Moderate,30.85,No,17,60,30.82
4,5,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,...,0.58,3G,Xiaomi,No,High,10.38,Yes,18,127,21.7


In [35]:
# Remove the columns considered unnecessary.
# errors='ignore' skips any label that is already absent, so re-running
# this cell after the columns are gone does not raise.
unused_columns = [
    'User_ID',
    'Primary_Device_Brand',
    'Internet_Connection_Type',
    'Has_Screen_Time_Management_App',
    'Has_Night_Mode_On',
]
df = df.drop(columns=unused_columns, errors='ignore')

df.head()

Unnamed: 0,Country,Age,Gender,Occupation,Education_Level,Income_USD,Daily_Screen_Time_Hours,Phone_Unlocks_Per_Day,Social_Media_Usage_Hours,Gaming_Usage_Hours,...,Relationship_Status,Has_Children,Urban_or_Rural,Time_Spent_With_Family_Hours,Online_Shopping_Hours,Self_Reported_Addiction_Level,Monthly_Data_Usage_GB,Age_First_Phone,Push_Notifications_Per_Day,Tech_Savviness_Score
0,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,-1.55,...,Single,No,Rural,1.7,1.85,Low,16.43,16,106,39.36
1,UK,26,Male,Artist,Master's,41868.19,9.05,61,3.13,2.5,...,In Relationship,No,Rural,0.9,0.66,Severe,32.87,12,111,9.45
2,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,1.96,...,Single,Yes,Urban,1.69,-0.14,Severe,27.45,18,90,50.27
3,UK,44,Female,Engineer,,39022.07,6.71,80,1.6,2.7,...,In Relationship,No,Urban,1.11,0.17,Moderate,30.85,17,60,30.82
4,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,1.73,...,Divorced,No,Urban,1.06,0.58,High,10.38,18,127,21.7


In [3]:
# Restrict the check to numeric columns; text columns cannot be negative.
numeric_df = df.select_dtypes(include=[np.number])

# Boolean frame: True wherever a numeric value is below zero.
negatives_mask = numeric_df.lt(0)

# Per-column count of negative entries.
negatives_count = negatives_mask.sum()

print("=== Negative Number Summary ===")
for col, count in negatives_count.items():
    status = f"{count} negative values" if count > 0 else "No negative values"
    print(f"{col}: {status}")

=== Negative Number Summary ===
User_ID: No negative values
Age: No negative values
Income_USD: 81 negative values
Daily_Screen_Time_Hours: 9 negative values
Phone_Unlocks_Per_Day: 1 negative values
Social_Media_Usage_Hours: 70 negative values
Gaming_Usage_Hours: 213 negative values
Streaming_Usage_Hours: 67 negative values
Messaging_Usage_Hours: 2 negative values
Work_Related_Usage_Hours: 53 negative values
Sleep_Hours: No negative values
Physical_Activity_Hours: 80 negative values
Mental_Health_Score: No negative values
Depression_Score: No negative values
Anxiety_Score: No negative values
Stress_Level: No negative values
Time_Spent_With_Family_Hours: 96 negative values
Online_Shopping_Hours: 181 negative values
Monthly_Data_Usage_GB: 16 negative values
Age_First_Phone: No negative values
Push_Notifications_Per_Day: 3 negative values
Tech_Savviness_Score: No negative values


In [4]:
# Select every row in which at least one numeric column is negative.
has_negative = negatives_mask.any(axis="columns")
rows_with_negatives = df.loc[has_negative]

rows_with_negatives.head()

Unnamed: 0,User_ID,Country,Age,Gender,Occupation,Education_Level,Income_USD,Daily_Screen_Time_Hours,Phone_Unlocks_Per_Day,Social_Media_Usage_Hours,...,Online_Shopping_Hours,Internet_Connection_Type,Primary_Device_Brand,Has_Screen_Time_Management_App,Self_Reported_Addiction_Level,Monthly_Data_Usage_GB,Has_Night_Mode_On,Age_First_Phone,Push_Notifications_Per_Day,Tech_Savviness_Score
0,1,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,...,1.85,5G,Other,No,Low,16.43,Yes,16,106,39.36
2,3,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,...,-0.14,WiFi,Samsung,Yes,Severe,27.45,No,18,90,50.27
4,5,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,...,0.58,3G,Xiaomi,No,High,10.38,Yes,18,127,21.7
12,13,USA,18,Female,Engineer,Bachelor's,16430.69,5.01,75,1.12,...,-0.23,4G,Samsung,Yes,Severe,29.76,Yes,8,112,37.84
17,18,Mexico,18,Male,Doctor,PhD,40513.1,6.06,68,1.38,...,0.9,4G,Xiaomi,No,Severe,31.2,No,13,12,47.86


In [5]:
# Clamp negative values to 0 in the columns listed below.
# Despite its name, `time_columns` holds more than hour measurements: it also
# contains counts (Phone_Unlocks_Per_Day, Push_Notifications_Per_Day) and a
# data volume (Monthly_Data_Usage_GB).
# NOTE(review): Income_USD is not in this list, so its negative values are
# left untouched by this cell — confirm whether that is intended.
time_columns = [
    "Daily_Screen_Time_Hours",
    "Phone_Unlocks_Per_Day",
    "Social_Media_Usage_Hours",
    "Gaming_Usage_Hours",
    "Streaming_Usage_Hours",
    "Messaging_Usage_Hours",
    "Work_Related_Usage_Hours",
    "Sleep_Hours",
    "Physical_Activity_Hours",
    "Time_Spent_With_Family_Hours",
    "Online_Shopping_Hours",
    "Monthly_Data_Usage_GB",
    "Push_Notifications_Per_Day"
]

df[time_columns] = df[time_columns].clip(lower=0)

# Verify the clamp rather than trusting it. `< 0` evaluates to False for NaN,
# so missing values do not trip this check.
remaining_negatives = (df[time_columns] < 0).sum()
assert not remaining_negatives.any(), (
    f"Negative values remain after clipping: "
    f"{remaining_negatives[remaining_negatives > 0].to_dict()}"
)

# `negatives_mask` was computed BEFORE clipping, so this selects the rows that
# previously contained a negative value. It serves as a before/after view of
# the affected rows, not as a fresh search for negatives.
rows_with_negatives = df[negatives_mask.any(axis=1)]

rows_with_negatives.head()

Unnamed: 0,User_ID,Country,Age,Gender,Occupation,Education_Level,Income_USD,Daily_Screen_Time_Hours,Phone_Unlocks_Per_Day,Social_Media_Usage_Hours,...,Online_Shopping_Hours,Internet_Connection_Type,Primary_Device_Brand,Has_Screen_Time_Management_App,Self_Reported_Addiction_Level,Monthly_Data_Usage_GB,Has_Night_Mode_On,Age_First_Phone,Push_Notifications_Per_Day,Tech_Savviness_Score
0,1,India,32,Male,Salesperson,High School,43865.49,5.81,75,0.84,...,1.85,5G,Other,No,Low,16.43,Yes,16,106,39.36
2,3,Germany,70,Other,Doctor,High School,59636.51,5.76,58,2.12,...,0.0,WiFi,Samsung,Yes,Severe,27.45,No,18,90,50.27
4,5,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,...,0.58,3G,Xiaomi,No,High,10.38,Yes,18,127,21.7
12,13,USA,18,Female,Engineer,Bachelor's,16430.69,5.01,75,1.12,...,0.0,4G,Samsung,Yes,Severe,29.76,Yes,8,112,37.84
17,18,Mexico,18,Male,Doctor,PhD,40513.1,6.06,68,1.38,...,0.9,4G,Xiaomi,No,Severe,31.2,No,13,12,47.86


In [39]:
# Keep only the rows that have a missing value in at least one column.
has_missing = df.isna().any(axis="columns")
nan_rows = df.loc[has_missing]

print("=== Rows containing NaN values ===")
nan_rows.head()

=== Rows containing NaN values ===


Unnamed: 0,Country,Age,Gender,Occupation,Education_Level,Income_USD,Daily_Screen_Time_Hours,Phone_Unlocks_Per_Day,Social_Media_Usage_Hours,Gaming_Usage_Hours,...,Relationship_Status,Has_Children,Urban_or_Rural,Time_Spent_With_Family_Hours,Online_Shopping_Hours,Self_Reported_Addiction_Level,Monthly_Data_Usage_GB,Age_First_Phone,Push_Notifications_Per_Day,Tech_Savviness_Score
3,UK,44,Female,Engineer,,39022.07,6.71,80,1.6,2.7,...,In Relationship,No,Urban,1.11,0.17,Moderate,30.85,17,60,30.82
4,Brazil,46,Other,Student,,-783.98,6.31,136,1.51,1.73,...,Divorced,No,Urban,1.06,0.58,High,10.38,18,127,21.7
8,Nigeria,33,Other,Salesperson,,26629.11,7.45,50,2.02,0.49,...,Divorced,No,Urban,0.77,0.1,High,24.49,8,85,31.52
16,USA,51,Other,Manager,,41764.15,8.19,47,1.29,1.72,...,In Relationship,Yes,Rural,1.64,0.37,Moderate,23.55,12,149,10.65
18,USA,33,Male,Manager,,6521.03,5.79,47,0.0,0.34,...,Married,Yes,Rural,0.0,0.77,Severe,41.45,11,99,14.92


In [8]:
# Find rows that exactly repeat an earlier row, report how many there are,
# and drop them when any exist.
duplicate_rows = df.loc[df.duplicated()]
n_duplicates = len(duplicate_rows)

print(f"Number of duplicate rows found: {n_duplicates}")

if n_duplicates == 0:
    print(" No duplicate rows found.")
else:
    df = df.drop_duplicates()
    print(" Duplicate rows removed successfully.")

print(f"New dataset shape: {df.shape}")


Number of duplicate rows found: 0
 No duplicate rows found.
New dataset shape: (3000, 34)
