# Imports

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import Binarizer

# Use Dataset

In [None]:
# Load the dataset from a CSV file; the path is relative to the working directory.
dataset_path = 'dataset/mobile_addiction_data.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows (only rendered when run as a notebook cell).
df.head()

In [None]:

# Dataset shape
row_count, column_count = df.shape
print("Number of rows::", row_count)
print("Number of columns::", column_count)
print("==================================================================")

# Attribute types: column names split by dtype family
# (object dtype -> categorical, int64/float64 -> numerical).
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
print("Atributet kategorike", categorical_columns)
print("Atributet numerike", numerical_columns)

In [None]:
# Normalise column names: trim surrounding whitespace, lower-case,
# then replace spaces with underscores.
stripped_names = df.columns.str.strip()
lowered_names = stripped_names.str.lower()
df.columns = lowered_names.str.replace(' ', '_')

In [None]:
# Structural overview of the frame.
print("General Statistics::\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Descriptive statistics for every column (numeric and non-numeric alike).
print("Summary Statistics::\n")
print(df.describe(include='all'))

In [None]:
# Drop columns that are not used by the rest of the analysis.
# errors='ignore' keeps this safe to run when a column is already absent.
columns_to_drop = ['user_id', 'primary_device_brand', 'internet_connection_type']
df = df.drop(columns=columns_to_drop, errors='ignore')

df.head()

In [None]:

# Missing values: compute the null mask once and derive every summary from it.
missing_mask = df.isnull()
rows_missing = missing_mask.any(axis=1)

print("Columns with Missing Values::", df.columns[missing_mask.any()].tolist())
print("Number of rows with Missing Values::", rows_missing.sum())
# Positional (not label) indices of the first ten rows that contain a null.
print("Sample Indices with missing data::", rows_missing.to_numpy().nonzero()[0].tolist()[0:10])

In [None]:
# Count negative entries in every numeric column.
numeric_df = df.select_dtypes(include=[np.number])
negatives_mask = numeric_df < 0
negatives_count = negatives_mask.sum()

print("=== Negative Number Summary ===")
for col, count in negatives_count.items():
    summary = f"{count} negative values" if count > 0 else "No negative values"
    print(f"{col}: {summary}")

In [None]:
# Rows in which at least one numeric column holds a negative value.
has_negative = negatives_mask.any(axis=1)
rows_with_negatives = df[has_negative]

rows_with_negatives.head()

In [None]:
# Clamp negative values to 0 in the usage columns below. Despite the variable
# name, the list also holds count/volume columns (unlocks, data usage,
# notifications), not only hour-based ones.
time_columns = [
    "daily_screen_time_hours",
    "phone_unlocks_per_day",
    "social_media_usage_hours",
    "gaming_usage_hours",
    "streaming_usage_hours",
    "messaging_usage_hours",
    "work_related_usage_hours",
    "sleep_hours",
    "physical_activity_hours",
    "time_spent_with_family_hours",
    "online_shopping_hours",
    "monthly_data_usage_gb",
    "push_notifications_per_day",
]

clipped_values = df[time_columns].clip(lower=0)
df[time_columns] = clipped_values

# negatives_mask is not recomputed here, so this re-selects the rows that were
# negative before clipping, letting the preview show their corrected values.
previously_negative = negatives_mask.any(axis=1)
rows_with_negatives = df[previously_negative]

rows_with_negatives.head()

In [None]:
# Per-column NaN counts, reported only for columns that actually have NaNs.
nan_counts = df.isna().sum()
print(nan_counts[nan_counts > 0])

# All rows that contain at least one NaN value.
nan_rows = df[df.isna().any(axis=1)]
print("=== Rows containing NaN values ===")
nan_rows.head()

In [None]:
# Detect fully duplicated rows (every occurrence after the first).
duplicate_rows = df[df.duplicated()]
duplicate_count = len(duplicate_rows)

print(f"Number of duplicate rows found: {duplicate_count}")

if duplicate_count == 0:
    print(" No duplicate rows found.")
else:
    # Keeps the first occurrence; the original index labels are retained,
    # so the index may be non-contiguous afterwards.
    df = df.drop_duplicates()
    print(" Duplicate rows removed successfully.")

print(f"New dataset shape: {df.shape}")

In [None]:
# Typecast Yes/No columns to integers.
def _yes_no_to_int(col):
    """Map a column whose non-null values are all 'Yes'/'No' to 1/0.

    Any other column is returned unchanged.
    """
    only_yes_no = col.dropna().isin(['Yes', 'No']).all()
    if only_yes_no:
        return col.map({'Yes': 1, 'No': 0})
    return col


df = df.apply(_yes_no_to_int)

# NOTE(review): astype(bool) turns NaN into True, so any missing value in
# 'has_children' would silently become True — confirm the column has no NaNs.
df['has_children'] = df['has_children'].astype(bool)

df.head()

In [None]:
# Encode categorical columns as integer codes (a fixed mapping per category,
# not one-hot encoding). The np.nan key is intended to send missing values to
# -1; categories absent from a mapping come out of Series.map as NaN.
education_type = {
    'High School': 1,
    'Bachelor': 2,
    'Master': 3,
    'PhD': 4,
    np.nan: -1,
}
relationship_status_type = {
    'Single': 1,
    'In Relationship': 2,
    'Married': 3,
    'Divorced': 4,
    np.nan: -1,
}

# Normalise the "Master's" spelling before mapping so it matches the key above.
normalized_education = df['education_level'].replace("Master's", "Master")
df['education_level'] = normalized_education
df['education_level_encoded'] = normalized_education.map(education_type)

df['relationship_status_encoded'] = df['relationship_status'].map(relationship_status_type)

df.head()

In [None]:
# Integer-code three more categorical columns. Each mapping carries an np.nan
# key intended to send missing values to -1.
urban_or_rural_type = {'Urban': 1, 'Rural': 2, np.nan: -1}

self_reported_addiction_level_type = {'Low': 1, 'Moderate': 2, 'High': 3, 'Severe': 4, np.nan: -1}

gender_type = {'Male': 1, 'Female': 2, 'Other': 3, np.nan: -1}

# Each source column gets a sibling '<name>_encoded' column, added in this order.
for source_column, mapping in (
    ('urban_or_rural', urban_or_rural_type),
    ('self_reported_addiction_level', self_reported_addiction_level_type),
    ('gender', gender_type),
):
    df[f'{source_column}_encoded'] = df[source_column].map(mapping)

df.head()

In [None]:
# Aggregation
# Create screen time bins.
bins = [0, 2, 4, 6, 8, 24]
labels = ['0-2', '2-4', '4-6', '6-8', '8+']
# pd.cut builds right-closed intervals, so without include_lowest=True a value
# of exactly 0 falls outside the first bin and becomes NaN. Negative screen
# times are clipped to 0 elsewhere in this file, so such rows do occur.
# Values above 24 still fall outside every bin and stay NaN.
df['screen_time_bin'] = pd.cut(
    df['daily_screen_time_hours'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Mean mental-health indicators per screen-time bin. observed=False is spelled
# out so empty bins are still listed and the result does not depend on the
# pandas default for categorical groupers.
df.groupby('screen_time_bin', observed=False)[['mental_health_score', 'depression_score', 'sleep_hours']].mean()

In [None]:
# Bucket ages into labelled groups.
# NOTE(review): pd.cut intervals are right-closed by default, so age 25 lands
# in '<25' and age 60 in '46-60' — confirm the labels describe what is intended.
age_bins = [0, 25, 35, 45, 60, 100]
age_labels = ['<25', '26-35', '36-45', '46-60', '60+']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)

# Mean screen time and stress level per (age group, gender).
screen_and_stress_means = {
    'daily_screen_time_hours': 'mean',
    'stress_level': 'mean',
}
by_age_gender = df.groupby(['age_group', 'gender'])
avg_screen_and_stress_by_age_gender = by_age_gender.agg(screen_and_stress_means).round(2)

# Mean social-media usage and stress level per (area type, education level).
social_and_stress_means = {
    'social_media_usage_hours': 'mean',
    'stress_level': 'mean',
}
by_area_education = df.groupby(['urban_or_rural', 'education_level'])
avg_social_and_stress_by_area_education = by_area_education.agg(social_and_stress_means).round(2)

# Headings are wrapped in ANSI escape codes (bold green) for terminal output.
print("\033[1;32mMesatarja e kohes se shpenzuar ne ekran dhe nivelit te stresit:\033[0m")
print(avg_screen_and_stress_by_age_gender)

print("\n\033[1;32mMesatarja e përdorimit te mediave sociale dhe nivelit te stresit sipas zones dhe nivelit te arsimit:\033[0m")
print(avg_social_and_stress_by_area_education)


In [None]:
# --- 1. Mental health by physical activity ---
# Groups on each distinct value of physical_activity_hours (no binning).
activity_metrics = {
    'mental_health_score': 'mean',
    'depression_score': 'mean',
    'anxiety_score': 'mean',
    'sleep_hours': 'mean',
}
by_activity = df.groupby(['physical_activity_hours'])
mental_health_by_activity = by_activity.agg(activity_metrics).round(2)

print("\033[1;32mMesatarja e shëndetit mendor sipas aktivitetit fizik:\033[0m")
print(mental_health_by_activity.head(10))

# --- 2. Addiction by gender and occupation ---
usage_metrics = {
    'daily_screen_time_hours': 'mean',
    'phone_unlocks_per_day': 'mean',
    'stress_level': 'mean',
}
by_gender_occupation = df.groupby(['gender', 'occupation'])
addiction_by_gender_occupation = by_gender_occupation.agg(usage_metrics).round(2)

print("\n\033[1;32mVarësia nga telefoni sipas gjinisë dhe profesionit:\033[0m")
print(addiction_by_gender_occupation.head(10))

# --- 3. Notifications and stress ---
# notifications_vs_stress = df.groupby(
#     pd.cut(df['push_notifications_per_day'], 
#            bins=[0,50,100,150,200,500], 
#            labels=['<50','50-100','100-150','150-200','200+'])
# ).agg({
#     'stress_level': 'mean',
#     'sleep_hours': 'mean'
# }).round(2)


# print("\n\033[1;32mNiveli i stresit dhe gjumit sipas numrit të njoftimeve:\033[0m")
# print(notifications_vs_stress.head(10))

# --- 4. Income vs tech engagement ---
# income_vs_usage = df.groupby(
#     pd.cut(df['income_usd'], 
#            bins=[0,20000,40000,60000,80000,100000,200000], 
#            labels=['<20k','20-40k','40-60k','60-80k','80-100k','100k+'])
# ).agg({
#     'daily_screen_time_hours': 'mean',
#     'monthly_data_usage_gb': 'mean',
#     'stress_level': 'mean'
# }).round(2)


# # --- PRINT RESULTS ---



# print("\n\033[1;32mKrahasimi i të ardhurave me përdorimin teknologjik:\033[0m")
# print(income_vs_usage.head(10))

In [None]:
# Data sample

# Draw a reproducible 20% random sample of the rows (fixed seed).
sample_data = df.sample(frac=0.2, random_state=42)

# Columns whose full-dataset and sample distributions are compared below.
# NOTE(review): 'urban_or_rural' looks categorical (it is mapped from
# 'Urban'/'Rural' elsewhere in this file) although the list is named
# numeric_cols — confirm that plotting it with kde=True is intended.
numeric_cols = ['age', 'income_usd', 'urban_or_rural', 'stress_level', 'daily_screen_time_hours']

# One figure per column: full data (blue) overlaid with the sample (orange),
# both normalised to density so the differently sized sets are comparable.
for col in numeric_cols:
    plt.figure(figsize=(10,5))
    sns.histplot(df[col], color='blue', label='Full Dataset', kde=True, stat="density", alpha=0.5)
    sns.histplot(sample_data[col], color='orange', label='Sample', kde=True, stat="density", alpha=0.5)
    plt.title(f'Distribution Comparison: {col}')
    plt.xlabel(col)
    plt.ylabel('Density')
    plt.legend()
    plt.show()

In [None]:
# Outlier handling with the IQR rule on 'income_usd'.

Q1 = df["income_usd"].quantile(0.25)
Q2 = df["income_usd"].quantile(0.50)
Q3 = df["income_usd"].quantile(0.75)
Q4 = df["income_usd"].max()

IQR = Q3 - Q1

print(f"Q1 (25th percentile): {Q1}")
print(f"Q2 (Median): {Q2}")
print(f"Q3 (75th percentile): {Q3}")
print(f"Q4 (Max): {Q4}")
print(f"IQR (Q3 - Q1): {IQR}")

# Fences at 1.5 * IQR beyond the quartiles.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print("Lower Bound: ", lower_bound)
print("Upper Bound: ", upper_bound)

below_lower = df["income_usd"] < lower_bound
above_upper = df["income_usd"] > upper_bound

outliers = df[below_lower | above_upper]
print(f"Number of outliers: {len(outliers)}")

sum_above_upper = df.loc[above_upper, "income_usd"].sum()

count_above_upper = above_upper.sum()
count_below_lower = below_lower.sum()
# Kept under its original (misleading) name in case other cells still read it.
count_under_upper = count_below_lower

print(f"Count of values above upper bound: {count_above_upper}")
# Fixed: this line previously repeated the "above upper bound" label while
# reporting the number of values below the lower bound.
print(f"Count of values below lower bound: {count_below_lower}")

# Keep only rows whose income lies inside the fences. Rows with a missing
# income fail both comparisons and are therefore dropped as well. The result
# goes into a separate frame; df itself is left unchanged.
df_no_outliers = df[(df["income_usd"] >= lower_bound) & (df["income_usd"] <= upper_bound)]

print(f"Original dataset size: {len(df)}")
print(f"Dataset size after removing outliers: {len(df_no_outliers)}")

df_no_outliers.describe()

In [None]:
# Feature-subset selection
target_column = "self_reported_addiction_level"

# 1. Split features from the target; the pre-encoded copy of the target is
#    dropped too so it cannot leak into the features.
X = df.drop(columns=[target_column, "self_reported_addiction_level_encoded"], errors='ignore')
y = df[target_column]

# 2. Encode the target labels as integers.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 3. One-hot encode categorical features, force everything numeric and
#    replace any remaining missing values with 0.
X_encoded = pd.get_dummies(X, drop_first=True)
X_encoded = X_encoded.apply(pd.to_numeric, errors='coerce')
X_encoded = X_encoded.fillna(0)

# 4. Correlation with target.
# corrwith aligns on index labels. y_encoded is a plain array, so it must be
# given X_encoded's index; a default 0..n-1 index would pair the wrong rows
# (or drop rows) whenever df's index is not contiguous, e.g. after
# drop_duplicates.
y_series = pd.Series(y_encoded, index=X_encoded.index)
corr = X_encoded.corrwith(y_series)
plt.figure(figsize=(12,5))
corr.sort_values(ascending=False).plot(kind='bar')
plt.title("Feature correlation with target")
plt.show()

# 5. Feature selection using SelectKBest (ANOVA F-test).
selector = SelectKBest(score_func=f_classif, k=5)  # select top 5 features
X_new = selector.fit_transform(X_encoded, y_encoded)
top_features_f = X_encoded.columns[selector.get_support()]
#print("Top 5 features (SelectKBest f_classif):", top_features_f.tolist())

# 6. Recursive Feature Elimination (RFE) with RandomForest.
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=5)
rfe.fit(X_encoded, y_encoded)
top_features_rfe = X_encoded.columns[rfe.support_]
print("Top 5 features (RFE with RandomForest):", top_features_rfe.tolist())

In [None]:
# Discretise 'daily_screen_time_hours' into ordered categories.
bins = [0, 2, 5, 8, np.inf]
labels = ['Low', 'Medium', 'High', 'Very High']
# pd.cut builds right-closed intervals, so without include_lowest=True a value
# of exactly 0 falls outside the first bin and becomes NaN. Negative screen
# times are clipped to 0 elsewhere in this file, so such rows do occur.
df['Screen_Time_Category'] = pd.cut(
    df['daily_screen_time_hours'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
print("\n Diskretizimi u aplikua mbi kolonën 'Daily_Screen_Time_Hours'")
print(df[['daily_screen_time_hours', 'Screen_Time_Category']].head())




In [None]:
# Binarisation: expand each listed column into 0/1 indicator columns and
# append them to df.
binarize_cols = ['gender', 'relationship_status', 'urban_or_rural', 'has_children']

binarized_dfs = []

for col in binarize_cols:
    if col not in df.columns:
        print(f"Kolona '{col}' nuk ekziston — u anashkalua.")
        continue

    # Cast booleans to int first so the indicator columns get _0/_1 suffixes.
    if df[col].dtype == bool:
        df[col] = df[col].astype(int)

    dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
    df = pd.concat([df, dummies], axis=1)
    binarized_dfs.append(dummies)
    print(f" U krye binarizimi për kolonën: {col}")

if not binarized_dfs:
    print("\n Asnjë kolonë nuk u binarizua — kontrollo emrat e kolonave.")
else:
    # Side-by-side preview of every indicator column that was created.
    binarized_result = pd.concat(binarized_dfs, axis=1)
    print("\n--- Kolonat e binarizuara (me 0 dhe 1) ---")
    print(binarized_result.head(5))

df.head()


In [None]:
# Creation of new features.

# Total hours spent on entertainment-type usage. Plain '+' is used on purpose:
# a missing component makes the total NaN instead of being skipped.
social_hours = df['social_media_usage_hours']
gaming_hours = df['gaming_usage_hours']
streaming_hours = df['streaming_usage_hours']
messaging_hours = df['messaging_usage_hours']
df['Total_Entertainment_Hours'] = social_hours + gaming_hours + streaming_hours + messaging_hours

# Mental-health index: mental health score minus the mean of the stress,
# depression and anxiety scores.
distress_mean = (df['stress_level'] + df['depression_score'] + df['anxiety_score']) / 3
df['Overall_Mental_Health_Index'] = df['mental_health_score'] - distress_mean

print(" U krijuan vetitë e reja!")
print(df[['Total_Entertainment_Hours', 'Overall_Mental_Health_Index']].head())


In [None]:
# Give the area indicator columns readable names.
df.rename(
    columns={
        'urban_or_rural_Rural': 'is_rural',
        'urban_or_rural_Urban': 'is_urban',
    },
    inplace=True,
)

# Store the 0/1 indicator columns as booleans.
for flag_column in ("is_rural", "is_urban", "has_children_0", "has_children_1"):
    df[flag_column] = df[flag_column].astype(bool)