In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer


In [None]:
# Load the processed mobile-addiction dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='dataset/mobile_addiction_data_processed.csv')


In [None]:
# Feature-subset selection: split the frame into target and predictors.
target_column = "self_reported_addiction_level"

# Target: integer-encode the class labels.
y = df[target_column]
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Predictors: everything except the target and its already-encoded copy
# (errors='ignore' tolerates the encoded column being absent).
X = df.drop(
    columns=[target_column, "self_reported_addiction_level_encoded"],
    errors='ignore',
)

# One-hot encode the remaining columns, coerce every column to numeric and
# replace any resulting missing values with 0.
X_encoded = (
    pd.get_dummies(X, drop_first=True)
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Correlation of each encoded feature with the integer-encoded target.
# corrwith aligns on index labels, so the target Series is given X_encoded's
# index explicitly; a default 0..n-1 index would mis-pair rows (or yield NaN)
# whenever df does not carry a plain RangeIndex.
corr = X_encoded.corrwith(pd.Series(y_encoded, index=X_encoded.index))
plt.figure(figsize=(12,5))
corr.sort_values(ascending=False).plot(kind='bar')
plt.title("Feature correlation with target")
plt.show()

# Univariate selection: keep the 5 features ranked best by f_classif.
selector = SelectKBest(score_func=f_classif, k=5)
X_new = selector.fit_transform(X_encoded, y_encoded)
top_features_f = X_encoded.columns[selector.get_support()]
# Report the result next to the RFE result so the two selections can be compared.
print("Top 5 features (SelectKBest, f_classif):", top_features_f.tolist())

# Recursive Feature Elimination (RFE) driven by a random forest; the fixed
# random_state keeps the selection reproducible between runs.
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=5)
rfe.fit(X_encoded, y_encoded)
top_features_rfe = X_encoded.columns[rfe.support_]
print("Top 5 features (RFE with RandomForest):", top_features_rfe.tolist())

# Independent working copies of the encoded features, one per normalisation
# technique demonstrated below.
X_minmax = X_encoded.copy()
X_zscore = X_encoded.copy()
X_decimal = X_encoded.copy()

# Worked example: the value `v` of column `col` is normalised by hand and
# compared with what scikit-learn produces.
v = 73600
col = 'income_usd'

print("\n1. Normalizimi Min-Max  v' = (v-min)/(max-min)")
# Numeric columns are derived from X_encoded itself (not from df), so every
# name in the list is guaranteed to exist in the frame being scaled.
numerical_cols = X_encoded.select_dtypes(include=['float64', 'int64']).columns.tolist()

# One named MinMaxScaler per column, so a single column's fitted scaler can be
# looked up afterwards through named_transformers_.
minmax_ct = ColumnTransformer(
    [(c, MinMaxScaler(), [c]) for c in numerical_cols],
    remainder='passthrough'
)

X_minmax[numerical_cols] = minmax_ct.fit_transform(X_encoded[numerical_cols])

# Guard on numerical_cols rather than on X_encoded.columns: only columns in
# that list were scaled and own an entry in named_transformers_. With the
# wider guard a non-float64/int64 `col` would either raise KeyError or print an
# unscaled value labelled as the sklearn result.
if col in numerical_cols:
    min_v = X_encoded[col].min()
    max_v = X_encoded[col].max()
    manual = (v - min_v) / (max_v - min_v)
    print(f"   Manuale  ${v:,} → {manual:.4f}")

    # Prefer the scaled value of a row whose original value is exactly v.
    match = X_minmax.loc[X_encoded[col] == v, col]
    if not match.empty:
        print(f"   Sklearn (saktë) ${v:,} → {match.iloc[0]:.4f}")
    else:
        # No row holds exactly v: push v through the fitted scaler instead.
        # A one-cell DataFrame carries the column name the scaler was fitted
        # with, which avoids sklearn's feature-name mismatch warning.
        scaler_for_col = minmax_ct.named_transformers_[col]
        approx = scaler_for_col.transform(pd.DataFrame({col: [v]}))[0][0]
        print(f"   Sklearn (përafërsim) ${v:,} → {approx:.4f}")

print("\n2. Normalizimi Z-Score  v' = (v-μ)/σ")

# One named StandardScaler per numeric column, so the scaler fitted for a
# single column can be retrieved through named_transformers_.
z_ct = ColumnTransformer(
    [(c, StandardScaler(), [c]) for c in numerical_cols],
    remainder='passthrough'
)

X_zscore[numerical_cols] = z_ct.fit_transform(X_encoded[numerical_cols])

# Only columns in numerical_cols were standardised and have a fitted scaler,
# so the worked example is restricted to them.
if col in numerical_cols:
    mu    = X_encoded[col].mean()
    # StandardScaler divides by the population standard deviation (ddof=0),
    # while pandas' Series.std() defaults to the sample estimate (ddof=1).
    # Using ddof=0 makes the hand computation agree with the sklearn value.
    sigma = X_encoded[col].std(ddof=0)
    manual_z = (v - mu) / sigma
    print(f"   Manuale  ${v:,} → z = ({v}-{mu:,.0f}) / {sigma:,.0f} = {manual_z:.4f}")

    # Prefer the standardised value of a row whose original value is exactly v.
    match_z = X_zscore.loc[X_encoded[col] == v, col]
    if not match_z.empty:
        print(f"   Sklearn (saktë) ${v:,} → {match_z.iloc[0]:.4f}")
    else:
        # No exact row: transform v with the fitted scaler. The one-cell
        # DataFrame keeps the fitted column name and avoids sklearn's
        # feature-name mismatch warning.
        scaler_z = z_ct.named_transformers_[col]
        approx_z = scaler_z.transform(pd.DataFrame({col: [v]}))[0][0]
        print(f"   Sklearn (përafërsim) ${v:,} → {approx_z:.4f}")

print("\n3. Shkallëzimi Dhjetor  v' = v / 10^j  (max |v'| < 1)")
# Decimal scaling: for each numeric column find the smallest power of ten j
# (capped at 20) that brings the largest absolute value below 1.
for c in numerical_cols:
    j = 0
    max_abs = X_encoded[c].abs().max()
    while j < 20 and max_abs >= 1:
        max_abs = max_abs / 10
        j = j + 1
    if j == 0:
        # Column already lies inside (-1, 1): nothing to rescale or report.
        continue
    X_decimal[c] = X_encoded[c] / (10 ** j)
    print(f"   {c}: ÷ 10^{j}  → max abs = {X_decimal[c].abs().max():.4f}")


# The Min-Max result is the version carried forward as the normalised data.
X_normalized = X_minmax.copy()
print("\nTë dhënat finale të normalizuara → X_normalized (Min-Max)")

# Side-by-side histograms of the example column: raw, Min-Max and Z-score.
if col in X_encoded.columns:
    fig, ax = plt.subplots(1, 3, figsize=(15, 4))
    _hist_panels = (
        (X_encoded, 'skyblue', 'Të ardhurat origjinale'),
        (X_minmax, 'lightgreen', 'Min-Max [0,1]'),
        (X_zscore, 'salmon', 'Z-Score'),
    )
    for _axis, (_frame, _colour, _panel_title) in zip(ax, _hist_panels):
        _frame[col].hist(ax=_axis, bins=30, color=_colour, edgecolor='black')
        _axis.set_title(_panel_title)
    plt.tight_layout()
    plt.show()

# NOTE: from here on X_encoded refers to the Min-Max normalised features,
# no longer to the raw encoded ones.
X_encoded = X_normalized.copy()
print(f"\nTransformimi përfundoi! Forma e X_encoded: {X_encoded.shape}")

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors
from sklearn.svm import OneClassSVM


# Section banner for the general anomaly-detection part of the notebook.
print("\n" + "=" * 70)
print("    DETEKTIMI I PËRGJITHSHËM I ANOMALIVE (GENERAL ANOMALY DETECTION)   ")
print("    Metodat: Statistical, Proximity, Density, Clustering, SVM          ")
print("=" * 70 + "\n")

# Candidate numeric columns for anomaly detection.
# NOTE: this rebinds numerical_cols, replacing the list built in the
# normalisation cell.
numerical_cols = [
    'age',
    'income_usd',
    'daily_screen_time_hours',
    'phone_unlocks_per_day',
    'social_media_usage_hours',
    'gaming_usage_hours',
    'streaming_usage_hours',
    'messaging_usage_hours',
    'work_related_usage_hours',
    'sleep_hours',
    'physical_activity_hours',
    'mental_health_score',
    'depression_score',
    'anxiety_score',
    'stress_level',
    'time_spent_with_family_hours',
    'online_shopping_hours',
    'monthly_data_usage_gb',
    'age_first_phone',
    'push_notifications_per_day',
    'tech_savviness_score',
    'Total_Entertainment_Hours',
    'Overall_Mental_Health_Index',
]

# Data preparation: work on the standardised frame X_zscore, restricted to the
# candidate columns that actually exist in it.
available_num_cols = [c for c in numerical_cols if c in X_zscore.columns]
X_anomaly = X_zscore.loc[:, available_num_cols].copy()

print(f"Duke analizuar {X_anomaly.shape[1]} kolona numerike për {X_anomaly.shape[0]} rreshta.\n")

# =============================================================================
# DEFINIMI I FUNKSIONEVE (Për përdorim gjeneral)
# =============================================================================

def detect_statistical_zscore(data, threshold=3.0):
    """
    Statistical detector (slides 3-6): flag rows with an extreme z-score.

    `data` is taken to already hold z-scores; a row is reported as an outlier
    when the absolute value in at least one of its columns exceeds `threshold`.
    Typical thresholds: 3 (very rare), 2.5 (moderate).

    Returns a boolean vector with one entry per row.
    """
    exceeds = np.abs(data) > threshold
    # A single offending column is enough to mark the whole row.
    return exceeds.any(axis=1)

def detect_proximity_knn(data, k=5, percentile=95):
    """
    Proximity detector (slides 12-14): flag points that lie far from their
    k nearest neighbours.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix; must contain more than `k` rows.
    k : int
        Number of neighbours whose distances are averaged.
    percentile : float
        Percentile of the average distances used as the cut-off (e.g. 95 means
        95% of the rows are treated as normal).

    Returns
    -------
    numpy.ndarray of bool
        True for rows whose mean k-NN distance exceeds the cut-off.
    """
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(data)
    # Query without arguments: sklearn then returns the neighbours of the
    # fitted points and does not count a point as its own neighbour. Passing
    # `data` again would return each point itself at distance 0 as the first
    # neighbour, so the mean would cover only k-1 real neighbours.
    distances, _ = knn.kneighbors()
    avg_distances = distances.mean(axis=1)
    threshold = np.percentile(avg_distances, percentile)
    return avg_distances > threshold

def detect_density_lof(data, contamination=0.05, neighbors=20):
    """
    Density detector (slides 15-18): Local Outlier Factor compares local
    densities.

    `contamination` is the expected share of anomalies (e.g. 0.05 = 5%) and
    `neighbors` is forwarded as LOF's `n_neighbors`.

    Returns a boolean array, True where LOF labels the row as an outlier.
    """
    detector = LocalOutlierFactor(contamination=contamination, n_neighbors=neighbors)
    labels = detector.fit_predict(data)
    # fit_predict marks outliers with -1.
    return labels == -1

def detect_clustering_kmeans(data, n_clusters=5, percentile=95):
    """
    Clustering detector (slides 19-23): outliers are the points far from the
    centre (centroid) of their cluster.

    Fits K-Means with `n_clusters` clusters, measures every row's distance to
    its nearest centroid and flags the rows above the given `percentile` of
    those distances.

    Returns a boolean array with one entry per row.
    """
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(data)
    # transform() yields the distance to every centroid; keep the closest one.
    nearest_centroid_dist = model.transform(data).min(axis=1)
    cutoff = np.percentile(nearest_centroid_dist, percentile)
    return nearest_centroid_dist > cutoff

def detect_svm(data, nu=0.05):
    """
    Boundary detector (slides 24-25): a One-Class SVM learns the boundary of
    the normal data.

    `nu` plays a role similar to contamination (upper bound on the fraction of
    errors).

    Returns a boolean array, True where the SVM labels the row as an outlier.
    """
    labels = OneClassSVM(kernel='rbf', gamma='scale', nu=nu).fit_predict(data)
    # fit_predict marks outliers with -1.
    return labels == -1

# =============================================================================
# EKZEKUTIMI I ANALIZËS (Me parametra të ndryshueshëm)
# =============================================================================

# Tunable "strictness" of the detection.
CONTAMINATION_LEVEL = 0.05  # 5% of the data is assumed to be anomalous
Z_SCORE_THRESHOLD = 3.0     # conventional statistical cut-off

print(f"--- Parametrat e Përgjithshëm ---")
print(f"Niveli i Kontaminimit (për LOF/SVM/KNN): {CONTAMINATION_LEVEL * 100}%")
print(f"Pragu i Z-Score: {Z_SCORE_THRESHOLD}")

# Per-method 0/1 flags are collected in a temporary frame that shares df's index.
results = pd.DataFrame(index=df.index)

# 1. Z-score
results['Outlier_ZScore'] = detect_statistical_zscore(X_anomaly, threshold=Z_SCORE_THRESHOLD).astype(int)

# 2. KNN (proximity)
# The contamination level is turned into a percentile (0.05 -> 95).
percentile_thresh = 100 * (1 - CONTAMINATION_LEVEL)
results['Outlier_KNN'] = detect_proximity_knn(X_anomaly, k=5, percentile=percentile_thresh).astype(int)

# 3. LOF (density)
results['Outlier_LOF'] = detect_density_lof(X_anomaly, contamination=CONTAMINATION_LEVEL).astype(int)

# 4. Clustering (K-Means)
results['Outlier_KMeans'] = detect_clustering_kmeans(X_anomaly, n_clusters=5, percentile=percentile_thresh).astype(int)

# 5. One-Class SVM
results['Outlier_SVM'] = detect_svm(X_anomaly, nu=CONTAMINATION_LEVEL).astype(int)

# Consensus: number of methods that flag each row. Computed before the column
# itself exists, so the sum covers only the five detector flags.
results['Total_Votes'] = results.sum(axis=1)

# Attach the flags to the main dataset. Columns left behind by a previous run
# of this cell are dropped first; otherwise a re-run would append duplicate
# columns and df['Total_Votes'] would become a DataFrame instead of a Series.
df = pd.concat([df.drop(columns=results.columns, errors='ignore'), results], axis=1)

# =============================================================================
# RAPORTI PËRMBLEDHËS
# =============================================================================

# Summary report: row count followed by the number of rows each method flagged.
print("\n" + "=" * 40)
print("    REZULTATET E DETEKTIMIT")
print("=" * 40)
print(f"Totali i rreshtave: {len(df)}")
print("-" * 30)
print(f"Anomali sipas Z-Score: {results['Outlier_ZScore'].sum()}")
print(f"Anomali sipas KNN:     {results['Outlier_KNN'].sum()}")
print(f"Anomali sipas LOF:     {results['Outlier_LOF'].sum()}")
print(f"Anomali sipas KMeans:  {results['Outlier_KMeans'].sum()}")
print(f"Anomali sipas SVM:     {results['Outlier_SVM'].sum()}")
print("-" * 30)

# Strong anomalies: rows on which three or more methods agree.
strong_outliers = df.loc[df['Total_Votes'] >= 3]
print(f"\n>>> ANOMALI TË FORTA (Të konfirmuara nga >= 3 metoda): {len(strong_outliers)}")

# Show a few examples of the strong anomalies, if there are any.
if strong_outliers.empty:
    print("Nuk u gjetën anomali të forta me këta parametra.")
else:
    print("\nShembull i anomalive të detektuara (Top 3):")
    cols_to_show = ['daily_screen_time_hours', 'income_usd', 'stress_level', 'Total_Votes']
    # Keep only the columns that exist in df before printing them.
    cols_to_show = [c for c in cols_to_show if c in df.columns]
    print(strong_outliers[cols_to_show].head(3))

# =============================================================================
# VIZUALIZIMI I PËRGJITHSHËM (Të gjitha metodat në një plot)
# =============================================================================
# Project the anomaly-detection features onto two principal components so the
# consensus result can be drawn in a single 2-D scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_anomaly)

# Row selectors: fewer than 3 votes counts as normal, 3 or more as an anomaly.
_normal_rows = (df['Total_Votes'] < 3).to_numpy()
_strong_rows = (df['Total_Votes'] >= 3).to_numpy()

plt.figure(figsize=(12, 8))

# Normal points in a muted colour.
plt.scatter(
    X_pca[_normal_rows, 0],
    X_pca[_normal_rows, 1],
    c='lightgray',
    label='Normal',
    alpha=0.5,
    s=20,
)

# Anomalies, coloured by how many methods voted for them.
sc = plt.scatter(
    X_pca[_strong_rows, 0],
    X_pca[_strong_rows, 1],
    c=df.loc[_strong_rows, 'Total_Votes'],
    cmap='viridis',
    label='Anomali (3+ methods)',
    s=50,
    edgecolor='k',
)
plt.colorbar(sc, label='Numri i Metodave që e quajnë Outlier')

plt.title('Vizualizimi i Përgjithshëm i Anomalive (Konsensusi)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.show()

In [None]:
# Zgjidhen vetëm kolonat numerike nga dataseti
from sklearn.ensemble import IsolationForest


# Columns that this notebook's own anomaly/cluster cells add to df. They are
# detection *results*, not features: feeding them to the Isolation Forest
# would let earlier outlier votes (and, on a re-run, this cell's own output)
# drive the new prediction.
derived_columns = [
    'Outlier_ZScore', 'Outlier_KNN', 'Outlier_LOF', 'Outlier_KMeans',
    'Outlier_SVM', 'Total_Votes', 'Anomali', 'Cluster', 'Outlier',
]

# Only the numeric feature columns of the dataset are used.
numeric_columns = (
    df.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(derived_columns, errors='ignore')
)
numeric_data = df[numeric_columns]

# Isolation Forest: fit_predict returns 1 for normal rows and -1 for anomalies
# (the two values the filters below select on).
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df['Anomali'] = iso_forest.fit_predict(numeric_data)

# Split normal rows and anomalies for plotting.
normal_data = df[df['Anomali'] == 1]
anomalies = df[df['Anomali'] == -1]

sns.set_theme(style='whitegrid')

# Pair plot. pairplot creates its own figure, so no plt.figure() call precedes
# it (that would only leave an empty figure behind). The grid is restricted to
# a handful of columns: a pair plot over every numeric column produces
# hundreds of panels and is both slow and unreadable. If none of the listed
# columns exist, vars=None falls back to seaborn's default of all numeric
# columns.
pairplot_vars = [
    c for c in [
        'daily_screen_time_hours',
        'social_media_usage_hours',
        'sleep_hours',
        'mental_health_score',
        'stress_level',
        'Overall_Mental_Health_Index',
    ]
    if c in numeric_columns
]
pair_grid = sns.pairplot(
    df,
    vars=pairplot_vars or None,
    hue='Anomali',
    palette={1: 'blue', -1: 'red'},
    diag_kind='kde',
    markers=["o", "s"],
)
pair_grid.fig.suptitle('Pair Plot per detektimin e anomalive', y=1.02)
plt.show()

# Additional scatter plot: every feature column against the row index, with
# normal rows in blue and anomalies in red. The loop variable is named
# `column` so the notebook-level `col` used by the normalisation cell is not
# overwritten, and only the first pair of series is labelled so the legend
# shows each class once.
iso_fig, iso_ax = plt.subplots(figsize=(12, 8))
for position, column in enumerate(numeric_columns):
    is_first = position == 0
    iso_ax.scatter(normal_data.index, normal_data[column],
                   label='Normal' if is_first else None, color='b', alpha=0.5, s=20)
    iso_ax.scatter(anomalies.index, anomalies[column],
                   label='Anomali' if is_first else None, color='r', alpha=0.5, s=20)

iso_ax.legend()
iso_ax.set_title('Detektimi i anomalive në dataset:')
iso_ax.set_xlabel('Index')
iso_ax.set_ylabel('Vlera')
plt.show()

In [None]:
from sklearn.cluster import DBSCAN

# Columns fed to DBSCAN (and reused by the later outlier cells).
numerical_columns = [
    'age', 'income_usd', 'daily_screen_time_hours', 'phone_unlocks_per_day',
    'social_media_usage_hours', 'gaming_usage_hours', 'streaming_usage_hours',
    'messaging_usage_hours', 'work_related_usage_hours', 'sleep_hours',
    'physical_activity_hours', 'mental_health_score', 'depression_score',
    'anxiety_score', 'stress_level', 'time_spent_with_family_hours',
    'online_shopping_hours', 'monthly_data_usage_gb', 'age_first_phone',
    'push_notifications_per_day', 'tech_savviness_score',
    'relationship_status_encoded', 'urban_or_rural_encoded',
    'self_reported_addiction_level_encoded', 'gender_encoded',
    'Total_Entertainment_Hours', 'Overall_Mental_Health_Index',
]

# Standardise the selected columns before clustering.
data_numerical = df[numerical_columns]
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numerical)

# Density-based clustering; the resulting label of every row is stored on df.
# NOTE(review): eps=0.5 across this many standardised dimensions may leave
# most rows labelled as noise — check the cluster sizes.
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(data_scaled)
df['Cluster'] = clusters

# 3 x 2 grid of box plots grouped by cluster label. zip() stops at the shorter
# input, so only the first six columns of numerical_columns get a panel.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 12))

for ax, column in zip(axes.ravel(), numerical_columns):
    df.boxplot(column=column, by='Cluster', ax=ax)
    # Set the figure title inside the loop, right after each boxplot call,
    # so it is the last title applied.
    fig.suptitle('Boxplotet e karakteristikave numerike sipas klustereve DBSCAN', fontsize=16)
    ax.set(title=column, xlabel='Klusteri', ylabel=column)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Standardise the numeric columns so every feature contributes on the same
# scale to the K-Means distances.
features = df[numerical_columns]

scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# n_init is set explicitly: scikit-learn's default changed between releases
# (10 -> 'auto'), so leaving it out makes the result version-dependent and
# raises a FutureWarning on some versions. 10 matches the value used by
# detect_clustering_kmeans in this notebook.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(scaled_features)

# Distance of every row to its nearest centroid.
distances = kmeans.transform(scaled_features)
min_distance = np.min(distances, axis=1)

# Rows beyond the 90th percentile of that distance are marked as outliers.
threshold = np.percentile(min_distance, 90)
df['Outlier'] = min_distance > threshold

# Readable label for the plot legend (vectorised instead of a row-wise apply).
df['Type'] = np.where(df['Outlier'], 'Outlier', 'Normal')

# Columns shown in the pair plot.
plot_columns = [
    'daily_screen_time_hours',
    'social_media_usage_hours',
    'sleep_hours',
    'mental_health_score',
    'stress_level',
    'Overall_Mental_Health_Index'
]

pairplot = sns.pairplot(
    df,
    vars=plot_columns,
    hue='Type',
    palette={'Normal': 'blue', 'Outlier': 'red'},
    plot_kws={'alpha': 0.6, 's': 35},
    diag_kind='kde',
    diag_kws={'fill': True}
)

pairplot.fig.suptitle('KMeans Outlier Detection – Pairplot', y=1.02)
plt.show()


In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns


# Z-score based outlier removal over the columns listed in numerical_columns.
# NOTE(review): that list also contains label-encoded categorical columns
# (names ending in "_encoded"); confirm that z-scoring those codes is intended.
threshold = 3
data_numeric = df.loc[:, numerical_columns]

# Absolute z-score of every cell.
z_scores = np.abs(stats.zscore(data_numeric, nan_policy='omit'))

# A row is an outlier as soon as one of its columns exceeds the threshold.
outliers_mask = (z_scores > threshold).any(axis=1)

print("Total rows:", len(df))
print("Outliers detected:", outliers_mask.sum())
print("Rows after removal:", len(df) - outliers_mask.sum())

# Cleaned copy with the flagged rows removed and a fresh 0..n-1 index.
df_clean = df.loc[~outliers_mask].reset_index(drop=True)

sns.set_theme(style="whitegrid")

def plot_boxplots(data, title, columns=None):
    """Draw one box plot per column in a three-column grid under a shared title.

    Parameters
    ----------
    data : DataFrame-like
        Table indexed by column name; ``data[col]`` is plotted for every
        requested column.
    title : str
        Figure-level title.
    columns : sequence of str, optional
        Columns to plot. Defaults to the notebook-level ``numerical_columns``
        list, which keeps existing two-argument calls working unchanged.

    Raises
    ------
    ValueError
        If there is no column to plot (previously this surfaced as an opaque
        matplotlib error about a zero-row grid).
    """
    if columns is None:
        columns = numerical_columns
    columns = list(columns)
    if not columns:
        raise ValueError("plot_boxplots needs at least one column to plot")

    n_cols = 3
    n_rows = int(np.ceil(len(columns) / n_cols))
    # squeeze=False always returns a 2-D array of axes, whatever the grid size.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows*4), squeeze=False)
    axes = axes.flatten()

    for axis, col in zip(axes, columns):
        sns.boxplot(y=data[col], ax=axis)
        axis.set_title(col)

    # Remove the unused panels of the last row. Slicing by len(columns) avoids
    # relying on a loop index leaking out of the loop above.
    for spare_axis in axes[len(columns):]:
        fig.delaxes(spare_axis)

    fig.suptitle(title, fontsize=18)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# Box plots of the numeric columns before and after removing the outliers.
for _frame, _heading in (
    (df, "Distributimi i kolonave numerike - Para heqjes së Outliers"),
    (df_clean, "Distributimi i kolonave numerike - Pas heqjes së Outliers"),
):
    plot_boxplots(_frame, _heading)

# Keep the removed rows available for inspection.
df_outliers = df.loc[outliers_mask]
