*Student Distraction
Prediction Using
Phone Usage Data*

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("student_distraction_dataset.csv")



In [3]:
df["Unlock_Count"] = (df["Daily_Screen_Time_Hours"] * np.random.uniform(2.5, 4.5, size=len(df))).astype(int)
df["Notifications_Received"] = (df["Social_Media_Usage_Hours"] * np.random.uniform(8, 15, size=len(df))).astype(int)


In [4]:
# Relax thresholds and use OR condition instead of AND
df["Is_Distracted"] = (
    (df["Daily_Screen_Time_Hours"] > 4) |
    (df["Social_Media_Usage_Hours"] > 2) |
    (df["Productivity_App_Usage_Hours"] < 2)
).astype(int)


In [8]:
# Add 10% random noise to simulate real-life distractions
df["Is_Distracted"] = df["Is_Distracted"] | (np.random.rand(len(df)) < 0.1)


In [10]:
# Features & Target
# ==========================
X = df[[
    'Age', 'Total_App_Usage_Hours', 'Daily_Screen_Time_Hours',
    'Number_of_Apps_Used', 'Social_Media_Usage_Hours',
    'Productivity_App_Usage_Hours', 'Gaming_App_Usage_Hours',
    'Unlock_Count', 'Notifications_Received'
]]
y = df['Is_Distracted']


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Model Training
# ==========================
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [14]:
# Model Evaluation
# ==========================
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.995
Confusion Matrix:
 [[  9   1]
 [  0 190]]
Classification Report:
               precision    recall  f1-score   support

       False       1.00      0.90      0.95        10
        True       0.99      1.00      1.00       190

    accuracy                           0.99       200
   macro avg       1.00      0.95      0.97       200
weighted avg       1.00      0.99      0.99       200



In [None]:
# ==========================
# Confusion Matrix Visualization
# ==========================
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Generate predictions
y_pred = model.predict(X_test)

# Create confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Focused (0)', 'Distracted (1)'])
disp.plot(cmap='Blues', values_format='d')
plt.title("Confusion Matrix - Model Performance")
plt.show()


In [16]:
# Feature Importance
# ==========================
importances = model.feature_importances_
features = X.columns
plt.figure(figsize=(8,6))
plt.barh(features, importances)
plt.xlabel("Feature Importance")
plt.title("What Most Affects Distraction")
plt.show()

In [2]:
# Pie chart: Focused vs Distracted
# ==========================
labels = ['Focused', 'Distracted']
sizes = [len(df[df['Is_Distracted']==0]), len(df[df['Is_Distracted']==1])]

plt.figure(figsize=(6,6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=['#4CAF50','#F44336'])
plt.title("Distribution of Focused vs Distracted Students")
plt.show()


NameError: name 'df' is not defined

In [14]:

# ==========================
# Scatter Plot: Screen Time vs Social Media Usage
# ==========================
plt.figure(figsize=(8,6))
plt.scatter(df['Daily_Screen_Time_Hours'], df['Social_Media_Usage_Hours'],
            c=df['Is_Distracted'], cmap='bwr', alpha=0.6)
plt.xlabel("Daily Screen Time (Hours)")
plt.ylabel("Social Media Usage (Hours)")
plt.title("Screen Time vs Social Media Usage Colored by Distraction")
plt.show()