In [38]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Read the screen-time CSV into `data` and print its first five rows as a quick sanity check.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/screentime-app-details-dataset/Screentime-App-Details-Dataset.csv"
)
print(data.head(n=5))

         Date  Usage  Notifications  Times opened        App
0  08/26/2022     38             70            49  Instagram
1  08/27/2022     39             43            48  Instagram
2  08/28/2022     64            231            55  Instagram
3  08/29/2022     14             35            23  Instagram
4  08/30/2022      3             19             5  Instagram


In [39]:
# Count missing values per column (`isna` is the canonical name; `isnull` is its alias).
data.isna().sum()

Date             0
Usage            0
Notifications    0
Times opened     0
App              0
dtype: int64

In [40]:
# Print summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = data.describe()
print(summary_stats)

            Usage  Notifications  Times opened
count   54.000000      54.000000     54.000000
mean    65.037037     117.703704     61.481481
std     58.317272      97.017530     43.836635
min      1.000000       8.000000      2.000000
25%     17.500000      25.750000     23.500000
50%     58.500000      99.000000     62.500000
75%     90.500000     188.250000     90.000000
max    244.000000     405.000000    192.000000


In [41]:
# Bar chart of the "Usage" column by "Date", with bars coloured by "App".
figure = px.bar(data, x="Date", y="Usage", color="App", title="Usage")
figure.show()



In [42]:

# Bar chart of the "Notifications" column by "Date", with bars coloured by "App".
figure = px.bar(data, x="Date", y="Notifications", color="App", title="Notifications")
figure.show()



In [None]:
# Bar chart of the "Times opened" column by "Date", with bars coloured by "App".
figure = px.bar(data, x="Date", y="Times opened", color="App", title="Times Opened")
figure.show()

In [None]:
# Scatter of "Usage" against "Notifications". Marker size also encodes
# "Notifications", and an ordinary-least-squares trendline is overlaid.
figure = px.scatter(
    data,
    x="Notifications",
    y="Usage",
    size="Notifications",
    trendline="ols",
    title="Relationship Between Number of Notifications and Usage",
)
figure.show()

In [None]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

# --- Feature engineering ---------------------------------------------------
# Parse the "Date" column into datetimes, then derive the weekday number
# (pandas convention: Monday=0 ... Sunday=6) as an additional predictor.
data['Date'] = pd.to_datetime(data['Date'])
data['Day_of_week'] = data['Date'].dt.weekday

# --- Train / test split ----------------------------------------------------
# Target is "Usage"; predictors are the weekday plus the two activity counters.
# 20% of the rows are held out, with a fixed seed so the split is repeatable.
y = data['Usage']
X = data[['Day_of_week', 'Notifications', 'Times opened']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Fit -------------------------------------------------------------------
# `fit` returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# --- Evaluate on the held-out rows -----------------------------------------
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error:", mae)

# --- Actual vs predicted ---------------------------------------------------
# Points close to the diagonal correspond to accurate predictions.
plt.scatter(y_test, predictions)
plt.title("Actual vs Predicted Screen Time Usage")
plt.xlabel("Actual Usage")
plt.ylabel("Predicted Usage")
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix

# Thresholds used to bucket (continuous) usage values into ordinal classes.
# Only the first three entries act as class boundaries; the last entry is kept
# for backward compatibility but is not used as a cut point. The classes are:
#   0: usage < 50        (low)
#   1: 50 <= usage < 100 (moderate)
#   2: 100 <= usage < 150 (high)
#   3: usage >= 150      (very high)
thresholds = [50, 100, 150, 200]
class_edges = thresholds[:3]
class_ids = list(range(len(class_edges) + 1))

# Human-readable tick labels derived from the boundaries, so the axes describe
# the actual class ranges rather than the raw threshold values.
class_names = (
    [f"< {class_edges[0]}"]
    + [f"{lo}-{hi}" for lo, hi in zip(class_edges, class_edges[1:])]
    + [f">= {class_edges[-1]}"]
)

# np.digitize (default right=False) returns i such that
# edges[i-1] <= x < edges[i], i.e. the same strict "<" bucketing described
# above, applied identically to predictions and ground truth.
predicted_classes = np.digitize(predictions, class_edges).tolist()
actual_classes = np.digitize(y_test, class_edges).tolist()

# Pass `labels` explicitly so the matrix is always 4x4, even when the small
# test split does not contain every class; otherwise the matrix would shrink
# and the tick labels below would no longer line up with its rows/columns.
cm = confusion_matrix(actual_classes, predicted_classes, labels=class_ids)

# Visualize confusion matrix (rows: true class, columns: predicted class)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
tick_marks = np.arange(len(class_ids))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()


In [None]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, GridSearchCV
import shap

# Feature Importance Analysis
# Permutation importance shuffles one feature at a time on the held-out rows
# and records how much the model's score changes. The shuffling is random, so
# the seed is pinned to keep the chart reproducible across runs (the same seed
# value is used for the train/test split).
perm_importance = permutation_importance(model, X_test, y_test, random_state=42)

# Sort ascending so the most important feature ends up at the top of the
# horizontal bar chart.
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(X.columns[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
plt.title("Feature Importance Analysis")
plt.show()

# Hyperparameter Tuning
# The grid is empty, so no hyperparameters are searched: the grid search
# cross-validates (5 folds) a single candidate using the estimator's current
# settings, and the reported best parameters are therefore {}.
param_grid = {}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# Cross-Validation
# 5-fold cross-validation over the full feature matrix and target, scored with
# the estimator's default scorer (R^2 for LinearRegression).
cv_scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

# Model Interpretability
# SHAP is given the model's bare prediction callable together with the
# training rows as background data, then asked to explain the held-out rows.
predict_fn = model.predict
explainer = shap.Explainer(predict_fn, X_train)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test)

# Pipeline Construction (if necessary)
# Include data preprocessing, feature engineering, model training, and evaluation steps in a pipeline



In [None]:
## Analysis and Conclusion

### Hyperparameter Tuning Results
- Best parameters: {}
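- No hyperparameters were tuned for the Linear Regression model: the search grid was empty, so the grid search only cross-validated the model's default configuration.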

### Cross-Validation Scores
- The mean cross-validation score (R², the default scoring metric for `LinearRegression`) was approximately 0.54.
- Scores varied across folds:
  - Fold 1: 0.44
  - Fold 2: 0.65
  - Fold 3: 0.82
  - Fold 4: 0.23
  - Fold 5: 0.53

### Interpretation
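- The scores range from 0.23 to 0.82 across folds, so performance depends strongly on which rows are held out; with only 54 observations, each fold contains very few samples.
- A mean R² of about 0.54 means the model explains roughly half of the variance in usage, which suggests moderate overall performance.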
- Feature importance analysis and SHAP values provide insights into predictive features and their impact.

### Conclusion
- The Linear Regression model shows moderate performance without hyperparameter tuning.
- Cross-validation scores indicate variability but reasonable overall performance.
- Insights from feature importance analysis and SHAP values guide model interpretation.
- Further improvements could involve feature engineering, model selection, or data collection.
