<h3><b>Load and Merge Datasets</b></h3>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from math import sqrt

# Read the three input tables, in order:
#   dataset1 -> demographic data
#   dataset2 -> screen time data
#   dataset3 -> well-being indicators
dataset1, dataset2, dataset3 = (
    pd.read_csv(csv_name)
    for csv_name in ('dataset1.csv', 'dataset2.csv', 'dataset3.csv')
)

# Join the three tables on the shared 'ID' column. pandas merges with an inner
# join by default, so only IDs present in all three tables are kept.
merged_data = dataset1.merge(dataset2, on='ID').merge(dataset3, on='ID')

# Use Seaborn's white-grid style for every plot that follows.
sns.set(style="whitegrid")


Defining Screen Time Columns

In [2]:
# Screen-time column names, built as <prefix>_<suffix> in the order
# C_we, C_wk, G_we, G_wk, S_we, S_wk, T_we, T_wk.
# NOTE(review): 'we' is treated as "weekend" elsewhere in this notebook; 'wk' is
# presumably "weekday" and the prefixes presumably identify the device/activity
# — confirm against the dataset codebook.
screen_time_columns = [
    f"{prefix}_{suffix}"
    for prefix in ('C', 'G', 'S', 'T')
    for suffix in ('we', 'wk')
]

<h3><b>Descriptive Statistical Analysis</b></h3>

<h4><b>Investigation 1: Descriptive Statistics for Digital Screen Time</b></h4>

<b>Propose:</b> Analyze and describe the general trends in digital screen time by gender and deprivation status 


<b>Justify:</b> Understanding screen time patterns based on demographic factors like gender and deprivation status can reveal key insights into user behavior.

<h4><b>Execution and Results</b></h4>

Execution based on gender

In [None]:
# Mean of every screen-time column within each value of 'gender'.
print("\nAnalyzing screen time based on demographic factor: gender...")
gender_groups = merged_data.groupby('gender')[screen_time_columns].mean()

# Show the table of group means.
print("\nMean screen time by gender:")
print(gender_groups)

# Visualization 1: grouped bar chart. The table is transposed so that the
# screen-time columns run along the x-axis and each gender value is one series.
print("\nPlotting bar chart of average screen time by gender...")
gender_axes = gender_groups.T.plot.bar(figsize=(10, 6), title="Average Screen Time by Gender")
gender_axes.set_ylabel("Average Screen Time (hours)")
plt.setp(gender_axes.get_xticklabels(), rotation=45)
gender_axes.figure.tight_layout()
plt.show()

Execution based on deprivation status

In [None]:
# Mean of every screen-time column within each value of 'deprived'.
print("\nAnalyzing screen time based on deprivation status...")
deprivation_groups = merged_data.groupby('deprived')[screen_time_columns].mean()

# Show the table of group means.
print("\nMean screen time by deprivation status:")
print(deprivation_groups)

# Visualization 2: grouped bar chart. The table is transposed so that the
# screen-time columns run along the x-axis and each deprivation value is one series.
print("\nPlotting bar chart of average screen time by deprivation status...")
deprivation_axes = deprivation_groups.T.plot.bar(
    figsize=(10, 6), title="Average Screen Time by Deprivation Status"
)
deprivation_axes.set_ylabel("Average Screen Time (hours)")
plt.setp(deprivation_axes.get_xticklabels(), rotation=45)
deprivation_axes.figure.tight_layout()
plt.show()

Calculate and display descriptive statistics

In [None]:
# Per-column summary statistics (mean, median, standard deviation, IQR) for the
# screen-time variables, collected in `screen_time_stats` keyed by column name.
screen_time_stats = {}
for col in screen_time_columns:
    print(f"Calculating statistics for {col}:")

    # Drop missing values once so that all four statistics describe the same
    # observations. Previously the mean/std skipped NaN (pandas behaviour)
    # while np.median/np.percentile returned NaN for any column with a gap.
    observed = merged_data[col].dropna()

    # Mean
    mean_value = observed.mean()
    print(f"  Mean: {mean_value:.2f} hours")

    # Median
    median_value = observed.median()
    print(f"  Median: {median_value:.2f} hours")

    # Standard Deviation: population form (ddof=0), the same convention the
    # earlier np.std call used. The inferential cells use ddof=1 instead.
    std_value = observed.std(ddof=0)
    print(f"  Standard Deviation: {std_value:.2f}")

    # Interquartile Range (IQR): 75th minus 25th percentile
    iqr_value = observed.quantile(0.75) - observed.quantile(0.25)
    print(f"  Interquartile Range (IQR): {iqr_value:.2f}\n")

    # Store stats in dictionary
    screen_time_stats[col] = {
        'mean': mean_value,
        'median': median_value,
        'std': std_value,
        'iqr': iqr_value
    }

# Visualization 3: Pairplot for Screen Time
# sns.pairplot creates its own figure, so no plt.figure() call is made here;
# calling it first only produced an extra blank figure in the output.
print("Creating pairplot for screen time data...")
sns.pairplot(merged_data[screen_time_columns], diag_kind="kde", corner=True)
plt.suptitle("Pairplot of Screen Time Activities (Weekday and Weekend)", y=1.02, fontsize=16)
plt.show()
  

<h4><b>Investigation 2: Descriptive Statistics for Well-being Indicators</b></h4>

<b>Propose:</b> Investigate the trends in well-being indicators based on self-reported responses

<b>Justify:</b> Understanding well-being trends can provide insights into mental and emotional health in relation to screen time.

<h4><b>Execution and Results</b></h4>

Define well-being indicator columns


In [6]:
# Names of the 14 well-being indicator columns, in the order they are analysed.
wellbeing_columns = (
    "Optm Usef Relx Intp Engs Dealpr Thcklr "
    "Goodme Clsep Conf Mkmind Loved Intthg Cheer"
).split()

In [None]:
# Per-column summary statistics (mean, median, standard deviation, IQR) for the
# well-being indicators, collected in `wellbeing_stats` keyed by column name.
wellbeing_stats = {}
for col in wellbeing_columns:
    print(f"Calculating statistics for {col}:")

    # Drop missing values once so that all four statistics describe the same
    # observations. Previously the mean/std skipped NaN (pandas behaviour)
    # while np.median/np.percentile returned NaN for any column with a gap.
    observed = merged_data[col].dropna()

    # Mean
    mean_value = observed.mean()
    print(f"  Mean: {mean_value:.2f}")

    # Median
    median_value = observed.median()
    print(f"  Median: {median_value:.2f}")

    # Standard Deviation: population form (ddof=0), the same convention the
    # earlier np.std call used. The inferential cells use ddof=1 instead.
    std_value = observed.std(ddof=0)
    print(f"  Standard Deviation: {std_value:.2f}")

    # Interquartile Range (IQR): 75th minus 25th percentile
    iqr_value = observed.quantile(0.75) - observed.quantile(0.25)
    print(f"  Interquartile Range (IQR): {iqr_value:.2f}\n")

    # Store stats in dictionary
    wellbeing_stats[col] = {
        'mean': mean_value,
        'median': median_value,
        'std': std_value,
        'iqr': iqr_value
    }

# Visualization 1: Pairplot for Well-being Indicators
# sns.pairplot creates its own figure, so no plt.figure() call is made here;
# calling it first only produced an extra blank figure in the output.
print("Creating pairplot for well-being indicators...")
sns.pairplot(merged_data[wellbeing_columns], diag_kind="kde", corner=True)
plt.suptitle("Pairplot of Well-being Indicators", y=1.02, fontsize=16)
plt.show()

<h3><b>Inferential Statistical Analysis</b></h3>

<h4><b>Investigation 1: Inferential Analysis for Screen Time (T_we) vs Optimism (Optm)</b></h4>

<b>Propose:</b> Investigate the relationship between screen time on weekends and optimism levels

<b>Justify:</b> This analysis can determine if higher screen time correlates with lower optimism levels, providing insight into the impact of screen time on mental well-being.

<h4><b>Hypothesis Test</b></h4>

<b>H0:</b> There is no significant difference between the means of weekend screen time and optimism.

<b>H1:</b> There is a significant difference between the means of weekend screen time and optimism.

<b>Step 1:</b> T-Test and P-Value

In [None]:
# Two-sample t-test (scipy.stats.ttest_ind, default arguments) comparing the
# T_we column with the Optm column.
# NOTE(review): both columns come from the same rows of merged_data, so the
# samples look paired; ttest_ind treats them as independent — confirm this is
# the intended test.
weekend_screen_time = merged_data['T_we']
optimism_scores = merged_data['Optm']
optm_ttest_result = stats.ttest_ind(weekend_screen_time, optimism_scores)
t_stat_optm, p_value_optm = optm_ttest_result

<b>Step 2:</b> Z-Score Calculation

In [None]:
# Sample size, means and sample standard deviations (ddof=1) of the two
# variables being compared. n, mean_T_we and std_T_we are reused by the
# confidence-interval cell that follows.
n = len(merged_data)
mean_T_we = np.mean(merged_data['T_we'])
std_T_we = np.std(merged_data['T_we'], ddof=1)
mean_Optm = np.mean(merged_data['Optm'])
std_Optm = np.std(merged_data['Optm'], ddof=1)

# Standard error of the difference between two sample means:
#   sqrt(s1^2 / n1 + s2^2 / n2), here with n1 = n2 = n.
# The previous denominator used only the T_we standard error and ignored the
# spread of Optm (std_Optm was computed but never used), which overstated |z|.
se_diff_T_we_optm = sqrt(std_T_we ** 2 / n + std_Optm ** 2 / n)

z_score_T_we_optm = (mean_T_we - mean_Optm) / se_diff_T_we_optm

<b>Step 3:</b> Confidence Interval for T_we

In [None]:
# 95% t-based confidence interval for the mean of T_we.
# Relies on n, mean_T_we and std_T_we computed in the z-score cell.
confidence_level = 0.95
degrees_freedom = n - 1
standard_error_T_we = std_T_we / sqrt(n)
confidence_interval_T_we = stats.t.interval(
    confidence_level,
    degrees_freedom,
    loc=mean_T_we,
    scale=standard_error_T_we,
)

<h3><b>Execution and Results</b></h3>

In [None]:
# Report the t-test, z-score and confidence interval computed in the cells above.
optm_report_lines = [
    "\nT-Test Results for Screen Time (Weekend) vs Optimism:",
    f"T-Statistic: {t_stat_optm}, P-Value: {p_value_optm}",
    f"Z-Score: {z_score_T_we_optm}",
    f"95% Confidence Interval for Screen Time (Weekend): {confidence_interval_T_we}",
]
print("\n".join(optm_report_lines))

<h4><b>Interpretation of Hypothesis Test</b></h4>

In [None]:
# Decide at the 5% significance level using the p-value from the t-test cell.
optm_is_significant = p_value_optm < 0.05
optm_verdict = (
    "Reject the null hypothesis (H0): There is a significant difference between weekend screen time and optimism."
    if optm_is_significant
    else "Fail to reject the null hypothesis (H0): There is no significant difference between weekend screen time and optimism."
)
print(optm_verdict)

<h3><b>Visualization</b></h3>

In [None]:
# Scatter plot of T_we against Optm with a fitted regression line
# (blue points, red line), drawn on an explicitly created Axes.
optm_fig, optm_ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    x=merged_data['T_we'],
    y=merged_data['Optm'],
    scatter_kws={"color": "blue"},
    line_kws={"color": "red"},
    ax=optm_ax,
)
optm_ax.set_title("Screen Time (Weekend) vs Optimism")
optm_ax.set_xlabel("Screen Time (Weekend) in Hours")
optm_ax.set_ylabel("Optimism Score")
optm_ax.grid(True)
optm_fig.tight_layout()
plt.show()

<h4><b>Investigation 2: Inferential Analysis for Screen Time (T_we) vs Relaxation (Relx)</b></h4>

<b>Propose:</b> Analyze the relationship between weekend screen time and relaxation levels

<b>Justify:</b> Relaxation is an important component of well-being. This analysis will help determine if higher weekend screen time impacts relaxation levels.

<h4><b>Hypothesis Test</b></h4>

<b>H0:</b> There is no significant difference between the means of weekend screen time and relaxation.

<b>H1:</b> There is a significant difference between the means of weekend screen time and relaxation.

<b>Step 1:</b> T-Test and P-Value

In [None]:
# Two-sample t-test (scipy.stats.ttest_ind, default arguments) comparing the
# T_we column with the Relx column.
# NOTE(review): both columns come from the same rows of merged_data, so the
# samples look paired; ttest_ind treats them as independent — confirm this is
# the intended test.
weekend_screen_time = merged_data['T_we']
relaxation_scores = merged_data['Relx']
relx_ttest_result = stats.ttest_ind(weekend_screen_time, relaxation_scores)
t_stat_relx, p_value_relx = relx_ttest_result

<b>Step 2:</b> Z-Score Calculation

In [None]:
# Sample size, means and sample standard deviations (ddof=1) of the two
# variables being compared. n, mean_Relx and std_Relx are reused by the
# confidence-interval cell that follows.
n = len(merged_data)
mean_T_we = np.mean(merged_data['T_we'])
std_T_we = np.std(merged_data['T_we'], ddof=1)
mean_Relx = np.mean(merged_data['Relx'])
std_Relx = np.std(merged_data['Relx'], ddof=1)

# Standard error of the difference between two sample means:
#   sqrt(s1^2 / n1 + s2^2 / n2), here with n1 = n2 = n.
# The previous denominator used only the T_we standard error and ignored the
# spread of Relx, which overstated |z|.
se_diff_T_we_relx = sqrt(std_T_we ** 2 / n + std_Relx ** 2 / n)

z_score_T_we_relx = (mean_T_we - mean_Relx) / se_diff_T_we_relx

<b>Step 3:</b> Confidence Interval for Relx

In [None]:
# 95% t-based confidence interval for the mean of Relx.
# Relies on n, mean_Relx and std_Relx computed in the z-score cell.
confidence_level = 0.95
degrees_freedom = n - 1
standard_error_relx = std_Relx / sqrt(n)
confidence_interval_relx = stats.t.interval(
    confidence_level,
    degrees_freedom,
    loc=mean_Relx,
    scale=standard_error_relx,
)

<h3><b>Execution and Results</b></h3>

In [None]:
# Report the t-test, z-score and confidence interval computed in the cells above.
relx_report_lines = [
    "\nT-Test Results for Screen Time (Weekend) vs Relaxation:",
    f"T-Statistic: {t_stat_relx}, P-Value: {p_value_relx}",
    f"Z-Score: {z_score_T_we_relx}",
    f"95% Confidence Interval for Relaxation: {confidence_interval_relx}",
]
print("\n".join(relx_report_lines))

<h4><b>Interpretation of Hypothesis Test</b></h4>

In [None]:
# Decide at the 5% significance level using the p-value from the t-test cell.
relx_is_significant = p_value_relx < 0.05
relx_verdict = (
    "Reject the null hypothesis (H0): There is a significant difference between weekend screen time and relaxation."
    if relx_is_significant
    else "Fail to reject the null hypothesis (H0): There is no significant difference between weekend screen time and relaxation."
)
print(relx_verdict)

<h3><b>Visualization</b></h3>

In [None]:
# Figure 1: scatter plot of T_we against Relx with a fitted regression line
# (green points, red line).
relx_fig, relx_ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    x=merged_data['T_we'],
    y=merged_data['Relx'],
    scatter_kws={"color": "green"},
    line_kws={"color": "red"},
    ax=relx_ax,
)
relx_ax.set_title("Screen Time (Weekend) vs Relaxation")
relx_ax.set_xlabel("Screen Time (Weekend) in Hours")
relx_ax.set_ylabel("Relaxation Score")
relx_ax.grid(True)
relx_fig.tight_layout()
plt.show()

# Figure 2: both variables drawn as lines against the row position (0..n-1)
# in merged_data, sharing one y-axis.
row_positions = np.arange(len(merged_data))
trend_fig, trend_ax = plt.subplots(figsize=(10, 6))
for column_name, series_label, series_color in (
    ('T_we', "Screen Time (Weekend)", 'blue'),
    ('Relx', "Relaxation Score", 'green'),
):
    sns.lineplot(
        x=row_positions,
        y=merged_data[column_name],
        label=series_label,
        color=series_color,
        ax=trend_ax,
    )
trend_ax.set_title("Lineplot of Screen Time (Weekend) vs Relaxation Score", fontsize=16)
trend_ax.set_xlabel("Index")
trend_ax.set_ylabel("Screen Time / Relaxation")
trend_ax.legend()
trend_ax.grid(True)
trend_fig.tight_layout()
plt.show()

<h3><b>Optional Analysis</b></h3>

<h4><b>Incorporating Demographic Factors</b></h4>

<b>Propose:</b> Explore screen time patterns based on gender and deprivation status

<b>Justify:</b> Demographic factors may influence screen time behaviors, and this could provide insights into behavior patterns across groups.

<h4><b>Execution and Results</b></h4>

In [None]:
# Mean of every screen-time column within each value of 'gender'.
print("\nAnalyzing screen time based on demographic factor: gender...")
gender_groups = merged_data.groupby('gender')[screen_time_columns].mean()

# Display mean screen time by gender
print("\nMean screen time by gender:")
print(gender_groups)

# Visualization 1: Bar chart of average screen time by gender
print("\nPlotting bar chart of average screen time by gender...")
gender_groups.T.plot(kind='bar', figsize=(10, 6), title="Average Screen Time by Gender")
plt.ylabel("Average Screen Time (hours)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Additional Visualization: Box plot of screen time distribution by gender.
# One box per (screen-time column, gender value); boxes of the same column sit
# side by side and are coloured by gender so the groups can be told apart.
print("\nPlotting box plot of screen time distribution by gender...")

# Use the gender codes actually present in the data instead of assuming 0/1.
gender_values = sorted(merged_data['gender'].dropna().unique())
# Each column's cluster spans 0.8 x-units; with two genders this gives the
# same 0.4-wide boxes at offsets 0 and 0.4 as before.
box_width = 0.8 / max(len(gender_values), 1)
base_positions = np.arange(len(screen_time_columns))

box_fig, box_ax = plt.subplots(figsize=(10, 6))
legend_handles = []
for group_index, gender in enumerate(gender_values):
    gender_rows = merged_data.loc[merged_data['gender'] == gender, screen_time_columns]
    # Pass one NaN-free array per column: a NaN inside a 2-D array would
    # otherwise turn that column's box statistics into NaN.
    per_column_values = [gender_rows[col].dropna().values for col in screen_time_columns]
    drawn = box_ax.boxplot(
        per_column_values,
        positions=base_positions + group_index * box_width,
        widths=box_width,
        patch_artist=True,  # filled boxes, so each gender can get its own colour
    )
    for box_patch in drawn['boxes']:
        box_patch.set_facecolor(f"C{group_index}")
    if drawn['boxes']:
        legend_handles.append(drawn['boxes'][0])

box_ax.legend(legend_handles, [f"gender = {gender}" for gender in gender_values][:len(legend_handles)])
box_ax.set_title("Screen Time Distribution by Gender")
box_ax.set_ylabel("Screen Time (hours)")
# Centre each tick label under its cluster of boxes rather than under the first box.
box_ax.set_xticks(base_positions + box_width * (len(gender_values) - 1) / 2 if gender_values else base_positions)
box_ax.set_xticklabels(screen_time_columns, rotation=45)
box_fig.tight_layout()
plt.show()

<h4><b>Additional Analysis</b></h4>

In [None]:
# Additional analysis: mean of every screen-time column within each value of 'deprived'.
print("\nAnalyzing screen time patterns based on deprivation status...")
deprivation_groups = (
    merged_data
    .groupby('deprived')[screen_time_columns]
    .mean()
)

# Show the table of group means.
print("\nMean screen time by deprivation status:")
print(deprivation_groups)

<h4><b>Visualization</b></h4>

In [None]:
# Grouped bar chart of the deprivation-status means computed in the previous cell.
# The table is transposed so that the screen-time columns run along the x-axis
# and each deprivation value is one bar series.
print("\nPlotting bar chart of average screen time by deprivation status...")
deprivation_axes = deprivation_groups.T.plot.bar(
    figsize=(10, 6), title="Average Screen Time by Deprivation Status"
)
deprivation_axes.set_ylabel("Average Screen Time (hours)")
plt.setp(deprivation_axes.get_xticklabels(), rotation=45)
deprivation_axes.figure.tight_layout()
plt.show()