<h3><b>Load and Merge Datasets</b></h3>

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from math import sqrt

# Read the three source tables. Each one is joined on its 'ID' column below.
dataset1 = pd.read_csv('dataset1.csv')  # demographics
dataset2 = pd.read_csv('dataset2.csv')  # screen time
dataset3 = pd.read_csv('dataset3.csv')  # well-being indicators

# Chain two merges on 'ID'. pandas defaults to an inner join, so only IDs that
# appear in all three tables end up in merged_data.
merged_data = (
    dataset1
    .merge(dataset2, on='ID')
    .merge(dataset3, on='ID')
)

# Apply the seaborn "whitegrid" style to every figure drawn afterwards.
sns.set(style="whitegrid")


Defining Screen Time Columns

In [13]:
# Screen-time column names follow the pattern "<prefix>_<suffix>": one of four
# prefixes (C, G, S, T) combined with 'we' (weekend) or 'wk' (presumably weekday —
# confirm against the data dictionary).
# Resulting order: C_we, C_wk, G_we, G_wk, S_we, S_wk, T_we, T_wk
screen_time_columns = [
    f"{prefix}_{suffix}"
    for prefix in ('C', 'G', 'S', 'T')
    for suffix in ('we', 'wk')
]

Define well-being indicator columns

In [14]:
# The 14 well-being indicator column names, kept in their original order.
wellbeing_columns = (
    "Optm Usef Relx Intp Engs Dealpr Thcklr "
    "Goodme Clsep Conf Mkmind Loved Intthg Cheer"
).split()

<h3><b>Inferential Statistical Analysis</b></h3>

<h4><b>Investigation 2: Inferential Analysis for Screen Time (T_we) vs Relaxation (Relx)</b></h4>

<b>Propose:</b> Analyze the relationship between weekend screen time and relaxation levels

<b>Justify:</b> Relaxation is an important component of well-being. This analysis will help determine if higher weekend screen time impacts relaxation levels.

<b>Key Findings:</b> High screen time users report lower relaxation scores.

<h4><b>Hypothesis Test</b></h4>

<b>H0:</b> There is no significant difference between the means of weekend screen time and relaxation.

<b>H1:</b> There is a significant difference between the means of weekend screen time and relaxation.

<b>Step 1:</b> T-Test and P-Value

In [15]:
# Two-sample t-test comparing the mean of weekend screen time (T_we) with the mean
# of the relaxation score (Relx).
# nan_policy='omit' drops missing values instead of letting a single NaN turn both
# outputs into NaN. A NaN p-value would make the later `p_value_relx < 0.05` check
# evaluate to False and silently report "fail to reject".
t_stat_relx, p_value_relx = stats.ttest_ind(
    merged_data['T_we'],
    merged_data['Relx'],
    nan_policy='omit',
)

<b>Step 2:</b> Z-Score Calculation

In [16]:
# Sample mean and sample standard deviation (ddof=1) of each variable.
mean_T_we = np.mean(merged_data['T_we'])
std_T_we = np.std(merged_data['T_we'], ddof=1)
mean_Relx = np.mean(merged_data['Relx'])
std_Relx = np.std(merged_data['Relx'], ddof=1)
# Row count of merged_data, used as the size of both samples.
n = len(merged_data)

# Standard error of the difference between two sample means. Both samples
# contribute variance, so std_Relx must be included. Dividing by
# std_T_we / sqrt(n) alone understates the error and inflates the score.
std_error_diff = sqrt((std_T_we ** 2 + std_Relx ** 2) / n)
z_score_T_we_relx = (mean_T_we - mean_Relx) / std_error_diff

<b>Step 3:</b> Confidence Interval for Relx

In [17]:
confidence_level = 0.95
degrees_freedom = n - 1

# Student-t interval centred on the sample mean of Relx and scaled by the
# standard error of that mean.
standard_error_relx = std_Relx / sqrt(n)
confidence_interval_relx = stats.t.interval(
    confidence_level,
    degrees_freedom,
    loc=mean_Relx,
    scale=standard_error_relx,
)

<h3><b>Execution and Results</b></h3>

In [None]:
# Build the report line by line, then emit it with one print call.
relx_report_lines = [
    "\nT-Test Results for Screen Time (Weekend) vs Relaxation:",
    f"T-Statistic: {t_stat_relx}, P-Value: {p_value_relx}",
    f"Z-Score: {z_score_T_we_relx}",
    f"95% Confidence Interval for Relaxation: {confidence_interval_relx}",
]
print("\n".join(relx_report_lines))

<h4><b>Interpretation of Hypothesis Test</b></h4>

In [None]:
# 0.05 is the significance threshold applied to the t-test p-value.
relx_is_significant = p_value_relx < 0.05
relx_verdict = (
    "Reject the null hypothesis (H0): There is a significant difference between weekend screen time and relaxation."
    if relx_is_significant
    else "Fail to reject the null hypothesis (H0): There is no significant difference between weekend screen time and relaxation."
)
print(relx_verdict)

<h3><b>Visualization</b></h3>

In [None]:
# Scatter of Relx against T_we with a fitted regression line drawn on top.
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    x=merged_data['T_we'],
    y=merged_data['Relx'],
    scatter_kws={"color": "green"},
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_title("Screen Time (Weekend) vs Relaxation")
ax.set_xlabel("Screen Time (Weekend) in Hours")
ax.set_ylabel("Relaxation Score")
ax.grid(True)
fig.tight_layout()
plt.show()

# One line per variable, each drawn against the row position in merged_data.
row_positions = np.arange(len(merged_data))
fig, ax = plt.subplots(figsize=(10, 6))
for series_name, legend_label, line_colour in (
    ('T_we', "Screen Time (Weekend)", 'blue'),
    ('Relx', "Relaxation Score", 'green'),
):
    sns.lineplot(
        x=row_positions,
        y=merged_data[series_name],
        label=legend_label,
        color=line_colour,
        ax=ax,
    )
ax.set_title("Lineplot of Screen Time (Weekend) vs Relaxation Score", fontsize=16)
ax.set_xlabel("Index")
ax.set_ylabel("Screen Time / Relaxation")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

<h3><b>Optional Analysis</b></h3>

<h4><b>Incorporating Demographic Factors</b></h4>

<b>Propose:</b> Explore screen time patterns based on gender and deprivation status

<b>Justify:</b> Demographic factors may influence screen time behaviors, and this could provide insights into behavior patterns across groups.

<h4><b>Execution and Results</b></h4>

In [None]:
# Example demographic breakdown: average every screen-time column within each
# gender group.
print("\nAnalyzing screen time based on demographic factor: gender...")
screen_time_by_gender = merged_data.groupby('gender')[screen_time_columns]
gender_groups = screen_time_by_gender.mean()

# Show the resulting table (one row per gender value, one column per screen-time variable).
print("\nMean screen time by gender:")
print(gender_groups)

<h3><b>Visualization</b></h3>

In [None]:
# Visualization 1: grouped bars — one cluster per screen-time column, one bar per gender.
print("\nPlotting bar chart of average screen time by gender...")
mean_by_column = gender_groups.T  # transpose so the screen-time columns sit on the x-axis
ax = mean_by_column.plot(kind='bar', figsize=(10, 6), title="Average Screen Time by Gender")
ax.set_ylabel("Average Screen Time (hours)")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.figure.tight_layout()
plt.show()

# Additional Visualization: Box plot of screen time distribution by gender
# For every screen-time column, one box per gender code is drawn side by side.
# Boxes are colour-coded and listed in a legend so the two groups can be told
# apart, and each tick label is centred under its pair of boxes.
print("\nPlotting box plot of screen time distribution by gender...")
fig, ax = plt.subplots(figsize=(10, 6))
gender_codes = [0, 1]  # gender values as coded in merged_data['gender']
box_width = 0.4
base_positions = np.arange(len(screen_time_columns))
legend_handles = []
for offset_index, gender in enumerate(gender_codes):
    gender_rows = merged_data.loc[merged_data['gender'] == gender, screen_time_columns]
    drawn = ax.boxplot(
        # Drop missing values per column; a NaN would otherwise blank out that box.
        [gender_rows[column].dropna().values for column in screen_time_columns],
        positions=base_positions + offset_index * box_width,
        widths=box_width,
        patch_artist=True,  # filled boxes, so a face colour can be applied
    )
    box_colour = f"C{offset_index}"
    for box_patch in drawn['boxes']:
        box_patch.set_facecolor(box_colour)
        box_patch.set_alpha(0.6)
    legend_handles.append(drawn['boxes'][0])
ax.legend(legend_handles, [f"gender = {code}" for code in gender_codes])
ax.set_title("Screen Time Distribution by Gender")
ax.set_ylabel("Screen Time (hours)")
# Centre each label under its group of boxes rather than under the first box.
ax.set_xticks(base_positions + box_width * (len(gender_codes) - 1) / 2)
ax.set_xticklabels(screen_time_columns, rotation=45)
fig.tight_layout()
plt.show()