In [12]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import numpy as np

In [2]:
# Function to determine if the filename corresponds to Tiktok or Video
def get_category(filename):
    number_part = int(filename.split('_')[0])
    return 'Tiktok' if number_part % 2 != 0 else 'Video'

# Function to determine if the filename corresponds to Pre or Post exposure
def get_exposure(filename):
    return filename.split('_')[1].capitalize()

# Initialize a DataFrame to hold the data
data = []

# Directory where data files are stored
data_dir = 'data'

# Iterate over files in the data directory
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        category = get_category(filename)
        exposure = get_exposure(filename)
        # Read the contents of the file
        with open(os.path.join(data_dir, filename), 'r') as file:
            content = file.read().strip()
            if content:  # Check if content is not empty
                # Split the string by commas and convert each to an integer
                forward, backward, combined = [int(x) for x in content.split(',')]
                # Append the numbers to the data list
                data.append({'Category': category, 'Exposure': exposure,
                             'Forward': forward, 'Backward': backward, 'Combined': combined,'id':int(filename.split('_')[0])})

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data)

df

Unnamed: 0,Category,Exposure,Forward,Backward,Combined,id
0,Video,Post,7,3,10,10
1,Video,Pre,6,4,10,10
2,Tiktok,Post,7,4,11,11
3,Tiktok,Pre,7,6,13,11
4,Tiktok,Post,5,4,9,13
5,Tiktok,Pre,5,5,10,13
6,Video,Post,9,6,15,14
7,Video,Pre,8,7,15,14
8,Tiktok,Post,6,5,11,15
9,Tiktok,Pre,7,4,11,15


In [3]:
# Initialize dictionaries to hold the pre and post data separately
pre_data = {'Tiktok': [], 'Video': []}
post_data = {'Tiktok': [], 'Video': []}

# Iterate over files in the data directory
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        category = get_category(filename)
        exposure = get_exposure(filename)
        # Read the contents of the file
        with open(os.path.join(data_dir, filename), 'r') as file:
            content = file.read().strip()
            if content:  # Check if content is not empty
                # Split the string by commas and convert each to an integer
                forward, backward, combined = [int(x) for x in content.split(',')]
                # Append the numbers to the appropriate list
                if exposure == 'Pre':
                    pre_data[category].append({'Forward': forward, 'Backward': backward, 'Combined': combined, 'id': int(filename.split('_')[0])})
                else:  # Exposure is 'Post'
                    post_data[category].append({'Forward': forward, 'Backward': backward, 'Combined': combined, 'id': int(filename.split('_')[0])})
# Calculate the difference between Post and Pre scores
diff_data = []
for category in ['Tiktok', 'Video']:
    for pre, post in zip(pre_data[category], post_data[category]):
        if pre['id']==post['id']:
            diff_data.append({
                'Category': category,
                'Pre Forward': pre['Forward'],
                'Pre Backward': pre['Backward'],
                'Pre Combined': pre['Combined'],
                'Post Forward': post['Forward'],
                'Post Backward': post['Backward'],
                'Post Combined': post['Combined'],
                'Forward Difference': post['Forward'] - pre['Forward'],
                'Backward Difference': post['Backward'] - pre['Backward'],
                'Combined Difference': post['Combined'] - pre['Combined'],
                'id':pre['id']
            })

# Convert the list of dictionaries to a DataFrame
df_diff = pd.DataFrame(diff_data)
df_diff
df_with_outlier=df_diff
df_with_outlier

Unnamed: 0,Category,Pre Forward,Pre Backward,Pre Combined,Post Forward,Post Backward,Post Combined,Forward Difference,Backward Difference,Combined Difference,id
0,Tiktok,7,6,13,7,4,11,0,-2,-2,11
1,Tiktok,5,5,10,5,4,9,0,-1,-1,13
2,Tiktok,7,4,11,6,5,11,-1,1,0,15
3,Tiktok,5,8,13,8,8,16,3,0,3,17
4,Tiktok,6,5,11,6,6,12,0,1,1,1
5,Tiktok,8,8,16,9,10,19,1,2,3,21
6,Tiktok,5,5,10,5,5,10,0,0,0,23
7,Tiktok,9,8,17,7,0,7,-2,-8,-10,25
8,Tiktok,6,4,10,5,6,11,-1,2,1,3
9,Tiktok,5,5,10,6,7,13,1,2,3,51


In [5]:
## WE WILL NOT USE THIS FUNCTION, SINCE WE CHOSE NOT TO REMOVE OUTLIERS
# Function to remove outliers based on IQR
def remove_outliers(data):
    Q1 = data['Combined Difference'].quantile(0.25)
    Q3 = data['Combined Difference'].quantile(0.75)
    IQR = Q3 - Q1
    return data[(data['Combined Difference'] >= Q1 - 1.5 * IQR) & (data['Combined Difference'] <= Q3 + 1.5 * IQR)]

tiktok_data = df_diff[df_diff['Category'] == 'Tiktok']
video_data =  df_diff[df_diff['Category'] == 'Video']

In [28]:
##Running Normality check

tiktok_pre_normality = stats.shapiro(tiktok_data['Pre Combined'])
tiktok_post_normality = stats.shapiro(tiktok_data['Post Combined'])
tiktok_difference = stats.shapiro(tiktok_data['Combined Difference'])
video_pre_normality = stats.shapiro(video_data['Pre Combined'])
video_post_normality = stats.shapiro(video_data['Post Combined'])
video_difference = stats.shapiro(video_data['Combined Difference'])
all_participant_pre = stats.shapiro(df_diff['Pre Combined'])
all_participant_post = stats.shapiro(df_diff['Post Combined'])
all_participant_pre_fo = stats.shapiro(df_diff['Pre Forward'])
all_participant_post_fo = stats.shapiro(df_diff['Post Forward'])
all_participant_pre_ba = stats.shapiro(df_diff['Pre Backward'])
all_participant_post_ba = stats.shapiro(df_diff['Post Backward'])


print(f"TikTok - Pre  combined Normality: {tiktok_pre_normality}")
print(f"TikTok - Post  combined Normality: {tiktok_post_normality}")
print(f"TikTok - Difference Normality: {tiktok_difference}")
print(f"Video - Pre combined Normality: {video_pre_normality}")
print(f"Video - Post combined Normality: {video_post_normality}")
print(f"Video - Difference Normality: {video_difference}")
print(f"All Prticipants pre test combined Normality: {all_participant_pre}")
print(f"All Prticipants post test combined Normality: {all_participant_post}")
print(f"All Prticipants pre test forward Normality: {all_participant_pre_fo}")
print(f"All Prticipants post test forward Normality: {all_participant_post_fo}")
print(f"All Prticipants pre test backward Normality: {all_participant_pre_ba}")
print(f"All Prticipants post test backward Normality: {all_participant_post_ba}")


TikTok - Pre  combined Normality: ShapiroResult(statistic=0.8291043639183044, pvalue=0.011699498631060123)
TikTok - Post  combined Normality: ShapiroResult(statistic=0.9514713883399963, pvalue=0.583775520324707)
TikTok - Difference Normality: ShapiroResult(statistic=0.7075890302658081, pvalue=0.0004432534915395081)
Video - Pre combined Normality: ShapiroResult(statistic=0.9316637516021729, pvalue=0.4279286563396454)
Video - Post combined Normality: ShapiroResult(statistic=0.9429026246070862, pvalue=0.5551993250846863)
Video - Difference Normality: ShapiroResult(statistic=0.9466391801834106, pvalue=0.6013633012771606)
All Prticipants pre test combined Normality: ShapiroResult(statistic=0.9413565993309021, pvalue=0.15909923613071442)
All Prticipants post test combined Normality: ShapiroResult(statistic=0.9628242254257202, pvalue=0.4735436737537384)
All Prticipants pre test forward Normality: ShapiroResult(statistic=0.9649890065193176, pvalue=0.5224411487579346)
All Prticipants post test 

The normality check yielded one p value smaller than 0.05 so we will perform the Mann-Whitney U test __to the tiktok vs video differences__ to determine if there is significant difference between the distributions

In [9]:
##appendix
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(tiktok_data['Backward Difference'], video_data['Backward Difference'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 66.0, p-value: 0.5543111254001669


In [10]:
#appendix
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(tiktok_data['Forward Difference'], video_data['Forward Difference'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 67.0, p-value: 0.5921995253203276


In [17]:
# Mann-Whitney U test for combined differences (IN THE REPORT)
u_stat, p_value = stats.mannwhitneyu(tiktok_data['Combined Difference'], video_data['Combined Difference'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")
print(f"Tiktok mean: {np.mean(tiktok_data['Combined Difference'])}, Control mean: {np.mean(video_data['Combined Difference'])}")
print(f"Tiktok Standard Dev: {np.std(tiktok_data['Combined Difference'])}, Control Standard Dev: {np.std(video_data['Combined Difference'])}")

Mann-Whitney U statistic: 66.0, p-value: 0.559849572824591
Tiktok mean: 0.5, Control mean: 1.6363636363636365
Tiktok Standard Dev: 3.2895939827627014, Control Standard Dev: 1.9198829165402616


__The p value is larger than 0.05, so there is no significant difference in the distributions of the tiktok vs video Post-Pre scores__

Below we will examine if there is a significant difference due to the 'practice effect'. This is done by checking the distributions of Pre and Post with the t test because above it passed the normality check

In [20]:
# T test 
t_stat, p_value = stats.ttest_rel(df_diff['Pre Combined'], df_diff['Post Combined'])


print(f"T- statistic: {t_stat}, p-value: {p_value}")
print(f"All participants pre test mean: {np.mean(df_diff['Pre Combined'])}, All participants post test mean: {np.mean(df_diff['Post Combined'])}")
print(f"All participants pre test standard dev: {np.std(df_diff['Pre Combined'])}, All participants post test standard dev: {np.std(df_diff['Post Combined'])}")


T- statistic: -1.732050807568877, p-value: 0.09609990712790022
All participants pre test mean: 11.56, All participants post test mean: 12.56
All participants pre test standard dev: 2.43441984875247, All participants post test standard dev: 2.926841300788275


In [26]:

u_stat, p_value = stats.mannwhitneyu(df_diff['Pre Backward'], df_diff['Post Backward'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 257.0, p-value: 0.275685688480367


In [27]:

u_stat, p_value = stats.mannwhitneyu(df_diff['Pre Forward'], df_diff['Post Forward'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 248.0, p-value: 0.20484631360808192


__The p values are larger than 0.05 so there is no significant difference in the distributions due to the practice effect__

__Now we will perform a statistical analysis on the questionnaire answers. Specifically we want to see if the people who spend more than 2 hours on social media have a different distribution on their Pre-Test results, than the people who spend less than 2 hours. 
We will follow the same logic as above__

In [40]:
##Importing the data
df1 = pd.read_csv('Post Test Questionnaire (Responses) - Form Responses 1.csv')
# Removing timestamp
df1=df1.drop(columns=['Timestamp'])
df1=df1.rename(columns={"Input your test ID:": "id"})

In [41]:
df1

Unnamed: 0,1. How much time do you spend on social media daily?,2. Which platforms do you usually use?,3. Do you feel that social media affects your concentration?,id
0,more than 2 hours,"Youtube, Instagram, Facebook",Yes,15
1,more than 2 hours,"Youtube, Instagram",Maybe,5
2,more than 2 hours,"TikTok, Youtube, Instagram, X(Twitter), Facebo...",Maybe,18
3,more than 2 hours,"TikTok, Youtube, Instagram, Facebook",Yes,6
4,more than 2 hours,"TikTok, Youtube, Instagram, X(Twitter)",Yes,2
5,1-2 hours,"Instagram, X(Twitter)",Yes,1
6,more than 2 hours,"TikTok, Instagram, Facebook, Snapchat",Yes,13
7,more than 2 hours,"Youtube, Instagram, Facebook",Yes,3
8,more than 2 hours,"Youtube, Reddit",Maybe,17
9,1-2 hours,"X(Twitter), Facebook",Yes,16


In [55]:
#merging the dataframes based on id
df_merged= pd.merge(df1,df_with_outlier, how='inner', on=['id'])
df_merged=df_merged.drop(columns=['Category','Post Forward','Post Backward','Post Combined','Forward Difference','Backward Difference','Combined Difference'])

In [56]:
df_merged

Unnamed: 0,1. How much time do you spend on social media daily?,2. Which platforms do you usually use?,3. Do you feel that social media affects your concentration?,id,Pre Forward,Pre Backward,Pre Combined
0,more than 2 hours,"Youtube, Instagram, Facebook",Yes,15,7,4,11
1,more than 2 hours,"Youtube, Instagram",Maybe,5,6,4,10
2,more than 2 hours,"TikTok, Youtube, Instagram, X(Twitter), Facebo...",Maybe,18,7,6,13
3,more than 2 hours,"TikTok, Youtube, Instagram, Facebook",Yes,6,4,3,7
4,more than 2 hours,"TikTok, Youtube, Instagram, X(Twitter)",Yes,2,3,5,8
5,1-2 hours,"Instagram, X(Twitter)",Yes,1,6,5,11
6,more than 2 hours,"TikTok, Instagram, Facebook, Snapchat",Yes,13,5,5,10
7,more than 2 hours,"Youtube, Instagram, Facebook",Yes,3,6,4,10
8,more than 2 hours,"Youtube, Reddit",Maybe,17,5,8,13
9,1-2 hours,"X(Twitter), Facebook",Yes,16,5,5,10


In [70]:
more2_data = df_merged[df_merged['1. How much time do you spend on social media daily?'] == 'more than 2 hours']
less2_data =  df_merged[df_merged['1. How much time do you spend on social media daily?'] !='more than 2 hours']


In [71]:
# Function to remove outliers based on IQR
def remove_outliers1(data):
    Q1 = data['Pre Combined'].quantile(0.25)
    Q3 = data['Pre Combined'].quantile(0.75)
    IQR = Q3 - Q1
    return data[(data['Pre Combined'] >= Q1 - 1.5 * IQR) & (data['Pre Combined'] <= Q3 + 1.5 * IQR)]
print(len(less2_data))
less2_data = remove_outliers1(less2_data)
print(len(less2_data))
print(len(more2_data))
more2_data = remove_outliers1(more2_data)
len(more2_data)

13
13
12


12

__No outliers in the 'Pre Combined' Category__

In [72]:
##Running Normality check

more2_pre_normality = stats.shapiro(more2_data['Pre Combined'])
less2_pre_normality = stats.shapiro(less2_data['Pre Combined'])

print(f"More than 2 hours - Pre Normality: {more2_pre_normality}")
print(f"Less than 2 hours - Pre Normality: {less2_pre_normality}")

More than 2 hours - Pre Normality: ShapiroResult(statistic=0.9308353662490845, pvalue=0.38905051350593567)
Less than 2 hours - Pre Normality: ShapiroResult(statistic=0.8585206270217896, pvalue=0.03683377057313919)


__The normality check yielded one p value smaller than 0.05 so we will perform the Mann-Whitney U test to determine if there is significant difference between the distributions.__

In [73]:
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(more2_data['Pre Combined'], less2_data['Pre Combined'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 65.0, p-value: 0.48763852298832355


__The p value is larger than 0.05 so there is no significant difference in the distributions due to the dialy time spent on social media__