In [12]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [87]:


# Function to determine if the filename corresponds to Tiktok or Video
def get_category(filename):
    """Map a data filename to its experimental group.

    The text before the first underscore is parsed as an integer id:
    odd ids map to 'Tiktok', even ids map to 'Video'.

    Raises:
        ValueError: if the leading token is not an integer.
    """
    leading_id = int(filename.partition('_')[0])
    if leading_id % 2 == 0:
        return 'Video'
    return 'Tiktok'

# Function to determine if the filename corresponds to Pre or Post exposure
def get_exposure(filename):
    """Return the exposure phase encoded in a data filename, capitalized.

    The phase is the second underscore-separated token of the name
    (e.g. '11_pre_scores.txt' -> 'Pre'). Anything from the first dot of that
    token onward is dropped, so a name whose phase token is also its last
    token ('11_pre.txt') yields 'Pre' rather than 'Pre.txt'.

    Raises:
        IndexError: if the filename contains no underscore.
    """
    phase_token = filename.split('_')[1]
    # Strip a trailing extension so the result compares equal to 'Pre'/'Post'
    return phase_token.split('.')[0].capitalize()

# Accumulate one record per non-empty score file; the DataFrame is built once at the end
data = []

# Directory where data files are stored
data_dir = 'data'

# Walk the data directory, keeping only the .txt score files
for filename in os.listdir(data_dir):
    if not filename.endswith('.txt'):
        continue
    category = get_category(filename)
    exposure = get_exposure(filename)
    # Read the whole file and trim surrounding whitespace
    with open(os.path.join(data_dir, filename), 'r') as file:
        content = file.read().strip()
    if not content:  # Nothing recorded in this file
        continue
    # The file holds three comma-separated integers: forward, backward, combined
    forward, backward, combined = map(int, content.split(','))
    data.append({
        'Category': category,
        'Exposure': exposure,
        'Forward': forward,
        'Backward': backward,
        'Combined': combined,
        'id': int(filename.split('_')[0]),
    })

# Build the DataFrame from the collected records
df = pd.DataFrame(data)

df


Unnamed: 0,Category,Exposure,Forward,Backward,Combined,id
0,Video,Post,7,3,10,10
1,Video,Pre,6,4,10,10
2,Tiktok,Post,7,4,11,11
3,Tiktok,Pre,7,6,13,11
4,Tiktok,Post,5,4,9,13
5,Tiktok,Pre,5,5,10,13
6,Video,Post,9,6,15,14
7,Video,Pre,8,7,15,14
8,Tiktok,Post,6,5,11,15
9,Tiktok,Pre,7,4,11,15


In [95]:
# Initialize dictionaries to hold the pre and post data separately
pre_data = {'Tiktok': [], 'Video': []}
post_data = {'Tiktok': [], 'Video': []}

# Iterate over files in the data directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# row order of the resulting table deterministic.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith('.txt'):
        continue
    category = get_category(filename)
    exposure = get_exposure(filename)
    # Read the contents of the file
    with open(os.path.join(data_dir, filename), 'r') as file:
        content = file.read().strip()
    if not content:  # Skip files with no recorded scores
        continue
    # Split the string by commas and convert each to an integer
    forward, backward, combined = [int(x) for x in content.split(',')]
    record = {'Forward': forward, 'Backward': backward, 'Combined': combined,
              'id': int(filename.split('_')[0])}
    # Anything that is not 'Pre' is treated as 'Post'
    target = pre_data if exposure == 'Pre' else post_data
    target[category].append(record)

# Calculate the difference between Post and Pre scores
diff_data = []
for category in ['Tiktok', 'Video']:
    # Match each Pre record to the Post record with the same id. Pairing by
    # list position would misalign every later pair as soon as one file is
    # missing or empty, and those pairs would then be dropped silently.
    post_by_id = {rec['id']: rec for rec in post_data[category]}
    for pre in pre_data[category]:
        post = post_by_id.get(pre['id'])
        if post is None:
            continue  # No Post measurement for this id
        diff_data.append({
            'Category': category,
            'Pre Forward': pre['Forward'],
            'Pre Backward': pre['Backward'],
            'Pre Combined': pre['Combined'],
            'Post Forward': post['Forward'],
            'Post Backward': post['Backward'],
            'Post Combined': post['Combined'],
            'Forward Difference': post['Forward'] - pre['Forward'],
            'Backward Difference': post['Backward'] - pre['Backward'],
            'Combined Difference': post['Combined'] - pre['Combined'],
            'id': pre['id']
        })

# Convert the list of dictionaries to a DataFrame
df_diff = pd.DataFrame(diff_data)
df_diff

Unnamed: 0,Category,Pre Forward,Pre Backward,Pre Combined,Post Forward,Post Backward,Post Combined,Forward Difference,Backward Difference,Combined Difference,id
0,Tiktok,7,6,13,7,4,11,0,-2,-2,11
1,Tiktok,5,5,10,5,4,9,0,-1,-1,13
2,Tiktok,7,4,11,6,5,11,-1,1,0,15
3,Tiktok,5,8,13,8,8,16,3,0,3,17
4,Tiktok,6,5,11,6,6,12,0,1,1,1
5,Tiktok,8,8,16,9,10,19,1,2,3,21
6,Tiktok,5,5,10,5,5,10,0,0,0,23
7,Tiktok,9,8,17,7,0,7,-2,-8,-10,25
8,Tiktok,6,4,10,5,6,11,-1,2,1,3
9,Tiktok,5,5,10,6,7,13,1,2,3,51


In [96]:
# Function to remove outliers based on IQR
def remove_outliers(data, column='Combined Difference', factor=1.5):
    """Drop rows whose `column` value lies outside the IQR fences.

    Args:
        data: DataFrame containing `column`.
        column: Name of the numeric column to screen. Defaults to
            'Combined Difference'.
        factor: Multiplier applied to the IQR to place the fences.
            Defaults to 1.5.

    Returns:
        The rows of `data` satisfying
        Q1 - factor * IQR <= value <= Q3 + factor * IQR (bounds inclusive),
        where Q1/Q3 are the 25th/75th percentiles of `column`.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    return data[(values >= lower_fence) & (values <= upper_fence)]

# Split the difference table by group and drop IQR outliers within each group.
# Both groups are selected from df_diff before it is rebound below.
tiktok_data = remove_outliers(df_diff.loc[df_diff['Category'].eq('Tiktok')])
video_data = remove_outliers(df_diff.loc[df_diff['Category'].eq('Video')])

# Filter the pooled table as well; here the fences are computed across both groups.
# NOTE: this rebinds df_diff, so re-running this cell filters already-filtered data.
df_diff = remove_outliers(df_diff)

## removes one outlier --> line 7 from tiktok

In [97]:
##Running Normality check
# Shapiro-Wilk test on the Pre, Post and Post-minus-Pre combined scores of each group

tiktok_pre_normality, tiktok_post_normality, tiktok_difference = (
    stats.shapiro(tiktok_data[column])
    for column in ('Pre Combined', 'Post Combined', 'Combined Difference')
)
video_pre_normality, video_post_normality, video_difference = (
    stats.shapiro(video_data[column])
    for column in ('Pre Combined', 'Post Combined', 'Combined Difference')
)

# Report each result (test statistic and p-value)
print(f"TikTok - Pre Normality: {tiktok_pre_normality}")
print(f"TikTok - Post Normality: {tiktok_post_normality}")
print(f"TikTok - Difference Normality: {tiktok_difference}")
print(f"Video - Pre Normality: {video_pre_normality}")
print(f"Video - Post Normality: {video_post_normality}")
print(f"Video - Difference Normality: {video_difference}")


TikTok - Pre Normality: ShapiroResult(statistic=0.8444124460220337, pvalue=0.024140695109963417)
TikTok - Post Normality: ShapiroResult(statistic=0.9006064534187317, pvalue=0.13631579279899597)
TikTok - Difference Normality: ShapiroResult(statistic=0.8928300142288208, pvalue=0.10664115101099014)
Video - Pre Normality: ShapiroResult(statistic=0.9316637516021729, pvalue=0.4279286563396454)
Video - Post Normality: ShapiroResult(statistic=0.9429026246070862, pvalue=0.5551993250846863)
Video - Difference Normality: ShapiroResult(statistic=0.9466391801834106, pvalue=0.6013633012771606)


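__The Shapiro-Wilk check yielded p values larger than 0.05 for every sample except the TikTok Pre scores (p ≈ 0.024), so normality cannot be assumed for all samples. We will therefore perform the non-parametric Mann-Whitney U test to determine if there is a significant difference between the distributions.__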

In [98]:
# Between-group comparison of the Backward Difference (Post - Pre) scores:
# TikTok group vs. Video group, using the Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(
    tiktok_data.loc[:, 'Backward Difference'],
    video_data.loc[:, 'Backward Difference'],
)

# Report the U statistic together with its p-value
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 66.0, p-value: 0.7648400798267972


In [99]:
# Between-group comparison of the Forward Difference (Post - Pre) scores:
# TikTok group vs. Video group, using the Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(
    tiktok_data.loc[:, 'Forward Difference'],
    video_data.loc[:, 'Forward Difference'],
)

# Report the U statistic together with its p-value
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 66.5, p-value: 0.787521483658513


In [100]:
# Between-group comparison of the Combined Difference (Post - Pre) scores:
# TikTok group vs. Video group, using the Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(
    tiktok_data.loc[:, 'Combined Difference'],
    video_data.loc[:, 'Combined Difference'],
)

# Report the U statistic together with its p-value
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 66.0, p-value: 0.7684581670627361


__The p values are all larger than 0.05, so we find no significant difference between the TikTok and Video groups in the distributions of their Post − Pre scores.__

__Below we examine whether there is a significant difference due to the 'practice effect'. This is done by comparing the distributions of the Pre and Post scores.__

In [101]:
# Practice effect on the Combined score: Pre vs. Post, Mann-Whitney U test.
# NOTE(review): Pre and Post values in df_diff are paired by id, whereas the
# Mann-Whitney U test assumes independent samples. A paired test such as
# stats.wilcoxon may be more appropriate -- confirm before relying on this p-value.
u_stat, p_value = stats.mannwhitneyu(
    df_diff.loc[:, 'Pre Combined'],
    df_diff.loc[:, 'Post Combined'],
)

# Report the U statistic together with its p-value
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 202.5, p-value: 0.07621307075645634


In [102]:
# Practice effect on the Backward score: Pre vs. Post, Mann-Whitney U test.
# NOTE(review): Pre and Post values in df_diff are paired by id, whereas the
# Mann-Whitney U test assumes independent samples. A paired test such as
# stats.wilcoxon may be more appropriate -- confirm before relying on this p-value.
u_stat, p_value = stats.mannwhitneyu(
    df_diff.loc[:, 'Pre Backward'],
    df_diff.loc[:, 'Post Backward'],
)

# Report the U statistic together with its p-value
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 212.0, p-value: 0.11087785863055066


In [103]:
# Practice effect on the Forward score: Pre vs. Post, Mann-Whitney U test.
# NOTE(review): Pre and Post values in df_diff are paired by id, whereas the
# Mann-Whitney U test assumes independent samples. A paired test such as
# stats.wilcoxon may be more appropriate -- confirm before relying on this p-value.
u_stat, p_value = stats.mannwhitneyu(
    df_diff.loc[:, 'Pre Forward'],
    df_diff.loc[:, 'Post Forward'],
)

# Report the U statistic together with its p-value
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 218.5, p-value: 0.14607532366043544


__The p values are all larger than 0.05, so we find no significant difference between the Pre and Post distributions, i.e. no detectable practice effect.__