In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [2]:


# Function to determine if the filename corresponds to Tiktok or Video
def get_category(filename):
    number_part = int(filename.split('_')[0])
    return 'Tiktok' if number_part % 2 != 0 else 'Video'

# Function to determine if the filename corresponds to Pre or Post exposure
def get_exposure(filename):
    return filename.split('_')[1].capitalize()

# Initialize a DataFrame to hold the data
data = []

# Directory where data files are stored
data_dir = 'data'

# Iterate over files in the data directory
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        category = get_category(filename)
        exposure = get_exposure(filename)
        # Read the contents of the file
        with open(os.path.join(data_dir, filename), 'r') as file:
            content = file.read().strip()
            if content:  # Check if content is not empty
                # Split the string by commas and convert each to an integer
                forward, backward, combined = [int(x) for x in content.split(',')]
                # Append the numbers to the data list
                data.append({'Category': category, 'Exposure': exposure,
                             'Forward': forward, 'Backward': backward, 'Combined': combined,'id':int(filename.split('_')[0])})

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data).set_index('id').sort_values(by='id', ascending=True)
df


Unnamed: 0_level_0,Category,Exposure,Forward,Backward,Combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Tiktok,Post,6,6,12
1,Tiktok,Pre,6,5,11
2,Video,Pre,3,5,8
2,Video,Post,7,6,13
3,Tiktok,Post,5,6,11
3,Tiktok,Pre,6,4,10
4,Video,Post,7,6,13
4,Video,Pre,6,5,11
5,Tiktok,Post,8,4,12
5,Tiktok,Pre,6,4,10


In [6]:
# Initialize dictionaries to hold the pre and post data separately
pre_data = {'Tiktok': [], 'Video': []}
post_data = {'Tiktok': [], 'Video': []}

# Iterate over files in the data directory
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        category = get_category(filename)
        exposure = get_exposure(filename)
        # Read the contents of the file
        with open(os.path.join(data_dir, filename), 'r') as file:
            content = file.read().strip()
            if content:  # Check if content is not empty
                # Split the string by commas and convert each to an integer
                forward, backward, combined = [int(x) for x in content.split(',')]
                # Append the numbers to the appropriate list
                if exposure == 'Pre':
                    pre_data[category].append({'Forward': forward, 'Backward': backward, 'Combined': combined, 'id': int(filename.split('_')[0])})
                else:  # Exposure is 'Post'
                    post_data[category].append({'Forward': forward, 'Backward': backward, 'Combined': combined, 'id': int(filename.split('_')[0])})
# Calculate the difference between Post and Pre scores
diff_data = []
for category in ['Tiktok', 'Video']:
    for pre, post in zip(pre_data[category], post_data[category]):
        if pre['id']==post['id']:
            diff_data.append({
                'Category': category,
                'Pre Forward': pre['Forward'],
                'Pre Backward': pre['Backward'],
                'Pre Combined': pre['Combined'],
                'Post Forward': post['Forward'],
                'Post Backward': post['Backward'],
                'Post Combined': post['Combined'],
                'Forward Difference': post['Forward'] - pre['Forward'],
                'Backward Difference': post['Backward'] - pre['Backward'],
                'Combined Difference': post['Combined'] - pre['Combined'],
                'id':pre['id']
            })

# Convert the list of dictionaries to a DataFrame
df_diff = pd.DataFrame(diff_data)

df_diff


Unnamed: 0,Category,Pre Forward,Pre Backward,Pre Combined,Post Forward,Post Backward,Post Combined,Forward Difference,Backward Difference,Combined Difference,id
0,Video,6,4,10,7,3,10,1,-1,0,10


In [9]:
# Initialize DataFrames to hold the pre and post data separately
df_pre = pd.DataFrame(pre_data['Tiktok'] + pre_data['Video']).set_index('id')
df_post = pd.DataFrame(post_data['Tiktok'] + post_data['Video']).set_index('id')

# Merge the pre and post DataFrames on 'id'
df_diff = pd.merge(df_pre, df_post, how='inner', on='id', suffixes=('_pre', '_post'))

# Add the 'Category' column from the original df DataFrame to df_diff

# Calculate the difference between Post and Pre scores
df_diff['Forward Difference'] = df_diff['Forward_post'] - df_diff['Forward_pre']
df_diff['Backward Difference'] = df_diff['Backward_post'] - df_diff['Backward_pre']
df_diff['Combined Difference'] = df_diff['Combined_post'] - df_diff['Combined_pre']

# Reset the index to make 'id' a regular column
df_diff.reset_index(inplace=True)

# Sort the DataFrame by 'id' in ascending order
df_diff = df_diff.sort_values(by='id', ascending=True)
# Add a new column based on the parity of 'id'
df_diff['Category'] = df_diff['id'] % 2 == 0  # True for even, False for odd
df_diff['Category'] = df_diff['Category'].map({True: 'Video', False: 'TikTok'})

df_diff = df_diff.reset_index(drop=True)

# saving the dataframe
df_diff.to_csv('df_diff.csv', header=True, index=False)

# Display the DataFrame
df_diff

Unnamed: 0,id,Forward_pre,Backward_pre,Combined_pre,Forward_post,Backward_post,Combined_post,Forward Difference,Backward Difference,Combined Difference,Category
7,1,6,5,11,6,6,12,0,1,1,TikTok
16,2,3,5,8,7,6,13,4,1,5,Video
3,3,6,4,10,5,6,11,-1,2,1,TikTok
18,4,6,5,11,7,6,13,1,1,2,Video
0,5,6,4,10,8,4,12,2,0,2,TikTok
23,6,4,3,7,5,5,10,1,2,3,Video
12,7,4,7,11,6,8,14,2,1,3,TikTok
6,9,7,5,12,7,7,14,0,2,2,TikTok
19,10,6,4,10,7,3,10,1,-1,0,Video
2,11,7,6,13,7,4,11,0,-2,-2,TikTok


In [96]:
# Function to remove outliers based on IQR
def remove_outliers(data):
    Q1 = data['Combined Difference'].quantile(0.25)
    Q3 = data['Combined Difference'].quantile(0.75)
    IQR = Q3 - Q1
    return data[(data['Combined Difference'] >= Q1 - 1.5 * IQR) & (data['Combined Difference'] <= Q3 + 1.5 * IQR)]

tiktok_data = df_diff[df_diff['Category'] == 'Tiktok']
video_data =  df_diff[df_diff['Category'] == 'Video']

# Remove outliers
tiktok_data = remove_outliers(tiktok_data)
video_data = remove_outliers(video_data)
df_diff = remove_outliers(df_diff)

## removes one outlier --> line 7 from tiktok

In [97]:
##Running Normality check

tiktok_pre_normality = stats.shapiro(tiktok_data['Pre Combined'])
tiktok_post_normality = stats.shapiro(tiktok_data['Post Combined'])
tiktok_difference = stats.shapiro(tiktok_data['Combined Difference'])
video_pre_normality = stats.shapiro(video_data['Pre Combined'])
video_post_normality = stats.shapiro(video_data['Post Combined'])
video_difference = stats.shapiro(video_data['Combined Difference'])

print(f"TikTok - Pre Normality: {tiktok_pre_normality}")
print(f"TikTok - Post Normality: {tiktok_post_normality}")
print(f"TikTok - Difference Normality: {tiktok_difference}")
print(f"Video - Pre Normality: {video_pre_normality}")
print(f"Video - Post Normality: {video_post_normality}")
print(f"Video - Difference Normality: {video_difference}")


TikTok - Pre Normality: ShapiroResult(statistic=0.8444124460220337, pvalue=0.024140695109963417)
TikTok - Post Normality: ShapiroResult(statistic=0.9006064534187317, pvalue=0.13631579279899597)
TikTok - Difference Normality: ShapiroResult(statistic=0.8928300142288208, pvalue=0.10664115101099014)
Video - Pre Normality: ShapiroResult(statistic=0.9316637516021729, pvalue=0.4279286563396454)
Video - Post Normality: ShapiroResult(statistic=0.9429026246070862, pvalue=0.5551993250846863)
Video - Difference Normality: ShapiroResult(statistic=0.9466391801834106, pvalue=0.6013633012771606)


__The normality check yielded p values larger than 0.05 so we will perform the Mann-Whitney U test to determine if there is significant difference between the distributions.__

In [98]:
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(tiktok_data['Backward Difference'], video_data['Backward Difference'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 66.0, p-value: 0.7648400798267972


In [99]:
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(tiktok_data['Forward Difference'], video_data['Forward Difference'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 66.5, p-value: 0.787521483658513


In [100]:
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(tiktok_data['Combined Difference'], video_data['Combined Difference'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 66.0, p-value: 0.7684581670627361


__The p value is larger than 0.05, so there is no significant difference in the distributions of the tiktok vs video Post-Pre scores__

__Below we will examine if there is a significant difference due to the 'practice effect'. This is done by checking the distributions of Pre and Post__

In [101]:
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(df_diff['Pre Combined'], df_diff['Post Combined'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 202.5, p-value: 0.07621307075645634


In [102]:
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(df_diff['Pre Backward'], df_diff['Post Backward'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 212.0, p-value: 0.11087785863055066


In [103]:
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(df_diff['Pre Forward'], df_diff['Post Forward'])

# Print Mann-Whitney U test results
print(f"Mann-Whitney U statistic: {u_stat}, p-value: {p_value}")

Mann-Whitney U statistic: 218.5, p-value: 0.14607532366043544


__The p values are larger than 0.05 so there is no significant difference in the distributions due to the practice effect__