In [99]:
import pandas as pd
%run ./text_preprocessing.py

df_mleo = pd.read_csv('../data/instagram/mleo/instagram_data_2024_batch-2_mario.csv')
df_apify = pd.read_csv('../data/instagram/apify/all_posts_eredivisie_players_full.csv')
df_apify_content_type = pd.read_csv('../data/instagram/apify/content_type.csv')


df_mleo['Post Created Datetime'] = pd.to_datetime(df_mleo['Post Created Date'] + ' ' + df_mleo['Post Created Time'])
df_mleo['Year'] = df_mleo['Post Created Datetime'].dt.year
df_mleo['Week'] = df_mleo['Post Created Datetime'].dt.isocalendar().week
df_mleo['Post Created Datetime'] = df_mleo['Post Created Datetime'].dt.tz_localize('UTC')

df_apify['ownerUsername'] = df_apify['ownerUsername'].str.lower()
df_mleo['User Name'] = df_mleo['User Name'].str.lower()

df_apify_content_type['timestamp'] = pd.to_datetime(df_apify_content_type['timestamp'])
df_apify['timestamp'] = pd.to_datetime(df_apify['timestamp'])
df_apify['Year'] = df_apify['timestamp'].dt.year
df_apify['Week'] = df_apify['timestamp'].dt.isocalendar().week
df_mleo['Post Created Datetime'] = pd.to_datetime(df_mleo['Post Created Datetime'])

df_apify['date'] = df_apify['timestamp'].dt.date
df_mleo['date'] = df_mleo['Post Created Datetime'].dt.date

df_apify['hour'] = df_apify['timestamp'].dt.hour
df_mleo['hour'] = df_mleo['Post Created Datetime'].dt.hour

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jonathan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jonathan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jonathan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [100]:
df = df_apify.merge(df_mleo[['User Name', 'date', 'Followers at Posting', 'Total Interactions',	'Likes', 'Comments']], 
                    left_on=['ownerUsername', 'date'], 
                    right_on=['User Name', 'date'], 
                    how='left')

df = df.merge(df_apify_content_type, left_on=['ownerUsername', 'timestamp'], right_on=['ownerUsername', 'timestamp'])
df.drop_duplicates(subset=['ownerUsername', 'timestamp'], inplace=True)
df = pd.get_dummies(df, columns=['type'])
df = df.drop(columns=['User Name'])

In [101]:
df['caption'] = df['caption'].astype(str)
df['cleaned_caption'] = df['caption'].apply(deemojize)
df['caption_lang'] = df['caption'].apply(detect_language)
df['caption_preprocessed'] = preprocess_texts(df, text_column='cleaned_caption', lang_column='caption_lang')

# Categorization

In [102]:
from bertopic import BERTopic

# Load the trained model
topic_model = BERTopic.load("./model_training_on_big_data/bert_topic_model_multilingual_basic")

In [103]:
df = df.reset_index(drop=True)
topics, probs = topic_model.transform(df['caption_preprocessed'])

print(f'Probs shape: {probs.shape}')

Probs shape: (32583, 19)


In [104]:
# Create a new DataFrame for the similarity scores
if probs.size > 0:  # Ensure probs is not empty
    num_topics = probs.shape[1]
    prob_df = pd.DataFrame(probs, columns=[f'topic_{topic_id}_similarity' for topic_id in range(num_topics)])
    
    # Add 'PlayerName', 'Week', and 'Year' columns from the original DataFrame
    prob_df['ownerUsername'] = df['ownerUsername']
    prob_df['Week'] = df['Week']
    prob_df['Year'] = df['Year']
else:
    print("No probabilities found. Check the transform method output.")

In [105]:
import numpy as np

topic_cols = [col for col in prob_df.columns if 'topic_' in col]

# Combine the topic similarity scores into one array per row
prob_df['topic_similarity_array'] = prob_df[topic_cols].apply(lambda row: row.values, axis=1)

# Keep only the necessary columns
similarity_df = prob_df[['Year', 'Week', 'ownerUsername', 'topic_similarity_array']]

# Convert the 'topic_similarity_array' column to a 2D numpy array
# similarity_df['topic_similarity_array'] = similarity_df['topic_similarity_array'].apply(np.array)

# Calculate the average topic similarity array scores per week for each year
average_scores_per_week_year = similarity_df.groupby(['Week', 'Year'])['topic_similarity_array'].apply(lambda x: np.mean(np.vstack(x.values), axis=0))

In [106]:
# Map the average scores to the corresponding rows in final_df
similarity_df['average_scores'] = similarity_df.set_index(['Week', 'Year']).index.map(average_scores_per_week_year.get)

# Calculate the difference between 'topic_similarity_array' and 'average_scores'
# similarity_df['difference'] = similarity_df.apply(lambda row: row['topic_similarity_array'] - row['average_scores'], axis=1)

# df[(df['ownerUsername'] == '0ratmangoen') & (df['Week'] == 11)]

In [110]:
similarity_df.iloc[1]['average_scores']

array([4.08847614e-01, 2.77244999e-02, 1.37504836e-02, 1.20121470e-02,
       1.22208337e-02, 1.93715434e-03, 1.57876859e-02, 1.68991568e-03,
       1.74036555e-03, 1.08027390e-02, 7.58570126e-04, 7.05194896e-03,
       7.76922364e-03, 2.37669029e-03, 6.72650513e-04, 2.36965671e-04,
       1.16577648e-04, 4.38539628e-04, 1.44815513e-04])

In [111]:
# Calculate cosine similarity for each row
cosine_similarities = []
for topic_array, avg_score in zip(similarity_df['topic_similarity_array'], similarity_df['average_scores']):
    if topic_array.size and avg_score.size:  # Ensure arrays are not empty
        topic_array = topic_array.reshape(1, -1)  # Reshape to 1 row, N columns
        avg_score = avg_score.reshape(1, -1)  # Reshape to 1 row, N columns
        cosine_sim = cosine_similarity(topic_array, avg_score).item()
        cosine_similarities.append(cosine_sim)
    else:
        cosine_similarities.append(np.nan)

# Subtract 1 from each cosine similarity value
cosine_distance = [1-sim if sim is not np.nan else np.nan for sim in cosine_similarities]

# Add the calculated similarities back to the DataFrame (optional)
similarity_df['cosine_distance'] = cosine_distance


In [112]:
similarity_df

Unnamed: 0,Year,Week,ownerUsername,topic_similarity_array,average_scores,cosine_distance
0,2024,22,noano,"[0.30446264884302515, 0.010602229775699886, 0....","[0.3274152726589983, 0.022669043287150043, 0.0...",0.012793
1,2024,19,noano,"[0.9167311449792578, 0.0009866038711666813, 0....","[0.4088476144482369, 0.027724499898118576, 0.0...",0.005022
2,2024,18,noano,"[1.128089942211288e-07, 0.9431535668166356, 1....","[0.4261447762275298, 0.046599057980891, 0.0427...",0.892304
3,2024,19,noano,"[0.3281896193920579, 0.08963306182877266, 0.01...","[0.4088476144482369, 0.027724499898118576, 0.0...",0.021282
4,2024,4,noano,"[0.9297654863449322, 7.332657409632576e-12, 1....","[0.45226287042567476, 0.023060784696701385, 0....",0.004752
...,...,...,...,...,...,...
32578,2018,41,bramnuytinck,"[0.999999999434402, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[0.45619812760796374, 0.02045806975277169, 0.0...",0.007430
32579,2020,22,ramalho92,"[0.10795658646409725, 0.0038688451331822674, 0...","[0.40487904271476033, 0.022693526089088815, 0....",0.004628
32580,2018,43,jaspercillessen,"[0.35562540677589494, 0.0028454390237822964, 0...","[0.43098721724559397, 0.02046160697239818, 0.0...",0.004236
32581,2020,35,j.schendelaar,"[0.3643514281576618, 0.011508698223556253, 0.0...","[0.44235492743140054, 0.028538437795270204, 0....",0.004387


In [113]:
# Check the dataframe
# final_df[(final_df['Year'] == 2024) & (final_df['Week'] == 22)]

# final_df.to_csv('../../data/instagram/apify/bertopic_similarity_scores.csv', index=False)
type_columns = [col for col in df.columns if col.startswith('type_')]
aggregation_dict = {
    'Total Interactions': 'mean',
    'likesCount': 'mean',
    'commentsCount': 'mean',
    'Followers at Posting': 'mean',
    'caption': 'size',
    'type_Image': 'sum',
    'type_Sidecar': 'sum',
    'type_Video': 'sum',
}

agg_metrics_df = df.groupby(['ownerUsername', 'ownerFullName', 'Week', 'Year']).agg(aggregation_dict).reset_index()
agg_metrics_df.rename(columns={'caption': 'postCount'}, inplace=True)
agg_metrics_df

Unnamed: 0,ownerUsername,ownerFullName,Week,Year,Total Interactions,likesCount,commentsCount,Followers at Posting,postCount,type_Image,type_Sidecar,type_Video
0,0ratmangoen,Ragnar Oratmangoen,5,2024,61454.500000,61922.500000,989.000000,95659.000000,2,0,2,0
1,0ratmangoen,Ragnar Oratmangoen,8,2024,41875.000000,42614.000000,524.000000,108876.000000,1,1,0,0
2,0ratmangoen,Ragnar Oratmangoen,11,2024,258701.333333,259454.333333,3070.666667,152131.666667,3,1,2,0
3,0ratmangoen,Ragnar Oratmangoen,12,2024,508300.000000,498108.000000,17419.000000,182912.000000,1,0,1,0
4,0ratmangoen,Ragnar Oratmangoen,13,2024,424572.000000,678407.000000,17532.500000,554931.000000,2,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
23556,zzakariab,Zakaria Bakkali,51,2017,,38.000000,0.000000,,1,1,0,0
23557,zzakariab,Zakaria Bakkali,52,2020,,43.000000,0.000000,,1,1,0,0
23558,zzakariab,Zakaria Bakkali,52,2021,,78.000000,0.000000,,1,0,0,1
23559,zzakariab,Zakaria Bakkali,52,2023,,62.500000,2.000000,,2,0,1,1


In [114]:
df_metrics = agg_metrics_df.merge(similarity_df, on=['Year', 'Week', 'ownerUsername'], how='left')
df_metrics.drop_duplicates(subset=['ownerUsername', 'Week', 'Year'], inplace=True)

df_metrics.to_csv('../data/final/social_media_data_per_week_per_player.csv', index=False)