In [None]:
import pickle
import pandas as pd
from tqdm import tqdm

In [None]:
# Load the corpus DataFrame from a gzip-compressed pickle. Later cells read the
# 'user_id' and 'documents' columns of every row.
df = pd.read_pickle(
    '../data/reddit_corpus_balanced_filtered.gzip',
    compression='gzip',
)

In [None]:
# Per-document feature values, indexed later as embeddings[user_id][doc_id][feature].
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# files produced by a trusted pipeline.
with open('../data/linguistic_features.pickel', 'rb') as feature_file:
    embeddings = pickle.load(feature_file)

In [None]:
# Inclusive (year, month) bounds of the observation window.
WINDOW_START = (2020, 1)
WINDOW_END = (2021, 4)

# temporal_embeddings[user_id]['<year>-<month>'][feature] -> value summed over
# all of that user's in-window documents posted in that calendar month.
temporal_embeddings = {}

for _, data in tqdm(df.iterrows(), desc='Creating Temporal Splits', total=len(df)):
    user_id = data['user_id']
    documents = data['documents']

    # Every user gets an entry, even if none of their documents fall in the window.
    monthly_totals = {}
    temporal_embeddings[user_id] = monthly_totals

    for doc_id, text, date, sub_reddit, labels in documents:

        # Compare (year, month) as a tuple so that everything after the end of
        # the window is dropped, including later years. The previous condition
        # only excluded May-December 2021 and let 2022+ documents through.
        if not WINDOW_START <= (date.year, date.month) <= WINDOW_END:
            continue

        # The month key is intentionally not zero-padded (e.g. '2020-1'): it is
        # the key format written to the output file.
        month = str(date.year) + '-' + str(date.month)

        totals = monthly_totals.setdefault(month, {})
        doc_features = embeddings[user_id][doc_id]
        for name in doc_features:
            totals[name] = totals.get(name, 0) + doc_features[name]

In [None]:
# temporal_vectors[user][month] -> list of per-token feature rates, one entry
# per name in `features`, in that order.
temporal_vectors = {}

# Fixed feature order: position k of every vector holds the value for features[k].
# NOTE(review): these look like part-of-speech tag names; confirm against the
# code that produced the feature file.
features = ['DET', 'NOUN', 'SCONJ', 'AUX', 'PART', 'VERB', 'PRON', 'ADJ', 'PUNCT',
            'ADP', 'PROPN', 'NUM', 'CCONJ', 'ADV', 'SPACE', 'SYM', 'INTJ', 'X']

for user, months in temporal_embeddings.items():

    temporal_vectors[user] = {}

    for month, totals in months.items():

        # Normalization constant: the month's summed 'tokens' value.
        # A missing 'tokens' key still raises KeyError, as before.
        token_total = totals['tokens']

        # Preparing vector; 0 as default value for features that never occurred.
        vector = [0]*len(features)

        # Guard against a zero token total: dividing would raise
        # ZeroDivisionError, so such a month keeps the all-zero vector.
        if token_total:
            for ind, feature in enumerate(features):
                if feature in totals:
                    vector[ind] = totals[feature]/token_total

        temporal_vectors[user][month] = vector

In [None]:
# Persist the per-user, per-month feature vectors for later use.
with open('../data/linguistic_features_temporal.pickel', 'wb') as out_file:
    pickle.dump(temporal_vectors, out_file)