In [None]:
import pandas as pd
import spacy
from tqdm import tqdm

# Disable only the components that POS tagging does not depend on.
# `tok2vec` and `attribute_ruler` must stay enabled: in spaCy v3 English
# pipelines the tagger reads its features from the shared `tok2vec`
# component, and `attribute_ruler` is what maps `token.tag` to `token.pos`.
# With either one disabled, `token.pos_` comes back empty or meaningless.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'lemmatizer', 'senter'])

In [None]:
# Load the gzip-compressed, pickled corpus. The feature loop further down
# reads the 'user_id' and 'documents' fields of every row.
# NOTE(review): unpickling executes arbitrary code, so only load this file
# if it comes from a trusted source.
df = pd.read_pickle(
    '../data/reddit_corpus_balanced_filtered.gzip',
    compression='gzip',
)

In [None]:
%%time
user_features = {}
for row in tqdm(df.iterrows(), desc='Evaluating Named Entities', total=len(df)):
    data = row[1]
    documents = data['documents']
    user_id = data['user_id']
    
    user_features[user_id] = {'all': {}}
    
    for doc_id, text, date, sub_reddit, labels in documents:
        
        doc = nlp(text)
        
        user_features[user_id][doc_id] = {'tokens': len(doc)}
        for token in doc:
            if token.pos_ not in user_features[user_id][doc_id]:
                user_features[user_id][doc_id][token.pos_] = 0
            user_features[user_id][doc_id][token.pos_] += 1
            
        for pos in user_features[user_id][doc_id]:
            if pos not in user_features[user_id]['all']:
                user_features[user_id]['all'][pos] = 0
            user_features[user_id]['all'][pos] += user_features[user_id][doc_id][pos]

user_features

In [None]:
import pickle

# Persist the computed feature dictionary for later use.
# NOTE(review): the extension is spelled '.pickel'; it is kept as-is because
# other code may read the file under exactly this name.
with open('../data/linguistic_features.pickel', 'wb') as out_file:
    pickle.dump(user_features, out_file)