- Polish NLP resource list: https://github.com/ksopyla/awesome-nlp-polish
- Sentiment: https://pypi.org/project/sentimentpl/
- Autocorrect: https://github.com/filyp/autocorrect
- Other resources: https://github.com/sdadas/polish-nlp-resources
- Papers: https://homados.ipipan.waw.pl/?page_id=93

In [26]:
import pandas as pd
import numpy as np
import spacy
from sentimentpl.models import SentimentPLModel
from autocorrect import Speller
from transformers import HerbertTokenizer, RobertaModel

import re

from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [27]:
import polyglot
from polyglot.text import Text, Word

In [56]:
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer#for word embedding

import gensim
from gensim.models import Word2Vec

import unicodedata

In [29]:
# spaCy pipeline "pl_core_news_lg"; the cells below read POS tags, lemmas,
# sentence boundaries and entities from the documents it produces.
nlp_core = spacy.load("pl_core_news_lg")

In [30]:
# Sentiment scorer from sentimentpl; below it is called on plain strings and
# `.item()` is used to turn its result into a Python number.
model = SentimentPLModel(from_pretrained='latest')

In [31]:
# autocorrect speller for language code 'pl'; below, a token is counted as a
# spelling error whenever spell(token) differs from the token itself.
spell = Speller('pl')

In [74]:
# tokenizer_herbert = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
# model_roberta = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

In [33]:
# nlp_pl = spacy.load('pl_spacy_model') # or spacy.load('pl_spacy_model_morfeusz')

In [34]:
# Load the scraped Demagog data; the file uses ';' as the field separator.
df = pd.read_csv('../datasets/scrapped/demagog.csv', sep=';')
df.shape

(4898, 2)

In [35]:
# Discard rows with any missing value, then rows whose text is an empty string.
df = df.dropna().loc[lambda frame: frame['text'].str.len() > 0]
df.shape

(4896, 2)

In [60]:
def clean_przyp(txt):
    """Strip editorial markup, handles and links from one statement.

    Parameters
    ----------
    txt : str or float
        Raw statement text. NaN (a missing cell) is passed through.

    Returns
    -------
    str or float
        The cleaned text in Unicode NFKC form, or ``np.nan`` when ``txt`` is NaN.
        Removed fragments are deleted in place, so the surrounding whitespace
        is left as it was (runs of spaces may remain).
    """
    # NaN is the only value that is not equal to itself.
    if txt != txt:
        return np.nan

    # Drop elision markers first: the annotation branch below strips every
    # parenthesis, after which "(…)" could no longer be matched and the bare
    # ellipsis would stay in the text.
    txt_out = txt.replace('(…)', '')

    # "(... – przyp. Demagog)" editorial notes: keep the note's wording but
    # remove the parentheses and the attribution suffix.
    if "przyp. Demagog" in txt:
        txt_out = txt_out.replace('(', '').replace(')', '').replace(' – przyp. Demagog', '')

    # Square brackets are removed, their content is kept.
    if "[" in txt:
        txt_out = txt_out.replace('[', '').replace(']', '')

    # Remove @handles made of ASCII letters and digits.
    # NOTE(review): a handle such as "@jan_k" leaves "_k" behind because "_"
    # is outside the character class; patterns are kept unchanged here.
    txt_out = re.sub("@[A-Za-z0-9]+", "", txt_out)
    # Remove whatever still starts with "@", and http(s)/www links.
    txt_out = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", txt_out)

    # NFKC folds compatibility characters (e.g. no-break space -> space) and
    # keeps letters composed. NFKD would split Polish diacritics into a base
    # letter plus a combining mark, which changes the strings every
    # downstream tool receives.
    return unicodedata.normalize("NFKC", txt_out)

In [61]:
# Clean every statement; the result is stored next to the raw text.
df['text_clean'] = df['text'].apply(clean_przyp)

In [62]:
# Dry run of the per-text feature computation on one hand-written sample.
test_txt = "Nienawidzę tego robić, tym bardziej że robię to codziemie. Ale przynajmniej mam łatwą pracę w Google."
doc = nlp_core(test_txt)

embeddings_list = []  # never filled in this cell

# Sentiment of the whole text, then of every sentence on its own.
print(doc.text)
print(f'Sentiment: {model(doc.text).item()}\n')

sentiments_list = []
for i, sent in enumerate(doc.sents):
    sentiments_list.append(model(sent.text).item())
    print(f'Sentiment sentence {i}: {sentiments_list[-1]}')
print()
print(f'AVG sentiment of sentences: {np.mean(sentiments_list)}\n')

# Every token that is neither whitespace nor punctuation counts as a word.
words = [token for token in doc if token.pos_ not in ['SPACE', 'PUNCT']]
tokens_list = list({token.text for token in words})
lemmas_list = list({token.lemma_ for token in words})

# A word is a spelling error when the speller would change it.
error_n = sum(1 for token in words if spell(token.text) != token.text)

# Part-of-speech tallies run over all tokens of the document.
adj_n = sum(1 for token in doc if token.pos_ == 'ADJ')
adv_n = sum(1 for token in doc if token.pos_ == 'ADV')
noun_n = sum(1 for token in doc if token.pos_ == 'NOUN')

ent_n = len(doc.ents)

print('Unique words:', len(tokens_list))
print('Unique lemmas:', len(lemmas_list), '\n')
print('Errors:', error_n)
print('Named entities:', ent_n, '\n')
# Ratios are tag occurrences divided by the number of distinct words.
for label, count in (('ADJ:', adj_n), ('ADV:', adv_n), ('NOUN:', noun_n)):
    print(label, count, count/len(tokens_list))


Nienawidzę tego robić, tym bardziej że robię to codziemie. Ale przynajmniej mam łatwą pracę w Google.
Sentiment: -0.08739364892244339

Sentiment sentence 0: -0.26721104979515076
Sentiment sentence 1: 0.09078814089298248

AVG sentiment of sentences: -0.08821145445108414

Unique words: 16
Unique lemmas: 14 

Errors: 1
Named entities: 1 

ADJ: 1 0.0625
ADV: 2 0.125
NOUN: 2 0.125


In [63]:
def extract_features(txt):
    """Compute sentiment, vocabulary, spelling and part-of-speech features for one text.

    Uses the notebook-level objects ``nlp_core``, ``model`` and ``spell``.

    Parameters
    ----------
    txt : str
        Text to analyse.

    Returns
    -------
    dict
        Keys, in insertion order:

        - ``sentiment_all``: ``model`` score of the whole text.
        - ``sentiment_avg``: mean of the per-sentence scores, NaN when the
          document has no sentences.
        - ``uniq_words``: number of distinct token texts, SPACE and PUNCT
          tokens excluded.
        - ``sentiment_lemm``: number of distinct lemmas of those tokens. The
          value is a count, not a sentiment; the key is kept because callers
          rename it.
        - ``err``: number of those tokens that ``spell`` would change.
        - ``net``: number of entities in ``doc.ents``.
        - ``ADJ``, ``ADV``, ``NOUN``: occurrences of the tag divided by
          ``uniq_words``; 0.0 when the text contains no words.
    """
    doc = nlp_core(txt)

    out_dict = {'sentiment_all': model(doc.text).item()}

    sentiments_list = [model(sent.text).item() for sent in doc.sents]
    # np.mean([]) only returns NaN after emitting a RuntimeWarning; make the
    # empty case explicit instead.
    out_dict['sentiment_avg'] = np.mean(sentiments_list) if sentiments_list else np.nan

    unique_tokens = set()
    unique_lemmas = set()
    error_n = 0
    adj_n = 0
    adv_n = 0
    noun_n = 0

    for token in doc:
        if token.pos_ not in ('SPACE', 'PUNCT'):
            unique_lemmas.add(token.lemma_)
            unique_tokens.add(token.text)
            # Every occurrence the speller would change counts as one error.
            if spell(token.text) != token.text:
                error_n += 1

        if token.pos_ == 'ADJ':
            adj_n += 1
        elif token.pos_ == 'ADV':
            adv_n += 1
        elif token.pos_ == 'NOUN':
            noun_n += 1

    n_words = len(unique_tokens)

    out_dict['uniq_words'] = n_words
    out_dict['sentiment_lemm'] = len(unique_lemmas)
    out_dict['err'] = error_n
    out_dict['net'] = len(doc.ents)
    # A text made only of punctuation/whitespace has no words; report 0.0
    # instead of dividing by zero.
    out_dict['ADJ'] = adj_n / n_words if n_words else 0.0
    out_dict['ADV'] = adv_n / n_words if n_words else 0.0
    out_dict['NOUN'] = noun_n / n_words if n_words else 0.0

    return out_dict

In [64]:
# Keep only rows whose cleaned text is longer than one character.
df = df.loc[df['text_clean'].str.len().gt(1)]
df.shape

(4891, 3)

In [65]:
# One feature dict per cleaned statement, with a tqdm progress bar.
df['raw_dict'] = df['text_clean'].progress_apply(extract_features)

100%|███████████████████████████████████████| 4891/4891 [11:57<00:00,  6.82it/s]


In [71]:
# Expand each feature dict into columns; the lemma count is stored under the
# key 'sentiment_lemm' and gets its proper name here.
feature_columns = df['raw_dict'].apply(pd.Series).rename(columns={'sentiment_lemm' : 'uniq_lemm'})
df_clean = df.join(feature_columns).drop(columns='raw_dict')

In [72]:
# Persist the feature table with ';' as separator and without the index column.
df_clean.to_csv('../datasets/scrapped/demagog_features.csv', sep=';', index=False)

In [73]:
# encoded_input = tokenizer_herbert.encode(doc.text, return_tensors="pt")
# outputs = model_roberta(encoded_input)
# embeddings = outputs['pooler_output'].detach().numpy()