- https://github.com/ksopyla/awesome-nlp-polish
- Sentiment: https://pypi.org/project/sentimentpl/
- Auto correct: https://github.com/filyp/autocorrect
- other: https://github.com/sdadas/polish-nlp-resources
- papers: https://homados.ipipan.waw.pl/?page_id=93

In [1]:
import pandas as pd
import numpy as np
import spacy
from sentimentpl.models import SentimentPLModel
from autocorrect import Speller
from transformers import HerbertTokenizer, RobertaModel

import re

from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

import unicodedata

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
import polyglot
from polyglot.text import Text, Word

In [3]:
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer#for word embedding

import gensim
from gensim.models import Word2Vec

In [4]:
nlp_core = spacy.load("pl_core_news_lg")

In [5]:
model = SentimentPLModel(from_pretrained='latest')

In [6]:
spell = Speller('pl')

In [7]:
# tokenizer_herbert = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
# model_roberta = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

In [8]:
# nlp_pl = spacy.load('pl_spacy_model') # or spacy.load('pl_spacy_model_morfeusz')

In [9]:
def clean_przyp(txt):
    if txt != txt:
        return np.nan
    
    txt_out = txt
    
    if "przyp. Demagog" in txt:
        txt_out = (txt_out
                   .replace('(','').replace(')','')
                   .replace(' – przyp. Demagog','')
                   .replace('- red.', ''))
    if "(…)" in txt:
        txt_out =  txt_out.replace('(…)','')
    if "(...)" in txt:
        txt_out =  txt_out.replace('(...)','')
    if "[" in txt:
        txt_out = txt_out.replace('[','').replace(']','')
        
    txt_out = re.sub("@[A-Za-z0-9]+","",txt_out) #Remove @ sign
    txt_out = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", txt_out) #Remove http links
    
    txt_out = unicodedata.normalize("NFKD", txt_out) #cleaning html
    
    txt_out = txt_out.replace(';', '.').replace('  ', ' ')
    
    return txt_out

In [10]:
def extract_features(txt):
    
    doc = nlp_core(txt)
    
    out_dict = {}
    
    lemmas_list = []
    tokens_list = []
    sentiments_list = []
    embeddings_list = []

    error_n = 0

    adj_n = 0
    adv_n = 0
    noun_n = 0
    ent_n = 0

    out_dict['sentiment_all'] = model(doc.text).item()
    
    for i, sent in enumerate(doc.sents):
        s = model(sent.text).item()
        sentiments_list.append(s)
    
    out_dict['sentiment_avg'] = np.mean(sentiments_list)


    for token in doc:
        if token.pos_ not in ['SPACE', 'PUNCT']:
            lemmas_list.append(token.lemma_)
            tokens_list.append(token.text)
            corrected = spell(token.text)
            if corrected != token.text:
                error_n += 1

        if token.pos_ == 'ADJ': 
            adj_n += 1
        elif token.pos_ == 'ADV':
            adv_n += 1
        elif token.pos_ == 'NOUN':
            noun_n += 1

    for ent in doc.ents:
        ent_n += 1

    tokens_list = list(set(tokens_list))
    lemmas_list = list(set(lemmas_list))

    out_dict['uniq_words'] = len(tokens_list)
    out_dict['sentiment_lemm'] =  len(lemmas_list)
    out_dict['err'] =  error_n
    out_dict['net'] = ent_n
    out_dict['ADJ'] = adj_n/len(tokens_list)
    out_dict['ADV'] = adv_n/len(tokens_list)
    out_dict['NOUN'] = noun_n/len(tokens_list)
    
    return out_dict

# Demagog

In [11]:
df = pd.read_csv('../datasets/scrapped/demagog.csv', sep=';')
df.shape

(4924, 4)

In [12]:
df = df.dropna()
df = df[df['text'].str.len() > 0 ]
df.shape

(4922, 4)

In [13]:
df['text_clean'] = df['text'].apply(lambda x: clean_przyp(x))

In [14]:
df = df[df['text_clean'].str.len() > 1 ]
df.shape

(4917, 5)

In [17]:
df['raw_dict'] = df['text_clean'].progress_apply(lambda x: extract_features(x))

100%|███████████████████████████████████████| 4917/4917 [19:07<00:00,  4.28it/s]


In [18]:
df_clean = df.join( df['raw_dict'].apply(pd.Series).rename(columns={'sentiment_lemm' : 'uniq_lemm'}) ).drop('raw_dict', axis=1)

In [19]:
df_clean.to_csv('../datasets/scrapped/demagog_features.csv', sep=';', index=False)

# OKO.press

In [24]:
df_oko_raw = pd.read_csv('../datasets/oko.press/query_result.tsv', sep='\t')

print(df_oko_raw['Id wypowiedzi'].unique().shape)

df_oko = pd.merge(
    df_oko_raw.pivot(index=['Id wypowiedzi'], columns='Nazwa pola danych', values='Wartość pola danych').reset_index(),
    df_oko_raw[~df_oko_raw['Autor Wypowiedzi'].isin(['Link do hasła', 'Nazwa słupka', 'Wesprzyj nas'])] \
        [['Id wypowiedzi', 'Autor Wypowiedzi']].dropna(),
    on='Id wypowiedzi',
    how='left'
)

print(df_oko.shape)

(2869,)
(2869, 9)


In [25]:
df_oko.head()

Unnamed: 0,Id wypowiedzi,sub_date,sub_hiperlacze,sub_napis,sub_napis_autor_wypowiedzi,sub_podpis,sub_stan_zegara,sub_title_text_after,Autor Wypowiedzi
0,1069,20160511,http://www.polskieradio.pl/7/129/Artykul/16180...,Rządy Tuska to również doprowadzenie do wyzysk...,1067,"„Sygnały Dnia”, Polskie Radio",falsz,Fałsz - wycieka kilka razy mniej,Stanisław Piotrowicz
1,1172,20160511,,Absurdy i marnotrawstwo pokazuje najlepiej pro...,1026,Sejm,blisko_prawdy,"Jest moździerz, nie ma amunicji",Antoni Macierewicz
2,1180,20160511,,Przez 15 lat finansowaliście budowę korwety Ga...,1026,Sejm,blisko_prawdy,Niedokończony okręt za miliard złotych,Antoni Macierewicz
3,1261,20160516,"http://www.tokfm.pl/Tokfm/1,145400,20083911,mo...",Dzisiaj ponad 65 procent długu państwowego jes...,1257,TOK FM,falsz,Fałsz - pomylił się o 92 miliardy,Mateusz Morawiecki
4,1411,20160512,https://www.wprost.pl/kraj/10006923/Polska-jes...,"Polska jest gotowa przyjąć każdego uchodźcę, k...",1076,Wywiad dla tygodnika „Mclean’s” za: Prezydent.pl,falsz,Fałsz - Duda bez pokrycia,Andrzej Duda


In [26]:
df_oko_fin = df_oko[['sub_napis', 'sub_stan_zegara', 'Autor Wypowiedzi']]

In [27]:
df_oko_fin['text_clean'] = df_oko_fin['sub_napis'].progress_apply(lambda x: clean_przyp(x))

100%|███████████████████████████████████| 2869/2869 [00:00<00:00, 191701.05it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oko_fin['text_clean'] = df_oko_fin['sub_napis'].progress_apply(lambda x: clean_przyp(x))


In [28]:
df_oko_fin['raw_dict'] = df_oko_fin['text_clean'].progress_apply(lambda x: extract_features(x))

100%|███████████████████████████████████████| 2869/2869 [10:48<00:00,  4.43it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oko_fin['raw_dict'] = df_oko_fin['text_clean'].progress_apply(lambda x: extract_features(x))


In [29]:
df_oko_clean = df_oko_fin.join( df_oko_fin['raw_dict'].apply(pd.Series).rename(columns={'sentiment_lemm' : 'uniq_lemm'}) ).drop('raw_dict', axis=1)

In [31]:
df_oko_clean.to_csv('../datasets/oko.press/okopress_features.csv', sep=';', index=False)