<a href="https://colab.research.google.com/github/MichalKucko/NLP/blob/master/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr  
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional, SimpleRNN, GRU
from gensim.models import FastText
from gensim.utils import tokenize
from sklearn.decomposition import PCA
from sklearn import svm
import matplotlib.pyplot as plt
import regex as re
from google.colab import files

Using TensorFlow backend.


In [4]:
# Download and unpack the pre-trained Polish FastText vectors (leaves cc.pl.300.bin in the working directory).
!curl -o cc.pl.300.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz
!gunzip cc.pl.300.bin.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4294M  100 4294M    0     0  38.9M      0  0:01:50  0:01:50 --:--:-- 26.4M


In [0]:
# Parsing with UDPipe: fetch the 1.2.0 binary release, unpack it and delete the archive.
! wget https://github.com/ufal/udpipe/releases/download/v1.2.0/udpipe-1.2.0-bin.zip
! unzip udpipe-1.2.0-bin.zip
! rm udpipe-1.2.0-bin.zip

In [0]:
import os
# Append the unpacked UDPipe Linux binaries to PATH so the `! udpipe ...` cells can find the executable.
os.environ['PATH'] += ':udpipe-1.2.0-bin/bin-linux64/'

In [0]:
# Download the Polish UDPipe model file used by the tokenize/tag/parse cells below.
! wget http://mozart.ipipan.waw.pl/~alina/Polish_dependency_parsing_models/190423_PDBUD_ttp_embedd.udpipe

In [0]:
# Tokenize the training text; the result is written to train_tokenised.conllu.
# NOTE(review): training_set_clean_only_text.txt is only uploaded by the files.upload() cell further down - on a fresh runtime run that cell first.
! udpipe --tokenize --outfile=train_tokenised.conllu 190423_PDBUD_ttp_embedd.udpipe training_set_clean_only_text.txt

In [0]:
# Tag and parse the tokenized training text; the result is written to train_udpipe_parsed.conllu.
! udpipe --tag --parse --outfile=train_udpipe_parsed.conllu 190423_PDBUD_ttp_embedd.udpipe train_tokenised.conllu

In [0]:
# Tokenize the test text; the result is written to test_tokenised.conllu.
# NOTE(review): test_set_clean_only_text.txt is only uploaded by the files.upload() cell further down - on a fresh runtime run that cell first.
! udpipe --tokenize --outfile=test_tokenised.conllu 190423_PDBUD_ttp_embedd.udpipe test_set_clean_only_text.txt

In [0]:
# Tag and parse the tokenized test text; the result is written to test_udpipe_parsed.conllu.
! udpipe --tag --parse --outfile=test_udpipe_parsed.conllu 190423_PDBUD_ttp_embedd.udpipe test_tokenised.conllu

In [8]:
# Upload the data files through the Colab file picker (texts, labels, parsed CoNLL-U files and the sentiment dictionary).
uploaded = files.upload()

Saving slownikWydzwieku01.csv to slownikWydzwieku01.csv
Saving test_set_clean_only_tags.txt to test_set_clean_only_tags.txt
Saving test_set_clean_only_text.txt to test_set_clean_only_text.txt
Saving test_udpipe_parsed.conllu to test_udpipe_parsed.conllu
Saving train_udpipe_parsed.conllu to train_udpipe_parsed.conllu
Saving training_set_clean_only_tags.txt to training_set_clean_only_tags.txt
Saving training_set_clean_only_text.txt to training_set_clean_only_text.txt


In [0]:
# Load the raw texts (one example per line, trailing newline kept) and their integer labels.
# The encoding is stated explicitly so the Polish diacritics decode the same way on every platform;
# without it open() falls back to the locale default, which is not UTF-8 everywhere.
with open('training_set_clean_only_text.txt', 'r', encoding='utf-8') as f:
  train_text = f.readlines()
with open('test_set_clean_only_text.txt', 'r', encoding='utf-8') as f:
  test_text = f.readlines()
train_tags = np.loadtxt('training_set_clean_only_tags.txt', dtype=int)
test_tags = np.loadtxt('test_set_clean_only_tags.txt', dtype=int)

# Load the UDPipe parsing results as raw CoNLL-U lines.
with open('train_udpipe_parsed.conllu', 'r', encoding='utf-8') as f:
  train_parsed = f.readlines()
with open('test_udpipe_parsed.conllu', 'r', encoding='utf-8') as f:
  test_parsed = f.readlines()

# Load the sentiment dictionary (tab-separated, no header row).
# sentimentDict maps the value in column 0 to the list of the remaining column values of that row.
sentiment = pd.read_csv("slownikWydzwieku01.csv",sep="\t",header=None)
sentimentDict = sentiment.set_index(0).T.to_dict('list')

In [0]:
# Baseline: draw the class of every test example at random, following the class frequencies of the training set.
BASELINE_SEED = 0   # fixed seed so the downloaded baseline file can be regenerated exactly
freq0 = len(train_tags[train_tags==0])/len(train_tags)
print('Odsetek próbek klasy 0:', freq0)
# A local generator is used so that seeding the baseline does not touch the global NumPy random state.
baselineRng = np.random.RandomState(BASELINE_SEED)
out = ['0\n' if draw < freq0 else '1\n' for draw in baselineRng.uniform(size=len(test_text))]
with open('resultsBaseline.txt', 'w') as f:
  f.writelines(out)
files.download('resultsBaseline.txt')

Odsetek próbek klasy 0: 0.915247485310228


In [0]:
# Load the FastText word-vector model (presumably resolves to the cc.pl.300.bin file downloaded earlier - confirm against the gensim version in use).
vecModel = FastText.load_fasttext_format('cc.pl.300')

In [0]:
# Lemmatization based on the parsed treebank files.
# `lemmas` maps the second tab-separated column of every parsed line to its third column
# (presumably the CoNLL-U FORM and LEMMA fields); entries from the test file override the training ones.

lemmas={}

for parsedLine in train_parsed + test_parsed:
  columns = parsedLine.split("\t")
  if len(columns) >= 3:
    lemmas[columns[1]] = columns[2]

def lemma(w):
  """Return the lemma stored for the word form `w` in the module-level `lemmas` table.

  When the table holds the placeholder '_' for `w`, a personal ending is
  stripped from the word ('em'/'eś', 'm'/'ś', 'śmy' or 'ście') and the lemma of
  the remaining stem is returned instead. If no ending matches, the
  placeholder itself is returned.

  Raises:
    KeyError: if `w`, or the stem left after stripping the ending, is not in `lemmas`.
  """
  if lemmas[w] != '_':
    return lemmas[w]
  # Endings are tried in this fixed order and the whole ending is removed.
  # The previous code cut only 3 characters for the 4-character 'ście',
  # which left a stem ending in 'ś' that is never present in the table.
  for ending in ('em', 'eś', 'm', 'ś', 'śmy', 'ście'):
    if w.endswith(ending):
      return lemmas[w[:-len(ending)]]
  return lemmas[w]

In [0]:
# Illustrative helper (not used when training the models): out-of-vocabulary word counts per example.
def getMistakeCnts(text):
  """Count, for every example in `text`, the tokens missing from the FastText vocabulary.

  Account placeholders and tokens containing 'http' are ignored.

  Args:
    text: sequence of strings, one example each.

  Returns:
    1-D float array with one count per example.
  """
  # The filters below contain '_' and '@', so the '@anonymized_account' placeholder reaches
  # this loop as the two tokens 'anonymized' and 'account'; comparing against the unsplit
  # name alone never matched and the placeholder was counted as two unknown words.
  placeholderTokens = ('anonymized_account', 'anonymized', 'account')
  cntVec = np.zeros(len(text))
  for i, sentence in enumerate(text):
    tokens = text_to_word_sequence(sentence, filters='!"#$%&()*+,-;<=>?@[\\]^_`{|}~\t\n')
    cntVec[i] = sum(
      1 for token in tokens
      if token not in placeholderTokens
      and not re.search('http', token)
      and token not in vecModel.wv.vocab
    )
  return cntVec
  
# Out-of-vocabulary counts for both data sets.
mistakesVec_train = getMistakeCnts(train_text)
mistakesVec_test = getMistakeCnts(test_text)

# How many training examples contain at least one unknown word, and how the count correlates with the label
# (the original note says the correlation is rather small).
print(sum(mistakesVec_train != 0), '/', len(train_text), 'zdań z błędami na zbiorze uczącym')
print(pearsonr(mistakesVec_train, train_tags))

In [0]:
# Sentence vectors: mean of the word vectors, optionally extended with extra feature columns.
def text2Vectors(text, embeddingDim = 300, lemmatize = False, addMistakesCnt = False, sentimentCols = (2,), mistakeMult = 10, sentimentMult = 10):
  """Convert every example in `text` into one fixed-length feature vector.

  The first `embeddingDim` columns hold the mean FastText vector of the tokens
  for which a vector could be obtained. Account placeholders and tokens
  containing 'http' are skipped.

  Args:
    text: sequence of strings, one example each.
    embeddingDim: length of the word vectors returned by `vecModel`.
    lemmatize: if true, a token is replaced by its lemma before the vector lookup.
    addMistakesCnt: if true, append one column with the number of tokens missing
      from the FastText vocabulary, multiplied by `mistakeMult`.
    sentimentCols: indices into the per-word value list of `sentimentDict`; one
      column per index is appended, holding the per-token mean multiplied by
      `sentimentMult` (the original note says sentiment values live in columns 1-4).
    mistakeMult, sentimentMult: multipliers that give the extra columns more weight.

  Returns:
    Float array with one row per example. Rows of examples without any usable
    token stay all-zero.
  """
  # The filters below contain '_' and '@', so the '@anonymized_account' placeholder reaches
  # the loop as the two tokens 'anonymized' and 'account'; comparing against the unsplit
  # name alone never matched and the placeholder leaked into the averages.
  placeholderTokens = ('anonymized_account', 'anonymized', 'account')
  vecLen = embeddingDim + len(sentimentCols)
  if addMistakesCnt: vecLen += 1
  sentVecs = np.zeros((len(text), vecLen))
  for i, sentence in enumerate(text):
    tokens = text_to_word_sequence(sentence, filters='!"#$%&()*+,-;<=>?@[\\]^_`{|}~\t\n')   # removes the characters listed in filters
    embVec = np.zeros(embeddingDim)
    sentimentVec = np.zeros(len(sentimentCols))
    cnt = 0
    misCnt = 0
    for token in tokens:
      if token in placeholderTokens or re.search('http', token):
        continue
      if addMistakesCnt and token not in vecModel.wv.vocab:
        misCnt += 1
      # Sentiment is always looked up by lemma; a word without a lemma or without a
      # dictionary entry contributes a neutral (all-zero) sentiment.
      sentimentWord = np.zeros(len(sentimentCols))
      try:
        tokLemma = lemma(token)
        if lemmatize: token = tokLemma
        if sentimentCols: sentimentWord = [sentimentDict[tokLemma][col] for col in sentimentCols]
      except KeyError:
        pass
      try:
        wordVec = vecModel.wv[token]
      except KeyError:
        continue   # no vector available for this token: leave it out of the averages
      embVec += wordVec
      sentimentVec += sentimentWord
      cnt += 1
    if not cnt:
      continue
    vec = embVec / cnt
    if addMistakesCnt: vec = np.append(vec, mistakeMult * misCnt)
    if sentimentCols: vec = np.append(vec, sentimentMult * sentimentVec / cnt)
    sentVecs[i,:] = vec
  return sentVecs

# Build the sentence vectors for both data sets with the default settings of text2Vectors.
sentVecsTrain = text2Vectors(train_text)
sentVecsTest = text2Vectors(test_text)

In [0]:
# PCA on the sentence vectors: only the first embeddingDim columns (the word-vector part) are reduced;
# the remaining feature columns are appended back unchanged.
embeddingDim = 300
newEmbeddingDim = 100
# random_state fixes the randomized SVD solver that PCA may select for inputs of this size,
# so re-running the cell yields the same projection.
pcaModel = PCA(n_components=newEmbeddingDim, random_state=0)
newEmbTrain = pcaModel.fit_transform(sentVecsTrain[:,:embeddingDim])
pcaSentVecsTrain = np.concatenate((newEmbTrain, sentVecsTrain[:,embeddingDim:]), axis=1)
# The test vectors are projected with the components fitted on the training set.
newEmbTest = pcaModel.transform(sentVecsTest[:,:embeddingDim])
pcaSentVecsTest = np.concatenate((newEmbTest, sentVecsTest[:,embeddingDim:]), axis=1)

In [0]:
# Bag-of-words features: the vectorizer is fitted on the training text and then applied to the test text.
vectorizer = CountVectorizer()
bags_train = vectorizer.fit_transform(train_text)
bags_test = vectorizer.transform(test_text)

In [0]:
# SVM on sentence vectors or on bag-of-words features.
# To train on another representation, swap the pair below for one of:
#   pcaSentVecsTrain, pcaSentVecsTest   (PCA-reduced sentence vectors)
#   bags_train, bags_test               (bag-of-words counts)
svmFeaturesTrain, svmFeaturesTest = sentVecsTrain, sentVecsTest

svmModel = svm.SVC(C = 10000, gamma='auto', class_weight = {0:0.1, 1:0.9})
svmModel.fit(svmFeaturesTrain, train_tags)
preds = svmModel.predict(svmFeaturesTest)

# Save the predicted labels, one integer per line, and download the file.
np.savetxt('resultsSVM.txt', preds, fmt='%d')
files.download('resultsSVM.txt')

In [0]:
# Remove the @anonymized_account placeholder and URLs from the text used by the LSTM networks.
# The pattern is a raw string, because '\S' and '\s' are regex classes and not valid Python string escapes.
# The whitespace after a URL is optional, so a URL that ends the string is removed as well.
mentionOrUrl = r'@anonymized_account|http\S+\s?'
train_text_clean = [re.sub(mentionOrUrl, '', line) for line in train_text]
test_text_clean = [re.sub(mentionOrUrl, '', line) for line in test_text]

In [0]:
# Oversample the class-1 (hate speech) examples: each one is repeated 9 times and appended after the class-0 examples.
# Intended to be tried instead of, or together with, the class_weight argument used when fitting the models.
hateMask = train_tags==1
cleanTextArr = np.array(train_text_clean)

train_hate_text = np.repeat(cleanTextArr[hateMask], 9)
train_hate_tags = np.repeat(np.array(train_tags)[hateMask], 9)
train_text_long = np.concatenate((cleanTextArr[train_tags==0], train_hate_text))
train_tags_long = np.concatenate((train_tags[train_tags==0], train_hate_tags))

In [0]:
# Lemmatization of whole texts.
def lemmatizeText(text):
  """Return `text` with every kept token replaced by its lemma.

  Each example is split with gensim's `tokenize` (lower-cased; per the original
  note this drops digits and punctuation). The token 'anonymized_account' and
  tokens containing 'http' are dropped. Tokens for which `lemma` raises
  KeyError are kept unchanged. The result is one space-joined string per example.
  """
  def lemmaOrToken(token):
    # Fall back to the surface form when no lemma is known.
    try:
      return lemma(token)
    except KeyError:
      return token

  lemmatized = []
  for sentence in text:
    kept = [
      token for token in tokenize(sentence, lowercase = True)
      if not (token == 'anonymized_account' or re.search('http', token))
    ]
    lemmatized.append(' '.join(lemmaOrToken(token) for token in kept))
  return lemmatized

# Lemmatized versions of both data sets (an optional alternative input for the tokenizer of the LSTM cell).
train_text_lemmas = lemmatizeText(train_text)
test_text_lemmas = lemmatizeText(test_text)

In [0]:
# LSTM network that learns its own word vectors.

maxLen = 40   # maximum sentence length (must be fixed, the Embedding layer is given it as input_length)
embeddingDim = 100   # length of the learned word vectors, also used as the LSTM width

# Tokenize and convert the texts into padded sequences of token indices.
# (Pass train_text_lemmas to both tokenizer calls instead to train on the lemmatized text.)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_clean)
train_seqs = pad_sequences(tokenizer.texts_to_sequences(train_text_clean), padding='post', maxlen=maxLen)

# Build and train the model.
# Alternatives tried for the recurrent layer: a plain LSTM, SimpleRNN or GRU (all with unroll=True),
# optionally followed by Dropout(0.4).
lstmModel = Sequential([
  Embedding(len(tokenizer.word_index) + 1, embeddingDim, input_length=maxLen),
  Bidirectional(LSTM(embeddingDim, unroll=True)),
  Dense(1, activation='sigmoid'),
])
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
lstmModel.summary()
history = lstmModel.fit(train_seqs, train_tags, epochs=10, class_weight = {0:0.1, 1:0.9})

In [0]:
# Word-vector matrix used by the LSTM networks.
def getVectorMatrix(tokenizer, embeddingDim = 300, addMistakes = False, sentimentCols = (), mistakeMult = 1, sentimentMult = 100):
  """Build the matrix whose row `i` is the feature vector of the word with tokenizer index `i`.

  Args:
    tokenizer: object exposing `word_index` (word -> row index).
    embeddingDim: length of the vectors returned by `vecModel`.
    addMistakes: if true, append one column that is `mistakeMult` when the word
      is missing from the FastText vocabulary and 0 otherwise.
    sentimentCols: indices into the per-word value list of `sentimentDict`; one
      column per index is appended, multiplied by `sentimentMult` (the original
      note says sentiment values live in columns 1-4).
    mistakeMult, sentimentMult: multipliers that give the extra columns more weight.

  Returns:
    Float array with len(word_index) + 1 rows. Row 0 and the rows of words for
    which no vector could be obtained stay all-zero.
  """
  extraCols = len(sentimentCols) + (1 if addMistakes else 0)
  wordIndex = tokenizer.word_index
  vecMatrix = np.zeros((len(wordIndex) + 1, embeddingDim + extraCols))
  for word, index in wordIndex.items():
    # Sentiment is looked up by lemma; unknown lemmas or dictionary entries give zeros.
    try:
      wordLemma = lemma(word)
      if sentimentCols: sentimentVec = np.array([sentimentDict[wordLemma][col] for col in sentimentCols])
    except KeyError:
      if sentimentCols: sentimentVec = np.zeros(len(sentimentCols))
    try:
      pieces = [vecModel.wv[word]]
    except KeyError:
      continue   # no vector for this word: its row stays zero
    if addMistakes:
      isMissing = 0 if word in vecModel.wv.vocab else 1
      pieces.append([isMissing * mistakeMult])
    if sentimentCols:
      pieces.append(sentimentVec * sentimentMult)
    vecMatrix[index] = np.concatenate(pieces)
  return vecMatrix

In [0]:
# LSTM network on pre-trained FastText word vectors: the Embedding layer gets fixed weights that are not trained.

maxLen = 40   # maximum sentence length
embeddingDim = 300   # length of the FastText vectors
newEmbeddingDim = 100   # vector length after PCA, also used as the LSTM width

# Tokenize and convert the texts into padded sequences of token indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_clean)
train_seqs = pad_sequences(tokenizer.texts_to_sequences(train_text_clean), padding='post', maxlen=maxLen)

# Word-vector matrix, one row per tokenizer index.
vecMatrix = getVectorMatrix(tokenizer)

# PCA on the word-vector part; any extra columns are appended back unchanged.
pcaModel = PCA(n_components=newEmbeddingDim)
newEmbTrain = pcaModel.fit_transform(vecMatrix[:,:embeddingDim])
pcaVecMatrix = np.hstack((newEmbTrain, vecMatrix[:,embeddingDim:]))

# Build and train the model (alternatives tried: a plain LSTM layer, an extra Dropout(0.5)).
lstmModel = Sequential([
  Embedding(len(tokenizer.word_index) + 1, pcaVecMatrix.shape[1], input_length=maxLen, weights=[pcaVecMatrix], trainable=False),
  Bidirectional(LSTM(newEmbeddingDim, unroll=True)),
  Dense(1, activation='sigmoid'),
])
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
lstmModel.summary()
history = lstmModel.fit(train_seqs, train_tags, epochs=10, class_weight = {0:0.1, 1:0.9})

In [0]:
# Test either of the two Embedding-based networks above, using whichever `tokenizer` and `lstmModel` were built last.
# Words that the training tokenizer has not seen have no index and are dropped from the sequences.
test_seqs = pad_sequences(tokenizer.texts_to_sequences(test_text_clean), padding='post', maxlen=maxLen)

# Threshold the sigmoid outputs at 0.5 in place, then save one integer label per line.
preds = lstmModel.predict(test_seqs)
preds[preds > 0.5] = 1
preds[preds <= 0.5] = 0
np.savetxt('resultsLSTM.txt', preds, fmt='%d')
files.download('resultsLSTM.txt')

In [0]:
# źródło: https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
# liczenie miary F1 i błędu F1 do uczenia modelu

from keras import backend as K
import tensorflow as tf

def _f1FromPredictions(y_true, y_pred):
  """Mean F1 over the output columns, computed from sums over the batch axis.

  `y_pred` may hold hard 0/1 decisions or raw probabilities; the same formula
  is applied to both. NaN scores are replaced by 0 before averaging.
  """
  truePos = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
  falsePos = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
  falseNeg = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

  precision = truePos / (truePos + falsePos + K.epsilon())
  recall = truePos / (truePos + falseNeg + K.epsilon())

  score = 2*precision*recall / (precision+recall+K.epsilon())
  # NOTE(review): tf.is_nan is the TF 1.x spelling - confirm the installed TensorFlow still provides it.
  score = tf.where(tf.is_nan(score), tf.zeros_like(score), score)
  return K.mean(score)

def f1(y_true, y_pred):
  """F1 metric: the predictions are rounded to hard decisions before scoring."""
  return _f1FromPredictions(y_true, K.round(y_pred))

def f1_loss(y_true, y_pred):
  """F1 loss for training: 1 minus the F1 score computed on the unrounded predictions."""
  return 1 - _f1FromPredictions(y_true, y_pred)

In [0]:
# LSTM network fed directly with FastText vectors (no Embedding layer).

embeddingDim = 300   # length of the FastText vectors
maxLen = 40   # maximum sentence length

# Tokenize and convert the training texts into padded sequences of token indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_clean)
train_seqs = tokenizer.texts_to_sequences(train_text_clean)
train_seqs = pad_sequences(train_seqs, padding='post', maxlen=maxLen)

# Word-vector matrix, one row per tokenizer index.
vecMatrix = getVectorMatrix(tokenizer)
print(vecMatrix.shape)

# Optional PCA (disabled): fit PCA(n_components=50) on vecMatrix[:,:embeddingDim], append the remaining
# columns back and index into that reduced matrix below instead of vecMatrix.

# Replace every index by its vector (what an Embedding layer would do).
train_seqs = np.array([vecMatrix[seq,] for seq in train_seqs])

# Validation inputs must have the same per-timestep vector form as the training inputs.
# They are built with a tokenizer fitted on the test text, so words unseen in training still get FastText vectors.
# Separate names are used so the `tokenizer` / `vecMatrix` / `test_seqs` globals of other cells are not touched.
valTokenizer = Tokenizer()
valTokenizer.fit_on_texts(test_text_clean)
valSeqs = pad_sequences(valTokenizer.texts_to_sequences(test_text_clean), padding='post', maxlen=maxLen)
valVecMatrix = getVectorMatrix(valTokenizer)
valSeqs = np.array([valVecMatrix[seq,] for seq in valSeqs])

# Build and train the model (alternatives tried: a stacked Bidirectional LSTM(100, return_sequences=True), Dropout(0.4)).
lstmModel = Sequential()
lstmModel.add(Bidirectional(LSTM(20)))
lstmModel.add(Dense(1, activation='sigmoid'))
# validation_data is an argument of fit(), not compile(): extra keyword arguments given to compile()
# are forwarded to the backend, so it is passed to fit() below instead.
lstmModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', f1])
history = lstmModel.fit(train_seqs, train_tags, epochs=10, class_weight = {0:0.1, 1:0.9}, batch_size = 200, validation_data=(valSeqs, test_tags))

In [0]:
# Test the vector-input network above. The word-vector matrix is rebuilt from the test vocabulary,
# so words unseen in training still get their FastText vectors.

# Tokenize and convert the test texts into padded sequences of token indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_text_clean)
test_seqs = pad_sequences(tokenizer.texts_to_sequences(test_text_clean), padding='post', maxlen=maxLen)

# Word-vector matrix for the test vocabulary.
# (If the PCA variant was used for training, project vecMatrix[:,:embeddingDim] with pcaModel.transform,
# append the remaining columns back and index into that matrix instead.)
vecMatrix = getVectorMatrix(tokenizer)

# Replace every index by its vector (what an Embedding layer would do).
test_seqs = vecMatrix[test_seqs]

# Predict, threshold the sigmoid outputs at 0.5 in place and save one integer label per line.
preds = lstmModel.predict(test_seqs)
preds[preds > 0.5] = 1
preds[preds <= 0.5] = 0
np.savetxt('resultsLSTM.txt', preds, fmt='%d')
files.download('resultsLSTM.txt')

In [0]:
# Evaluate the compiled loss and metrics on the test set, passing the whole set as a single batch.
lstmModel.evaluate(test_seqs, test_tags, batch_size=len(test_seqs))

In [0]:
# Removes words occurring fewer than count_thres times (in the end this helper is not used).
def preprocess(text, count_thres):
  """Fit a tokenizer on `text`, drop rare words and the account placeholder, and encode `text`.

  Args:
    text: sequence of strings, one example each.
    count_thres: words whose total count is below this value are removed.

  Returns:
    Tuple (tokenizer, text_seqs, before_cnt): the pruned tokenizer, the index
    sequences of `text` produced by it, and the vocabulary size before pruning.
  """
  tokenizer = Tokenizer(filters='!"#$%&()*+,-;<=>?@[\\]^_`{|}~\t\n')
  tokenizer.fit_on_texts(text)
  before_cnt = len(tokenizer.word_index)
  # A set avoids removing a word twice when a placeholder part is also a rare word.
  words_to_remove = {'anonymized', 'account'}
  words_to_remove.update(w for w, c in tokenizer.word_counts.items() if c < count_thres)
  for w in words_to_remove:
    # pop() with a default: the placeholder parts need not occur in `text` at all.
    tokenizer.word_index.pop(w, None)
    tokenizer.word_docs.pop(w, None)
    tokenizer.word_counts.pop(w, None)
  # Encode the text that was passed in (previously the global train_text was encoded regardless of the argument).
  text_seqs = tokenizer.texts_to_sequences(text)
  return (tokenizer, text_seqs, before_cnt)