<a href="https://colab.research.google.com/github/MichalKucko/NLP/blob/master/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr  
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional, SimpleRNN, GRU
from gensim.models import FastText
from gensim.utils import tokenize
from sklearn.decomposition import PCA
from sklearn import svm
import matplotlib.pyplot as plt
import regex as re
from google.colab import files

Using TensorFlow backend.


In [0]:
# Upload the data files into the runtime (google.colab file-upload helper).
# The value returned by files.upload() is kept in `uploaded`.
uploaded = files.upload()

In [1]:
# Download and unpack the FastText vectors (cc.pl.300.bin.gz -> cc.pl.300.bin).
!curl -o cc.pl.300.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz
!gunzip cc.pl.300.bin.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4294M  100 4294M    0     0  38.8M      0  0:01:50  0:01:50 --:--:-- 19.9M


In [0]:
# Parsing with UDPipe: fetch the 1.2.0 binary release, unpack it and delete the archive.
! wget https://github.com/ufal/udpipe/releases/download/v1.2.0/udpipe-1.2.0-bin.zip
! unzip udpipe-1.2.0-bin.zip
! rm udpipe-1.2.0-bin.zip

In [0]:
import os

# Make the UDPipe Linux binaries reachable from the `!` shell commands below.
# The directory is appended only when it is not already listed, so re-running
# this cell does not keep growing PATH with duplicate entries.
udpipe_bin_dir = 'udpipe-1.2.0-bin/bin-linux64/'
current_path = os.environ.get('PATH', '')
if udpipe_bin_dir not in current_path.split(os.pathsep):
  os.environ['PATH'] = current_path + os.pathsep + udpipe_bin_dir

In [0]:
# Download the Polish dependency-parsing model file used by the udpipe calls below.
! wget http://mozart.ipipan.waw.pl/~alina/Polish_dependency_parsing_models/190423_PDBUD_ttp_embedd.udpipe

In [0]:
# Tokenize the training texts; the result is written to train_tokenised.conllu.
! udpipe --tokenize --outfile=train_tokenised.conllu 190423_PDBUD_ttp_embedd.udpipe training_set_clean_only_text.txt

In [0]:
# Tag and parse the tokenized training texts; the result is written to train_udpipe_parsed.conllu.
! udpipe --tag --parse --outfile=train_udpipe_parsed.conllu 190423_PDBUD_ttp_embedd.udpipe train_tokenised.conllu

In [0]:
# Tokenize the test texts; the result is written to test_tokenised.conllu.
! udpipe --tokenize --outfile=test_tokenised.conllu 190423_PDBUD_ttp_embedd.udpipe test_set_clean_only_text.txt

In [0]:
# Tag and parse the tokenized test texts; the result is written to test_udpipe_parsed.conllu.
! udpipe --tag --parse --outfile=test_udpipe_parsed.conllu 190423_PDBUD_ttp_embedd.udpipe test_tokenised.conllu

In [0]:
# Load the data: one example per line of text, plus the integer class labels.
# To experiment with stemmed input, swap in 'train_stems.txt' / 'test_stems.txt'.
# The encoding is passed explicitly so the Polish diacritics are decoded the same
# way regardless of the platform default (assumes the files are UTF-8).
with open('training_set_clean_only_text.txt', 'r', encoding='utf-8') as f:
  train_text = f.readlines()
with open('test_set_clean_only_text.txt', 'r', encoding='utf-8') as f:
  test_text = f.readlines()
# The labels are required by the baseline, SVM and LSTM cells further down,
# so they have to be loaded here for the notebook to run top to bottom.
train_tags = np.loadtxt('training_set_clean_only_tags.txt', dtype=int)

In [0]:
# Load the parsing results (the .conllu files written by the udpipe --tag --parse calls).
# The encoding is explicit so that word forms with Polish diacritics are decoded
# identically on every platform (assumes the files are UTF-8).
with open('train_udpipe_parsed.conllu', 'r', encoding='utf-8') as f:
  train_parsed = f.readlines()
with open('test_udpipe_parsed.conllu', 'r', encoding='utf-8') as f:
  test_parsed = f.readlines()

In [0]:
# Baseline: draw the class of every test example at random, following the
# class frequencies observed in the training set.
zero_class_count = len(train_tags[train_tags == 0])
freq0 = zero_class_count / len(train_tags)
print('Odsetek próbek klasy 0:', freq0)
out = []
for sample_idx in range(len(test_text)):
  # one uniform draw per test example; below freq0 means class 0
  drawn = np.random.uniform()
  out.append('0\n' if drawn < freq0 else '1\n')
with open('resultsBaseline.txt', 'w') as f:
  f.writelines(out)
files.download('resultsBaseline.txt')

Odsetek próbek klasy 0: 0.915247485310228


In [0]:
# Load the FastText word-vector model.
# NOTE(review): the path has no extension; presumably gensim resolves it to the
# unpacked cc.pl.300.bin from the download cell - confirm for the installed gensim version.
vecModel = FastText.load_fasttext_format('cc.pl.300')

In [0]:
# Lemmatizer based on the parsed (treebank-style) output.

# Maps the 2nd tab-separated field of a parsed line to its 3rd field.
# Lines with fewer than three fields are skipped; when a key repeats,
# the later line (train first, then test) wins.
lemmas = {}

for parsed_lines in (train_parsed, test_parsed):
  for row in parsed_lines:
    fields = row.split("\t")
    if len(fields) < 3:
      continue
    lemmas[fields[1]] = fields[2]

def lemma(w, table=None):
  """Return the lemma stored for word form ``w``.

  When the stored lemma is the placeholder ``'_'``, a trailing ending
  ('am', 'aś', 'em', 'eś', 'm', 'ś', 'śmy' or 'ście') is cut off and the
  remaining form is looked up instead.
  NOTE(review): presumably these are forms with an attached person ending
  that the parser leaves without a lemma - confirm against the parser output.

  Args:
    w: word form to look up.
    table: optional form -> lemma mapping; defaults to the module-level
      ``lemmas`` dictionary.

  Returns:
    The lemma string found in the mapping.

  Raises:
    KeyError: if ``w`` (or its stripped form) is not in the mapping.
      Callers rely on this to detect unknown words.
  """
  if table is None:
    table = lemmas
  if table[w] != '_':
    return table[w]
  if w[-2:] in ("am", "aś", 'em', 'eś'):
    third_pers = w[:-2]
  elif w[-1:] in ('m', 'ś'):   # slice instead of index: safe for an empty string
    third_pers = w[:-1]
  elif w[-3:] == 'śmy':
    third_pers = w[:-3]
  elif w[-4:] == 'ście':
    third_pers = w[:-4]   # the ending has four characters, so strip all four
  else:
    third_pers = w
  return table[third_pers]

In [0]:
# For every example, returns how many of its words are outside the vocabulary.
def getMistakeCnts(text):
  """Count, per example, the tokens that are missing from ``vecModel.wv.vocab``.

  Tokens equal to 'anonymized_account' and tokens containing 'http' are
  not counted. Returns a float numpy array with one count per example.
  """
  cntVec = np.zeros(len(text))
  for i, example in enumerate(text):
    tokens = text_to_word_sequence(example, filters='!"#$%&()*+,-;<=>?@[\\]^_`{|}~\t\n')
    cntVec[i] = sum(
      1
      for token in tokens
      if token != 'anonymized_account'
      and not re.search('http', token)
      and token not in vecModel.wv.vocab
    )
  return cntVec
  
mistakesVec_train = getMistakeCnts(train_text)
train_with_mistakes = sum(mistakesVec_train != 0)
print(train_with_mistakes, '/', len(train_text), 'zdań z błędami na zbiorze uczącym')
mistakes_vs_tags = pearsonr(mistakesVec_train, train_tags)
print(mistakes_vs_tags)   # the correlation is rather small
mistakesVec_test = getMistakeCnts(test_text)

In [0]:
# Converts sentences to vectors (the mean of their word vectors).
def text2Vectors(text, embeddingDim = 300, lemmatize = False, addMistakesnt = False):
  """Turn every example into the mean of the FastText vectors of its tokens.

  Args:
    text: sequence of strings, one example each.
    embeddingDim: length of a single word vector.
    lemmatize: when True, each token is replaced by ``lemma(token)`` before
      the vector lookup; a token whose lemma is unknown is skipped.
    addMistakesnt: when True, the number of tokens missing from
      ``vecModel.wv.vocab`` is appended as one extra feature.

  Returns:
    Array of shape (len(text), embeddingDim) or, with ``addMistakesnt``,
    (len(text), embeddingDim + 1). Examples without any usable token get zeros.
  """
  vecLen = embeddingDim + 1 if addMistakesnt else embeddingDim
  sentVecs = np.zeros((len(text), vecLen))
  for i in range(len(text)):
    tokens = list(tokenize(text[i], lowercase = True))   # drops digits and punctuation
    vec = np.zeros(embeddingDim)
    cnt = 0
    misCnt = 0
    for token in tokens:
      if token == 'anonymized_account' or re.search('http', token):
        continue
      if (addMistakesnt and token not in vecModel.wv.vocab):
        misCnt += 1
      try:
        # Lemmatize only when requested, and only once; with lemmatize=False
        # the vector of the surface form itself is used.
        word = lemma(token) if lemmatize else token
        vec += vecModel.wv[word]
        cnt += 1
      except KeyError:
        # unknown lemma, or no vector available for the word: skip the token
        continue
    sentVec = vec / cnt if cnt else np.zeros(embeddingDim)
    sentVecs[i,] = np.append(sentVec, misCnt) if addMistakesnt else sentVec
  return sentVecs

# Build the sentence vectors of both splits with identical settings.
sentVecOptions = dict(lemmatize = False, addMistakesnt = False)
sentVecsTrain = text2Vectors(train_text, **sentVecOptions)
sentVecsTest = text2Vectors(test_text, **sentVecOptions)


In [0]:
# PCA on the sentence vectors: fitted on the training vectors only,
# then the same projection is applied to the test vectors.
newVecLen = 50   # number of components kept
pcaModel = PCA(n_components=newVecLen)
pcaSentVecsTrain = pcaModel.fit_transform(sentVecsTrain)
pcaSentVecsTest = pcaModel.transform(sentVecsTest)

In [0]:
# Bag of words: the vocabulary is fitted on the training texts and reused for the test texts.
vectorizer = CountVectorizer()
bags_train = vectorizer.fit_transform(train_text)
bags_test = vectorizer.transform(test_text)

In [0]:
# SVM on the sentence vectors or on the bags of words.
# Exactly one fit/predict pair should be active; the commented lines are the
# PCA-reduced and bag-of-words variants.
# class_weight gives class 1 nine times the weight of class 0.
svmModel = svm.SVC(class_weight = {0:0.1, 1:0.9}, C = 10000, gamma='auto')
svmModel.fit(sentVecsTrain, train_tags)
#svmModel.fit(pcaSentVecsTrain, train_tags)
#svmModel.fit(bags_train, train_tags)
preds = svmModel.predict(sentVecsTest)
#preds = svmModel.predict(pcaSentVecsTest)
#preds = svmModel.predict(bags_test)
# one integer prediction per line
np.savetxt('resultsSVM.txt', preds, fmt='%d')
files.download('resultsSVM.txt')

In [0]:
# SVM on the mistake counts alone (just to see what comes out).
# NOTE: rebinds svmModel/preds and writes to 'resultsSVM.txt' again.
svmModel = svm.SVC(class_weight = {0:0.1, 1:0.9}, C = 1000, gamma='auto')
# reshape(-1, 1): the single feature has to be passed as a column
svmModel.fit(mistakesVec_train.reshape(-1, 1), train_tags)
preds = svmModel.predict(mistakesVec_test.reshape(-1, 1))
np.savetxt('resultsSVM.txt', preds, fmt='%d')
files.download('resultsSVM.txt')

In [0]:
# Strip the '@anonymized_account' mentions and links from both splits.
# The pattern is a raw string: '\S' and '\s' are regex escapes, and in a plain
# string literal they are invalid string escapes that Python warns about.
# A link is removed together with the single whitespace character that follows it.
mention_or_link = re.compile(r'@anonymized_account|http\S+\s')
train_text_clean = [mention_or_link.sub('', line) for line in train_text]
test_text_clean = [mention_or_link.sub('', line) for line in test_text]

In [0]:
# Oversampling of the hate-speech examples (to be tried instead of, or together
# with, the class_weight parameter used when fitting the models).
# Both classes are taken from the cleaned text so that mentions and links
# cannot act as a cue for class 0.
oversample_factor = 9   # how many copies of each class-1 example are kept
train_text_clean_arr = np.array(train_text_clean)
train_hate_text = np.repeat(train_text_clean_arr[train_tags==1], oversample_factor)
train_text_long = np.concatenate((train_text_clean_arr[train_tags==0], train_hate_text))
train_hate_tags = np.repeat(np.array(train_tags)[train_tags==1], oversample_factor)
# same ordering as train_text_long: all class-0 examples first, then the repeated class-1 ones
train_tags_long = np.concatenate((train_tags[train_tags==0], train_hate_tags))

In [0]:
# Removes the words that occur fewer than count_thres times.
def preprocess(text, count_thres):
  """Fit a Keras Tokenizer on ``text`` and drop rare words from it.

  The words 'anonymized' and 'account' are always removed, along with every
  word whose count is below ``count_thres``.

  Args:
    text: sequence of strings to fit the tokenizer on and to convert.
    count_thres: minimum number of occurrences for a word to be kept.

  Returns:
    Tuple ``(tokenizer, text_seqs, before_cnt)``: the pruned tokenizer, the
    index sequences of ``text`` and the vocabulary size before pruning.
  """
  tokenizer = Tokenizer(filters='!"#$%&()*+,-;<=>?@[\\]^_`{|}~\t\n')
  tokenizer.fit_on_texts(text)
  # a set, so a word that is both fixed and rare is only removed once
  words_to_remove = {'anonymized', 'account'}
  words_to_remove.update(w for w, c in tokenizer.word_counts.items() if c < count_thres)
  before_cnt = len(tokenizer.word_index)
  for w in words_to_remove:
    # pop with a default: the fixed words need not occur in every corpus
    tokenizer.word_index.pop(w, None)
    tokenizer.word_docs.pop(w, None)
    tokenizer.word_counts.pop(w, None)
  # convert the text that was passed in, not a module-level variable
  text_seqs = tokenizer.texts_to_sequences(text)
  return (tokenizer, text_seqs, before_cnt)

In [0]:
def lemmatizeText(text):
  """Return ``text`` with every token replaced by its lemma.

  Each example is tokenized in lowercase (digits and punctuation are dropped
  by the tokenizer). Tokens equal to 'anonymized_account' or containing
  'http' are left out; tokens without a known lemma are kept unchanged.
  The resulting tokens of an example are joined with single spaces.
  """
  def lemma_or_token(token):
    # fall back to the surface form when the lookup fails
    try:
      return lemma(token)
    except KeyError:
      return token

  text_lemma = []
  for example in text:
    kept = [
      lemma_or_token(token)
      for token in tokenize(example, lowercase = True)
      if not (token == 'anonymized_account' or re.search('http', token))
    ]
    text_lemma.append(' '.join(kept))
  return text_lemma

# Lemmatized versions of both splits (train first, then test).
train_text_lemmas, test_text_lemmas = (
  lemmatizeText(split_text) for split_text in (train_text, test_text)
)

In [0]:
# LSTM network that learns the word vectors by itself (trainable Embedding layer),
# fitted on the lemmatized training text.

maxLen = 40   # maximum sentence length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_lemmas)
train_seqs = tokenizer.texts_to_sequences(train_text_lemmas)
#tokenizer, train_seqs, before_cnt = preprocess(train_text, 5)
#maxLen = max([len(x) for x in train_seqs])
# every sequence ends up with exactly maxLen entries; padding is added at the end
train_seqs = pad_sequences(train_seqs, padding='post', maxlen=maxLen)

embeddingDim = 100   # length of the word vectors
lstmModel = Sequential()
# + 1 on the vocabulary size: index 0 is the padding value, word indices start at 1
lstmModel.add(Embedding(len(tokenizer.word_index) + 1, embeddingDim, input_length=maxLen))
# the commented layers below are alternative recurrent layers / regularisation to try
#lstmModel.add(LSTM(embeddingDim, unroll=True))
lstmModel.add(Bidirectional(LSTM(embeddingDim, unroll=True)))
#lstmModel.add(SimpleRNN(embeddingDim, unroll=True))
#lstmModel.add(GRU(embeddingDim, unroll=True))
#lstmModel.add(Dropout(0.4)) 
lstmModel.add(Dense(1, activation='sigmoid'))
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
lstmModel.summary()
# class_weight gives class 1 nine times the weight of class 0
history = lstmModel.fit(train_seqs, train_tags, epochs=10, class_weight = {0:0.1, 1:0.9})

In [0]:
# LSTM network with pre-trained word vectors (FastText) in a frozen Embedding layer,
# fitted on the cleaned training text.
# NOTE: rebinds tokenizer, train_seqs, pcaModel and lstmModel from earlier cells.

maxLen = 40   # maximum sentence length
embeddingDim = 300   # length of the word vectors (this is what FastText provides)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_clean)
train_seqs = tokenizer.texts_to_sequences(train_text_clean)
#maxLen = max([len(x) for x in train_seqs])
train_seqs = pad_sequences(train_seqs, padding='post', maxlen=maxLen)

# Row i holds the FastText vector of the word with tokenizer index i;
# row 0 (padding) and words for which the lookup raises KeyError stay all-zero.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embeddingDim))
for word, index in tokenizer.word_index.items():
  try:
    vector = vecModel.wv[word]
    embedding_matrix[index] = vector
  except KeyError:
    continue

newEmbeddingDim = 100    # length of the vectors after PCA
pcaModel = PCA(n_components=newEmbeddingDim)
# the PCA is fitted on the rows of the embedding matrix itself
embedding_matrix_pca = pcaModel.fit_transform(embedding_matrix)
#embedding_matrix_pca = embedding_matrix
 
lstmModel = Sequential()
# trainable=False: the pre-computed vectors are not updated during training
lstmModel.add(Embedding(len(tokenizer.word_index) + 1, newEmbeddingDim, input_length=maxLen, weights=[embedding_matrix_pca], trainable=False))
#lstmModel.add(LSTM(newEmbeddingDim))
lstmModel.add(Bidirectional(LSTM(newEmbeddingDim, unroll=True)))
#lstmModel.add(Dropout(0.5))
lstmModel.add(Dense(1, activation='sigmoid'))
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
lstmModel.summary()
# class_weight gives class 1 nine times the weight of class 0
history = lstmModel.fit(train_seqs, train_tags, epochs=10, class_weight = {0:0.1, 1:0.9})

In [0]:
# Evaluate the network on the test set (words not known to the tokenizer are dropped).
# NOTE(review): the input is test_text_lemmas, which matches a tokenizer fitted on
# train_text_lemmas. If the tokenizer/model in scope were fitted on train_text_clean,
# the matching input would presumably be test_text_clean - confirm which model is evaluated.
test_seqs = tokenizer.texts_to_sequences(test_text_lemmas)
test_seqs = pad_sequences(test_seqs, padding='post', maxlen=maxLen)
preds = lstmModel.predict(test_seqs)
# threshold the sigmoid outputs at 0.5 to get 0/1 labels
preds[preds > 0.5] = 1
preds[preds <= 0.5] = 0
np.savetxt('resultsLSTM.txt', preds, fmt='%d')
files.download('resultsLSTM.txt')

In [0]:
# LSTM network fed directly with FastText vectors (no Embedding layer).
# NOTE: rebinds tokenizer, maxLen, train_seqs and lstmModel from earlier cells.

#maxLen = 40   # maximum sentence length
embeddingDim = 300   # length of the word vectors (this is what FastText provides)
#tokenizer, train_seqs, before_cnt = preprocess(train_text, 3)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_clean)
train_seqs = tokenizer.texts_to_sequences(train_text_clean)
# here the longest training sequence determines the padded length
maxLen = max([len(x) for x in train_seqs])
train_seqs = pad_sequences(train_seqs, padding='post', maxlen=maxLen)

# Row i holds the FastText vector of the word with tokenizer index i;
# row 0 (padding) and words for which the lookup raises KeyError stay all-zero.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embeddingDim))
for word, index in tokenizer.word_index.items():
  try:
    vector = vecModel.wv[word]
    embedding_matrix[index] = vector
  except KeyError:
    continue

newEmbeddingDim = 100    # length of the vectors after PCA; with PCA disabled it is only used as the LSTM size below
#pcaModel = PCA(n_components=newEmbeddingDim)
#embedding_matrix_pca = pcaModel.fit_transform(embedding_matrix)
embedding_matrix_pca=embedding_matrix

# replace every word index by its vector: one (maxLen, embeddingDim) array per example
train_seqs = np.array([embedding_matrix_pca[seq,] for seq in train_seqs])

lstmModel = Sequential()
#lstmModel.add(LSTM(newEmbeddingDim))
# return_sequences=True so the second recurrent layer receives the whole sequence
lstmModel.add(Bidirectional(LSTM(newEmbeddingDim, unroll=True, return_sequences=True)))
lstmModel.add(Bidirectional(LSTM(50, unroll=True)))
#lstmModel.add(Dropout(0.45))
lstmModel.add(Dense(1, activation='sigmoid'))
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
#lstmModel.summary()
# class_weight gives class 1 nine times the weight of class 0
history = lstmModel.fit(train_seqs, train_tags, epochs=10, class_weight = {0:0.1, 1:0.9})

In [0]:
# Evaluate the network that is fed directly with FastText vectors.

#tokenizer, test_seqs, before_cnt = preprocess(test_text, 5)
# A fresh tokenizer is fitted on the test text; its indices are only used to look up
# rows of the embedding matrix built from the same tokenizer a few lines below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_text_clean)
test_seqs = tokenizer.texts_to_sequences(test_text_clean)
# NOTE(review): if train_seqs is the array of per-example vector matrices built in the
# training cell, len(x) is the padded training length, so this keeps maxLen as it was - confirm.
maxLen = max([len(x) for x in train_seqs])
test_seqs = pad_sequences(test_seqs, padding='post', maxlen=maxLen)

# Row i holds the FastText vector of the test word with index i;
# row 0 (padding) and words for which the lookup raises KeyError stay all-zero.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embeddingDim))
for word, index in tokenizer.word_index.items():
  try:
    vector = vecModel.wv[word]
    embedding_matrix[index] = vector
  except KeyError:
    continue

embedding_matrix_pca = embedding_matrix
#embedding_matrix_pca = pcaModel.transform(embedding_matrix)
# replace every word index by its vector
test_seqs = np.array([embedding_matrix_pca[seq,] for seq in test_seqs])

preds = lstmModel.predict(test_seqs)
# threshold the sigmoid outputs at 0.5 to get 0/1 labels
preds[preds > 0.5] = 1
preds[preds <= 0.5] = 0
np.savetxt('resultsLSTM.txt', preds, fmt='%d')
files.download('resultsLSTM.txt')

In [0]:
# Load the sentiment dictionary file: tab-separated, read without a header row.
slownik = pd.read_csv("slownikWydzwieku01.csv",sep="\t",header=None)

In [0]:
# Keep columns 0 and 3 of the dictionary and name them "słowo" / "ocena"
# (presumably the word and its sentiment score - verify against the file).
wydzwiek=slownik.iloc[:,[0,3]]
wydzwiek.columns=["słowo","ocena"]

In [0]:
def sentiment(text, vecLen = 300):
  """Return, for every example, the mean FastText vector of its tokens.

  NOTE(review): despite its name, this function does not consult the sentiment
  dictionary; it only averages word vectors - confirm the intended behaviour.

  Args:
    text: sequence of strings, one example each.
    vecLen: length of a single word vector. It is a parameter because no
      ``vecLen`` exists at module level; the default of 300 matches the
      FastText vector length used elsewhere in the notebook.

  Returns:
    Array of shape (len(text), vecLen); examples without any token that has
    a vector get an all-zero row.
  """
  sentVecs = np.zeros((len(text), vecLen))
  for i in range(len(text)):
    tokens = list(tokenize(text[i], lowercase = True))   # drops digits and punctuation
    vec = np.zeros(vecLen)
    cnt = 0
    for token in tokens:
      if token == 'anonymized_account':
        continue
      try:
        vec += vecModel.wv[token]
        cnt += 1
      except KeyError:
        # no vector available for this token: leave it out of the mean
        continue
    sentVecs[i,] = vec / cnt if cnt else np.zeros(vecLen)
  return sentVecs