<a href="https://colab.research.google.com/github/MichalKucko/NLP/blob/master/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr  
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional
from gensim.models import FastText
from gensim.utils import tokenize
from sklearn.decomposition import PCA
from sklearn import svm
import matplotlib.pyplot as plt
from google.colab import files

Using TensorFlow backend.


In [0]:
# Upload the data files from the local machine into the Colab runtime.
# The return value of files.upload() is kept in `uploaded`.
uploaded = files.upload()

Saving test_set_clean_only_tags.txt to test_set_clean_only_tags.txt
Saving test_set_clean_only_text.txt to test_set_clean_only_text.txt
Saving training_set_clean_only_tags.txt to training_set_clean_only_tags.txt
Saving training_set_clean_only_text.txt to training_set_clean_only_text.txt


In [0]:
# Download and unpack the pretrained FastText vectors (cc.pl.300.bin).
# The compressed archive is a multi-gigabyte download, so this cell takes a few minutes.
!curl -o cc.pl.300.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz
!gunzip cc.pl.300.bin.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4294M  100 4294M    0     0  29.8M      0  0:02:23  0:02:23 --:--:-- 30.1M
cc.pl.300.bin  sample_data


In [0]:
# Load the data: one text sample per line for both sets, plus the integer class
# label of every training sample.
# The encoding is pinned to UTF-8 because the files contain Polish text and the
# default encoding of open() depends on the platform locale.
with open('training_set_clean_only_text.txt', 'r', encoding='utf-8') as f:
  train_text = f.readlines()
with open('test_set_clean_only_text.txt', 'r', encoding='utf-8') as f:
  test_text = f.readlines()
train_tags = np.loadtxt('training_set_clean_only_tags.txt', dtype=int)

In [0]:
# Baseline: draw every test label at random, following the class frequencies
# observed in the training set.
freq0 = np.count_nonzero(train_tags == 0) / len(train_tags)
print('Odsetek próbek klasy 0:', freq0)
# One uniform draw per test sample; a draw below freq0 yields class 0, otherwise class 1.
out = ['1\n' if np.random.uniform() >= freq0 else '0\n' for _ in test_text]
with open('resultsBaseline.txt', 'w') as f:
  f.write(''.join(out))
files.download('resultsBaseline.txt')

Odsetek próbek klasy 0: 0.915247485310228


In [0]:
# Load the pretrained FastText word-vector model downloaded above.
# gensim is expected to resolve the 'cc.pl.300' prefix to the unpacked cc.pl.300.bin file.
# NOTE(review): FastText.load_fasttext_format is deprecated in newer gensim releases in
# favour of gensim.models.fasttext.load_facebook_model -- confirm against the installed version.
vecModel = FastText.load_fasttext_format('cc.pl.300')

In [0]:
# Turn sentences into vectors (mean of the word vectors)
def text2Vectors(text, vecLen = 300):
  """Represent every text as the average of its tokens' word vectors.

  Args:
    text: sequence of strings, one entry per sample.
    vecLen: dimensionality of the word vectors returned by ``vecModel.wv``.

  Returns:
    Array of shape ``(len(text), vecLen)``. A row stays all-zero when no token
    of the corresponding text yielded a vector.
  """
  sentVecs = np.zeros((len(text), vecLen))
  for row, sentence in enumerate(text):
    total = np.zeros(vecLen)
    found = 0
    # per the original author's note, tokenize() drops numbers and punctuation
    for word in tokenize(sentence, lowercase = True):
      if word != 'anonymized_account':   # placeholder token, carries no meaning
        try:
          total += vecModel.wv[word]
        except KeyError:
          # the model has no vector for this token; leave it out of the average
          pass
        else:
          found += 1
    if found:
      sentVecs[row,] = total / found
  return sentVecs

# Sentence-level vectors for both data sets (one row per text, default vector length).
sentVecsTrain = text2Vectors(train_text)
sentVecsTest = text2Vectors(test_text)

In [0]:
# PCA on the sentence vectors: reduce them to newVecLen components.
# The projection is fitted on the training vectors only and then applied to the test vectors.
newVecLen = 50
#tsneModel = TSNE(perplexity=40, n_components=ncomponents, init='pca', metric=metric, n_iter=2500, random_state=23)  # one could check whether t-SNE works better than PCA
# NOTE(review): no random_state is passed to PCA; if a randomized solver is selected,
# results may differ between runs -- confirm whether reproducibility matters here.
pcaModel = PCA(n_components=newVecLen)
pcaSentVecsTrain = pcaModel.fit_transform(sentVecsTrain)
pcaSentVecsTest = pcaModel.transform(sentVecsTest)

In [0]:
# Bag of words: the vocabulary is fitted on the training texts only and the
# same vectorizer is then applied to the test texts.
vectorizer = CountVectorizer()
bags_train = vectorizer.fit_transform(train_text)
bags_test = vectorizer.transform(test_text)

In [0]:
# SVM on the sentence vectors, with class 1 weighted 9x heavier than class 0.
# Alternative feature sets that were tried instead of (sentVecsTrain, sentVecsTest):
#   (pcaSentVecsTrain, pcaSentVecsTest)  - PCA-reduced sentence vectors
#   (bags_train, bags_test)              - bag-of-words counts
svmModel = svm.SVC(C = 1000, gamma='auto', class_weight = {0: 0.1, 1: 0.9}).fit(sentVecsTrain, train_tags)
preds = svmModel.predict(sentVecsTest)
# One integer prediction per line, then hand the file to the browser.
np.savetxt('resultsSVM.txt', preds, fmt='%d')
files.download('resultsSVM.txt')

In [0]:
# For every sample, report how many of its words are outside the model vocabulary
def getMistakeCnts(text):
  """Count out-of-vocabulary tokens per text.

  Args:
    text: sequence of strings, one entry per sample.

  Returns:
    1-D float array of length ``len(text)``; entry ``i`` is the number of tokens
    of ``text[i]`` that are missing from ``vecModel.wv.vocab``. The
    'anonymized_account' placeholder is never counted.
  """
  cntVec = np.zeros(len(text))
  for pos, sentence in enumerate(text):
    # per the original author's note, tokenize() throws away numbers
    cntVec[pos] = sum(
      1
      for word in tokenize(sentence, lowercase = True)
      if word != 'anonymized_account' and word not in vecModel.wv.vocab
    )
  return cntVec
  
# Out-of-vocabulary counts for both sets, plus a quick look at whether they relate to the label.
mistakesVec_train = getMistakeCnts(train_text)
mistakesVec_test = getMistakeCnts(test_text)
# Number of training samples with at least one out-of-vocabulary word.
print(np.count_nonzero(mistakesVec_train), '/', len(train_text), 'zdań z błędami na zbiorze uczącym')
print(pearsonr(mistakesVec_train, train_tags))   # there is hardly any correlation

2954 / 10041 zdań z błędami na zbiorze uczącym
(0.01720474005221367, 0.08472403401794718)


In [0]:
# SVM trained on a single feature: the out-of-vocabulary word count of each sample.
# reshape(-1, 1) turns the 1-D count vector into the (samples, features) layout SVC expects.
svmModel = svm.SVC(C = 1000, gamma='auto', class_weight = {0: 0.1, 1: 0.9}).fit(mistakesVec_train.reshape(-1, 1), train_tags)
preds = svmModel.predict(mistakesVec_test.reshape(-1, 1))
np.savetxt('resultsSVM.txt', preds, fmt='%d')
files.download('resultsSVM.txt')

In [0]:
# LSTM network that learns the word vectors itself (trainable Embedding layer)

maxLen = 40   # maximum sentence length
embeddingDim = 50   # length of the learned word vectors

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
# Alternative: derive maxLen from the longest training sequence instead of fixing it at 40.
# Sequences are padded at the end up to maxLen.
train_seqs = pad_sequences(tokenizer.texts_to_sequences(train_text), maxlen=maxLen, padding='post')

lstmModel = Sequential([
  # one extra row on top of the tokenizer vocabulary, for the padding index
  Embedding(len(tokenizer.word_index) + 1, embeddingDim, input_length=maxLen),
  LSTM(embeddingDim),
  Dropout(0.5),   # other dropout rates are worth trying
  Dense(1, activation='sigmoid'),
])
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
lstmModel.summary()
history = lstmModel.fit(train_seqs, train_tags, epochs=30)

In [0]:
# LSTM network on top of pretrained (FastText) word vectors, frozen in the Embedding layer

maxLen = 40   # maximum sentence length
embeddingDim = 300   # vector length (this is what FastText provides)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
# Alternative: derive maxLen from the longest training sequence instead of fixing it at 40.
train_seqs = pad_sequences(tokenizer.texts_to_sequences(train_text), maxlen=maxLen, padding='post')

# Row `index` holds the FastText vector of the word with that tokenizer index;
# rows of words without a vector (and the extra row 0) stay all-zero.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embeddingDim))
for word, index in tokenizer.word_index.items():
  try:
    vector = vecModel.wv[word]
  except KeyError:
    continue
  embedding_matrix[index] = vector

newEmbeddingDim = 50    # vector length after PCA
pcaModel = PCA(n_components=newEmbeddingDim)
embedding_matrix_pca = pcaModel.fit_transform(embedding_matrix)

lstmModel = Sequential([
  # pretrained, PCA-reduced weights; not updated during training
  Embedding(len(tokenizer.word_index) + 1, newEmbeddingDim, input_length=maxLen, weights=[embedding_matrix_pca], trainable=False),
  LSTM(newEmbeddingDim),
  Dropout(0.5),
  Dense(1, activation='sigmoid'),
])
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
lstmModel.summary()
history = lstmModel.fit(train_seqs, train_tags)

In [0]:
# Evaluate the network on the test set (words not seen during training are dropped)
test_seqs = pad_sequences(tokenizer.texts_to_sequences(test_text), maxlen=maxLen, padding='post')
preds = lstmModel.predict(test_seqs)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels (in place).
preds[preds <= 0.5] = 0
preds[preds > 0.5] = 1
np.savetxt('resultsLSTM.txt', preds, fmt='%d')
files.download('resultsLSTM.txt')

In [0]:
# LSTM network fed directly with FastText vectors (no Embedding layer)

maxLen = 40   # maximum sentence length
embeddingDim = 300   # vector length (this is what FastText provides)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
# Alternative: derive maxLen from the longest training sequence instead of fixing it at 40.
train_seqs = pad_sequences(tokenizer.texts_to_sequences(train_text), maxlen=maxLen, padding='post')

# Row `index` holds the FastText vector of the word with that tokenizer index;
# rows of words without a vector (and the extra row 0) stay all-zero.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embeddingDim))
for word, index in tokenizer.word_index.items():
  try:
    vector = vecModel.wv[word]
  except KeyError:
    continue
  embedding_matrix[index] = vector

newEmbeddingDim = 50    # vector length after PCA
pcaModel = PCA(n_components=newEmbeddingDim)
embedding_matrix_pca = pcaModel.fit_transform(embedding_matrix)

# Replace every token index by its PCA-reduced vector:
# (samples, maxLen) indices -> (samples, maxLen, newEmbeddingDim) vectors.
train_seqs = embedding_matrix_pca[train_seqs]

lstmModel = Sequential([
  LSTM(newEmbeddingDim),   # a Bidirectional(LSTM(newEmbeddingDim)) variant was also considered
  Dropout(0.5),
  Dense(1, activation='sigmoid'),
])
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
history = lstmModel.fit(train_seqs, train_tags, epochs=30)

In [0]:
# Evaluate the network that consumes word vectors directly.
# A fresh tokenizer is fitted on the test texts: indices here only serve to look up
# each word's FastText vector, so they need not match the training indices.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_text)
test_seqs = pad_sequences(tokenizer.texts_to_sequences(test_text), maxlen=maxLen, padding='post')

embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embeddingDim))
for word, index in tokenizer.word_index.items():
  try:
    vector = vecModel.wv[word]
  except KeyError:
    continue
  embedding_matrix[index] = vector

# Reuse the PCA projection fitted in the training cell, then swap indices for vectors.
embedding_matrix_pca = pcaModel.transform(embedding_matrix)
test_seqs = embedding_matrix_pca[test_seqs]

preds = lstmModel.predict(test_seqs)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels (in place).
preds[preds <= 0.5] = 0
preds[preds > 0.5] = 1
np.savetxt('resultsLSTM.txt', preds, fmt='%d')
files.download('resultsLSTM.txt')

In [25]:
# Download the UDPipe 1.2.0 binary release, unpack it into the working directory,
# and delete the archive afterwards.
! wget https://github.com/ufal/udpipe/releases/download/v1.2.0/udpipe-1.2.0-bin.zip
! unzip udpipe-1.2.0-bin.zip
! rm udpipe-1.2.0-bin.zip

--2019-06-02 14:24:27--  https://github.com/ufal/udpipe/releases/download/v1.2.0/udpipe-1.2.0-bin.zip
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/50672597/a24cacd8-77c6-11e7-8f6e-e9de8ca37f48?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20190602%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20190602T142427Z&X-Amz-Expires=300&X-Amz-Signature=d1da91a8dfc84f93118a9b192770e980d8745bb916d18d2a2a4f249085b15919&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dudpipe-1.2.0-bin.zip&response-content-type=application%2Foctet-stream [following]
--2019-06-02 14:24:27--  https://github-production-release-asset-2e65be.s3.amazonaws.com/50672597/a24cacd8-77c6-11e7-8f6e-e9de8ca37f48?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIA

In [0]:
import os

# Directory with the Linux UDPipe binaries unpacked from the release archive.
# It is made absolute so the `udpipe` command keeps resolving after a change of
# working directory.
udpipe_bin_dir = os.path.abspath('udpipe-1.2.0-bin/bin-linux64')

# Append it to PATH only when it is not there yet, so re-running this cell does
# not pile up duplicate entries. A missing PATH variable is treated as empty.
if udpipe_bin_dir not in os.environ.get('PATH', '').split(os.pathsep):
  os.environ['PATH'] = os.environ.get('PATH', '') + os.pathsep + udpipe_bin_dir

In [27]:
# Sanity check: calling udpipe without arguments prints its usage text,
# which confirms that the binary is reachable through PATH.
! udpipe

Usage: udpipe [running_opts] model_file [input_files]
       udpipe --train [training_opts] model_file [input_files]
       udpipe --detokenize [detokenize_opts] raw_text_file [input_files]
Running opts: --accuracy (measure accuracy only)
              --input=[conllu|generic_tokenizer|horizontal|vertical]
              --immediate (process sentences immediately during loading)
              --outfile=output file template
              --output=[conllu|epe|matxin|horizontal|plaintext|vertical]
              --tokenize (perform tokenization)
              --tokenizer=tokenizer options, implies --tokenize
              --tag (perform tagging)
              --tagger=tagger options, implies --tag
              --parse (perform parsing)
              --parser=parser options, implies --parse
Training opts: --method=[morphodita_parsito] which method to use
               --heldout=heldout data file name
               --tokenizer=tokenizer options
               --tagger=tagger options
      

In [28]:
! wget http://mozart.ipipan.waw.pl/~alina/Polish_dependency_parsing_models/190423_PDBUD_ttp_embedd.udpipe

--2019-06-02 14:30:05--  http://mozart.ipipan.waw.pl/~alina/Polish_dependency_parsing_models/190423_PDBUD_ttp_embedd.udpipe
Resolving mozart.ipipan.waw.pl (mozart.ipipan.waw.pl)... 213.135.36.148
Connecting to mozart.ipipan.waw.pl (mozart.ipipan.waw.pl)|213.135.36.148|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53636990 (51M)
Saving to: ‘190423_PDBUD_ttp_embedd.udpipe.1’


2019-06-02 14:30:10 (11.6 MB/s) - ‘190423_PDBUD_ttp_embedd.udpipe.1’ saved [53636990/53636990]



In [18]:
# Tokenize the raw training texts with the UDPipe model; the result is written to train_tokenised.conllu.
! udpipe --tokenize --outfile=train_tokenised.conllu 190423_PDBUD_ttp_embedd.udpipe training_set_clean_only_text.txt

Loading UDPipe model: done.


In [21]:
# Tag and parse the tokenized training data; the result is written to train_udpipe_parsed.conllu.
! udpipe --tag --parse --outfile=train_udpipe_parsed.conllu 190423_PDBUD_ttp_embedd.udpipe train_tokenised.conllu

Loading UDPipe model: done.


In [22]:
# Tokenize the raw test texts with the UDPipe model; the result is written to test_tokenised.conllu.
! udpipe --tokenize --outfile=test_tokenised.conllu 190423_PDBUD_ttp_embedd.udpipe test_set_clean_only_text.txt

Loading UDPipe model: done.


In [23]:
# Tag and parse the tokenized test data; the result is written to test_udpipe_parsed.conllu.
! udpipe --tag --parse --outfile=test_udpipe_parsed.conllu 190423_PDBUD_ttp_embedd.udpipe test_tokenised.conllu

Loading UDPipe model: done.


In [0]:
# Read the UDPipe output for both sets as raw lines.
# The encoding is pinned to UTF-8 because the parsed files contain Polish word forms
# and the default encoding of open() depends on the platform locale.
with open('train_udpipe_parsed.conllu', 'r', encoding='utf-8') as f:
  train_parsed = f.readlines()
with open('test_udpipe_parsed.conllu', 'r', encoding='utf-8') as f:
  test_parsed = f.readlines()

In [0]:
 #funkcja lematyzująca na podstawie banku drzew

# Lookup table built from the parsed lines: second tab-separated column -> third column
# (presumably word form -> lemma, following the CoNLL-U column order ID, FORM, LEMMA).
lemmas={}

# Training lines first, then test lines; a later occurrence of a form overwrites an earlier one.
for el in train_parsed + test_parsed:
  cols = el.split("\t")
  # lines with fewer than three tab-separated columns carry no form/lemma pair
  if len(cols) >= 3:
    lemmas[cols[1]] = cols[2]

def lemma(w):
  """Return the lemma stored for word form ``w`` in the module-level ``lemmas`` table.

  When the table holds only the placeholder '_' for ``w``, a trailing personal
  ending ('em', 'eś', 'm', 'ś', 'śmy', 'ście') is stripped and the lemma of the
  remaining form is returned instead. Forms without such an ending are looked
  up unchanged, so the result may still be '_'.

  Args:
    w: word form to look up.

  Returns:
    The lemma string from ``lemmas``.

  Raises:
    KeyError: if ``w``, or the form left after stripping the ending, is not a
      key of ``lemmas``.
  """
  if lemmas[w] != '_':
    return lemmas[w]
  # The order of the checks matters: two-letter endings are tried before one-letter ones.
  if w.endswith(('em', 'eś')):
    third_pers = w[:-2]
  elif w.endswith(('m', 'ś')):
    third_pers = w[:-1]
  elif w.endswith('śmy'):
    third_pers = w[:-3]
  elif w.endswith('ście'):
    # 'ście' is four characters long; cutting only three left a stray 'ś' behind
    third_pers = w[:-4]
  else:
    third_pers = w
  return lemmas[third_pers]