<a href="https://colab.research.google.com/github/MichalKucko/NLP/blob/master/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr  
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional
from gensim.models import FastText
from gensim.utils import tokenize
from sklearn.decomposition import PCA
from sklearn import svm
import matplotlib.pyplot as plt
from google.colab import files

Using TensorFlow backend.


In [0]:
# Upload the data files through google.colab's `files.upload()`.
# Later cells open the text/tag files by name, so they are expected to end up in the working directory.
uploaded = files.upload()

In [0]:
# Download and unpack the FastText vectors (cc.pl.300).
# NOTE(review): both commands run unconditionally, so re-running this cell repeats the
# multi-gigabyte download — consider guarding on the presence of the unpacked file.
!curl -o cc.pl.300.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz
!gunzip cc.pl.300.bin.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4294M  100 4294M    0     0  39.1M      0  0:01:49  0:01:49 --:--:-- 17.7M


In [0]:
# Load the data: every line of a text file is one sample (trailing newlines are kept),
# and the tags file holds one integer class label per training sample.
# The encoding is passed explicitly so that the Polish diacritics are decoded the same
# way regardless of the platform's default locale.
# NOTE(review): assumes the data files are UTF-8 encoded — confirm against the data source.
with open('training_set_clean_only_text.txt', 'r', encoding='utf-8') as f:
  train_text = f.readlines()
with open('test_set_clean_only_text.txt', 'r', encoding='utf-8') as f:
  test_text = f.readlines()
train_tags = np.loadtxt('training_set_clean_only_tags.txt', dtype=int)

In [0]:
# Baseline: draw a class for every test sample according to the class frequencies
# observed in the training set.
freq0 = len(train_tags[train_tags==0])/len(train_tags)
print('Odsetek próbek klasy 0:', freq0)
# A dedicated, seeded generator makes the baseline submission reproducible across
# kernel restarts without touching NumPy's global random state.
baselineRng = np.random.RandomState(0)
baselineDraws = baselineRng.uniform(size=len(test_text))
# One output line per test sample: '0' with probability freq0, otherwise '1'.
out = ['0\n' if draw < freq0 else '1\n' for draw in baselineDraws]
with open('resultsBaseline.txt', 'w') as f:
  f.writelines(out)
files.download('resultsBaseline.txt')

Odsetek próbek klasy 0: 0.915247485310228


In [0]:
# Load the FastText word-vector model.
# NOTE(review): the download cell unpacks a file named 'cc.pl.300.bin', while the path given
# here has no extension — presumably gensim resolves the '.bin' suffix itself; confirm for
# the installed gensim version.
vecModel = FastText.load_fasttext_format('cc.pl.300')
# NOTE(review): "zmieniłam" ("I changed [it]") looks like a leftover debugging marker.
print("zmieniłam")

zmieniłam


In [0]:
# Turn sentences into vectors (mean of the word vectors).
def text2Vectors(text, vecLen = 300):
  """Embed every sentence as the average of its word vectors.

  Each sentence is tokenized with `tokenize(..., lowercase=True)` (per the original
  author's note this drops numbers and punctuation); the placeholder token
  'anonymized_account' is ignored. Every remaining token is looked up in the
  notebook-level `vecModel.wv`.

  Args:
    text: sequence of sentences (strings).
    vecLen: length of a word vector; presumably has to match the vectors
      returned by `vecModel.wv` (300 for the model loaded in this notebook).

  Returns:
    A tuple `(sentVecs, errs)`:
      sentVecs: array of shape (len(text), vecLen); a row stays all-zero when
        no token of that sentence could be looked up.
      errs: list of the tokens whose lookup raised KeyError, in encounter order.
  """
  missing = []
  result = np.zeros((len(text), vecLen))
  for row, sentence in enumerate(text):
    total = np.zeros(vecLen)
    found = 0
    for word in tokenize(sentence, lowercase=True):
      if word == 'anonymized_account':
        continue
      try:
        wordVec = vecModel.wv[word]   # map the word to its vector
      except KeyError:
        missing.append(word)
      else:
        total += wordVec
        found += 1
    # Rows without any usable token keep their initial zeros.
    if found:
      result[row] = total / found
  return result, missing

# Sentence vectors for the training and the test set.
# Note: `errs` is overwritten by the second call, so afterwards it only holds the
# tokens that could not be looked up for the test set.
sentVecsTrain, errs = text2Vectors(train_text)
#sentVecsTrain, errs = text2Vectors(test_text)
sentVecsTest, errs = text2Vectors(test_text)
# Last expression of the cell: displays the test-set array.
sentVecsTest

array([[-0.04223325,  0.02780653, -0.03056183, ..., -0.03986559,
         0.01298455, -0.02628536],
       [-0.05724153,  0.04636027,  0.00597388, ..., -0.02015102,
         0.03325379,  0.01044905],
       [-0.02283935,  0.01472753,  0.00771505, ..., -0.0156786 ,
         0.04276353,  0.00952874],
       ...,
       [-0.01292548, -0.02146565,  0.00168017, ...,  0.04769662,
        -0.01327721, -0.00341588],
       [ 0.00940409, -0.00143855, -0.01415663, ...,  0.03220999,
         0.00415251,  0.01734362],
       [-0.01103416,  0.01442292,  0.02303123, ...,  0.01034418,
         0.05392634, -0.05866718]])

In [0]:
# PCA on the sentence vectors: reduce them to newVecLen dimensions.
newVecLen = 50
# TODO: check whether t-SNE works better than PCA here.
# random_state pins the randomized SVD solver that scikit-learn may select automatically
# for inputs of this size, so the projected features are identical between runs.
pcaModel = PCA(n_components=newVecLen, random_state=0)
pcaSentVecsTrain = pcaModel.fit_transform(sentVecsTrain)
# The test set is projected with the components fitted on the training set (no refit).
pcaSentVecsTest = pcaModel.transform(sentVecsTest)

In [0]:
# Bag-of-words features: the vectorizer is fitted on the training text and then
# reused, without refitting, to encode the test text.
vectorizer = CountVectorizer()
bags_train = vectorizer.fit_transform(train_text)
bags_test = vectorizer.transform(test_text)

In [0]:
# SVM on the sentence vectors (alternatives: PCA-reduced vectors or bag-of-words).
# The model is created and fitted in this cell, so the prediction below does not depend
# on an `svmModel` left in the kernel by another cell (on a fresh kernel the name would
# be undefined at this point).
# class_weight sets how much weight each class receives during training.
svmModel = svm.SVC(class_weight = {0:0.1, 1:0.9}, C = 1000, gamma='auto')
svmModel.fit(sentVecsTrain, train_tags)
# To try another feature set, switch the fit line AND the matching predict line together:
#svmModel.fit(pcaSentVecsTrain, train_tags)
#svmModel.fit(bags_train, train_tags)
preds = svmModel.predict(sentVecsTest)
#preds = svmModel.predict(pcaSentVecsTest)
#preds = svmModel.predict(bags_test)
np.savetxt('resultsSVM.txt', preds, fmt='%d')
files.download('resultsSVM.txt')

In [0]:
# For every sample, return how many of its words are outside the vocabulary.
def getMistakeCnts(text):
  """Count out-of-vocabulary words per sentence.

  Each sentence is tokenized with `tokenize(..., lowercase=True)` (per the original
  author's note this drops numbers). The placeholder token 'anonymized_account'
  is never counted; every other token that is not a key of the notebook-level
  `vecModel.wv.vocab` counts as one mistake.

  Args:
    text: sequence of sentences (strings).

  Returns:
    Float array of length len(text) holding the count for each sentence.
  """
  counts = np.zeros(len(text))
  for idx, sentence in enumerate(text):
    counts[idx] = sum(
      1
      for word in tokenize(sentence, lowercase=True)
      if word != 'anonymized_account' and word not in vecModel.wv.vocab
    )
  return counts
  
# Out-of-vocabulary word counts for both data sets.
mistakesVec_train = getMistakeCnts(train_text)
mistakesVec_test = getMistakeCnts(test_text)

# Number of training sentences that contain at least one such word.
sentencesWithMistakes = sum(mistakesVec_train != 0)
print(sentencesWithMistakes, '/', len(train_text), 'zdań z błędami na zbiorze uczącym')

# Pearson correlation between the counts and the labels
# (the original author's reading of the result: essentially no correlation).
mistakeTagCorrelation = pearsonr(mistakesVec_train, train_tags)
print(mistakeTagCorrelation)

2954 / 10041 zdań z błędami na zbiorze uczącym
(0.01720474005221367, 0.08472403401794718)


In [0]:
# SVM trained on the out-of-vocabulary word counts alone.
# The counts are reshaped into a single-column matrix (one feature per sample).
mistakesTrainColumn = mistakesVec_train.reshape(-1, 1)
mistakesTestColumn = mistakesVec_test.reshape(-1, 1)

svmModel = svm.SVC(C = 1000, gamma = 'auto', class_weight = {0: 0.1, 1: 0.9})
svmModel.fit(mistakesTrainColumn, train_tags)
preds = svmModel.predict(mistakesTestColumn)

# Same output file name as the SVM cell that works on sentence vectors.
np.savetxt('resultsSVM.txt', preds, fmt='%d')
files.download('resultsSVM.txt')

In [0]:
# LSTM network that learns the word vectors by itself.

maxLen = 40   # maximum sentence length
embeddingDim = 50   # length of the word vectors

# Map words to integer ids and bring every sentence to exactly maxLen ids
# (padding is appended at the end of short sentences).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
train_seqs = pad_sequences(tokenizer.texts_to_sequences(train_text), padding='post', maxlen=maxLen)

# One embedding row per tokenizer index, plus one extra row
# (presumably for the padding id 0 — confirm against the Keras Tokenizer docs).
vocabSize = len(tokenizer.word_index) + 1

lstmModel = Sequential([
  Embedding(vocabSize, embeddingDim, input_length=maxLen),   # learns word vectors from the integer ids
  LSTM(embeddingDim),   # the recurrent network proper
  Dropout(0.5),   # drops half of the outputs so the network does not overfit; other rates could be tried
  Dense(1, activation='sigmoid'),   # single sigmoid output
])
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
lstmModel.summary()
history = lstmModel.fit(train_seqs, train_tags, epochs=30)

In [0]:
# LSTM network with pre-trained word vectors (FastText), reduced with PCA and kept frozen.

maxLen = 40   # maximum sentence length
embeddingDim = 300   # length of the vectors (as in FastText)
newEmbeddingDim = 50    # length of the vectors after PCA

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
train_seqs = pad_sequences(tokenizer.texts_to_sequences(train_text), padding='post', maxlen=maxLen)

# One row per tokenizer index; a row stays zero when the word's lookup raises KeyError.
vocabSize = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocabSize, embeddingDim))
for word, index in tokenizer.word_index.items():
  try:
    vector = vecModel.wv[word]
  except KeyError:
    continue
  embedding_matrix[index] = vector

# Reduce the vectors to newEmbeddingDim dimensions.
# NOTE(review): the saved summary output of this cell shows a 300-wide embedding, so it
# seemingly comes from a run that used embedding_matrix directly instead of the PCA result.
pcaModel = PCA(n_components=newEmbeddingDim)
embedding_matrix_pca = pcaModel.fit_transform(embedding_matrix)

lstmModel = Sequential([
  # Embedding weights are initialised from the PCA-reduced matrix and not trained.
  Embedding(vocabSize, newEmbeddingDim, input_length=maxLen, weights=[embedding_matrix_pca], trainable=False),
  LSTM(newEmbeddingDim),
  Dropout(0.5),
  Dense(1, activation='sigmoid'),
])
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
lstmModel.summary()
history = lstmModel.fit(train_seqs, train_tags, epochs=10)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 40, 300)           7020900   
_________________________________________________________________
lstm_3 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dropout_3 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 301       
Total params: 7,742,401
Trainable params: 721,501
Non-trainable params: 7,020,900
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
# Evaluate the network on the test set (per the original note, words new to the tokenizer are dropped).
test_seqs = tokenizer.texts_to_sequences(test_text)
test_seqs = pad_sequences(test_seqs, padding='post', maxlen=maxLen)
preds = lstmModel.predict(test_seqs)
# Threshold the outputs in place at 0.5. Values already set to 1 by the first line
# are not matched by the second mask, so the two steps do not interfere.
preds[preds > 0.5] = 1
preds[preds <= 0.5] = 0
np.savetxt('resultsLSTM.txt', preds, fmt='%d')
files.download('resultsLSTM.txt')

In [0]:
# LSTM network fed directly with FastText vectors (no Embedding layer); the test cell
# builds the vectors for the test part from FastText in the same way.

maxLen = 40   # maximum sentence length
embeddingDim = 300   # length of the vectors (as in FastText)
newEmbeddingDim = 50    # length of the vectors after PCA

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
train_seqs = pad_sequences(tokenizer.texts_to_sequences(train_text), padding='post', maxlen=maxLen)

# One row per tokenizer index; a row stays zero when the word's lookup raises KeyError.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embeddingDim))
for word, index in tokenizer.word_index.items():
  try:
    vector = vecModel.wv[word]
  except KeyError:
    continue
  embedding_matrix[index] = vector

# pcaModel is fitted here; the test cell reuses it with transform().
pcaModel = PCA(n_components=newEmbeddingDim)
embedding_matrix_pca = pcaModel.fit_transform(embedding_matrix)

# Replace every word id by its PCA-reduced vector:
# (samples, maxLen) ids -> (samples, maxLen, newEmbeddingDim) vectors.
train_seqs = embedding_matrix_pca[train_seqs]

# Alternative first layer tried by the author: Bidirectional(LSTM(newEmbeddingDim)).
lstmModel = Sequential([
  LSTM(newEmbeddingDim),
  Dropout(0.5),
  Dense(1, activation='sigmoid'),
])
lstmModel.compile(optimizer='adam', loss='binary_crossentropy')
# Class 1 is weighted more heavily than class 0 during training.
history = lstmModel.fit(train_seqs, train_tags, epochs=30, class_weight={0: 0.1, 1: 0.9})

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [0]:
# Evaluate the network on the test set.

# A new tokenizer is fitted on the test text; its indices are only used as row numbers
# of the test-specific embedding matrix built below.
# NOTE(review): this rebinds the notebook-level `tokenizer` — re-running an earlier cell that
# expects the train-fitted tokenizer after this one would silently use the test-fitted one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_text)
test_seqs = tokenizer.texts_to_sequences(test_text)
test_seqs = pad_sequences(test_seqs, padding='post', maxlen=maxLen)

# One row per tokenizer index; a row stays zero when the word's lookup raises KeyError.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embeddingDim))
for word, index in tokenizer.word_index.items():
  try:
    vector = vecModel.wv[word]
    embedding_matrix[index] = vector
  except KeyError:
    continue
    
# Project with the already fitted pcaModel (transform only, no refit), then replace
# every word id by its reduced vector.
embedding_matrix_pca = pcaModel.transform(embedding_matrix)
test_seqs = np.array([embedding_matrix_pca[seq,] for seq in test_seqs])

preds = lstmModel.predict(test_seqs)
# Threshold the outputs in place at 0.5. Values already set to 1 by the first line
# are not matched by the second mask, so the two steps do not interfere.
preds[preds > 0.5] = 1
preds[preds <= 0.5] = 0
np.savetxt('resultsLSTM.txt', preds, fmt='%d')
files.download('resultsLSTM.txt')