In [77]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import numpy as np
import re
import string
from nltk import PorterStemmer, RegexpTokenizer
from sklearn.model_selection import train_test_split
import itertools

In [2]:
with open('dataset/training_set_clean_only_text.txt', 'r', encoding="utf8") as text_file:
    text_lines = text_file.readlines()
with open('dataset/training_set_clean_only_tags.txt', 'r') as tags_file:
    tags = tags_file.readlines()

In [3]:
wv_from_text = KeyedVectors.load_word2vec_format(datapath(r"C:\Users\Anna Marciniec\dataset\nkjp-forms-all-300-skipg-hs.txt.gz"), binary=False)

In [4]:
def clean_sentence(sentence):
    cleaned =  ' '.join(re.sub("(@[A-Za-z0-9]_+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+) |@anonymized_account ", " ", sentence).split())
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    return cleaned.lower()

In [5]:
regexp_tokenizer = RegexpTokenizer(r'\w+')

In [51]:
def clean_tweet(tweet):
    cleaned_tweet = clean_sentence(tweet)
    new_tweet_list = []
    tokenized =  regexp_tokenizer.tokenize(cleaned_tweet.lower())
    for word in tokenized:
        if word in wv_from_text.vocab:
            new_tweet_list.append(word)
            continue
        if word.capitalize() in wv_from_text.vocab:
            new_tweet_list.append(word.capitalize())
            continue
        removed = word[:-2]
        if removed in wv_from_text.vocab:
            new_tweet_list.append(removed)
            continue
    return ' '.join(new_tweet_list)

In [52]:
cleaned_tweets = []

In [53]:
for tweet in text_lines:
    new_tweet = clean_tweet(tweet)
    cleaned_tweets.append(new_tweet)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(cleaned_tweets, tags, test_size=0.25)

In [104]:
y_train = list(map(int, y_train))
y_test = list(map(int, y_test))

In [56]:
from keras.preprocessing.text import Tokenizer


In [57]:
token = Tokenizer()
token.fit_on_texts(cleaned_tweets)
word_index = token.word_index


In [58]:
from keras.preprocessing import sequence

In [59]:
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(X_test), maxlen=70)


In [86]:
lexicon = set(itertools.chain(*list(map(lambda x :x.split(' ') ,cleaned_tweets))))

In [87]:
embedding_matrix = np.zeros((len(lexicon) + 1, 300))


In [88]:
for word, i in word_index.items():
    if word in wv_from_text.vocab:
        embedding_vector = wv_from_text.get_vector(word)
    else:
        embedding_vector = wv_from_text.get_vector(word.capitalize())

    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [89]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.094872  ,  0.135253  ,  0.013441  , ..., -0.048842  ,
         0.000602  , -0.105519  ],
       [-0.06228   , -0.001151  , -0.024933  , ...,  0.052002  ,
         0.066079  ,  0.089732  ],
       ...,
       [-0.011147  ,  0.022036  ,  0.22927   , ..., -0.30707601,
         0.082493  , -0.092688  ],
       [ 0.263311  ,  0.092519  , -0.120999  , ...,  0.32495001,
        -0.145742  , -0.33339599],
       [-0.030147  , -0.15119299, -0.20597699, ...,  0.09103   ,
        -0.238083  , -0.052658  ]])

In [108]:
import keras
from keras import layers, models, optimizers, metrics
from sklearn import metrics

In [124]:
def create_cnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, y_train, valid_seq_x, is_neural_net=True)
print ("CNN, Word Embeddings",  accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[0.8445803]]
CNN, Word Embeddings 0.9199522102747909


In [123]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label, epochs = 10)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    print(classifier.predict(ex_1))
    return metrics.accuracy_score(predictions, y_test)

In [201]:
example  = ['Do końca świata i o jeden dzień dłużej!', 'ty małoskowy, podły bezduszny wstrętny pisiorski katolu siema']

Sources : https://twitter.com/KingaBezKorony/status/1216447484169543682, https://twitter.com/RobertBiedron/status/1216358527222108161

In [204]:
ex_1 = sequence.pad_sequences(token.texts_to_sequences(example), maxlen=70)

In [205]:
classifier.predict(ex_1)

array([[1.8819788e-05],
       [7.2009170e-01]], dtype=float32)