In [None]:
import pandas as pd
import requests
from nltk.corpus import stopwords
import nltk
import numpy as np
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import Input
from sklearn.preprocessing import LabelEncoder
import collections
import matplotlib.pyplot as plt

# paczki do modelowania
from keras import models
from keras import layers
from keras import regularizers



In [None]:
# Helper functions
def get_closest(x, embeddings, topn=3):
    """
    Return the keys of the embeddings nearest to ``x`` by Euclidean distance.

    Parameters
    ----------
    x : np.ndarray
        Query vector; it must be broadcastable against the stacked
        embedding matrix (i.e. have the embedding dimension D).
    embeddings : dict
        Mapping from a key (e.g. a word) to its embedding vector.
    topn : int, optional
        Number of nearest keys to return (default 3).

    Returns
    -------
    list
        Up to ``topn`` keys of ``embeddings``, ordered from the smallest
        distance to the largest. An empty mapping yields an empty list.
    """
    # Materialise the keys once; rebuilding the list for every returned index
    # would cost a full pass over the vocabulary each time.
    keys = list(embeddings)
    if not keys:
        return []
    # Stack all embeddings into one V x D matrix (V = vocabulary size,
    # D = embedding dimension).
    embedding_matrix = np.array(list(embeddings.values()))
    # Broadcasting subtracts x from every row; the row-wise norm is the
    # Euclidean distance to each stored embedding.
    distances = np.linalg.norm(x - embedding_matrix, axis=1)
    # Indices of the topn smallest distances, nearest first.
    min_idx = np.argsort(distances)[:topn]
    return [keys[i] for i in min_idx]

In [None]:
# Download the NLTK stopword list and load the airline-tweets CSV from GitHub
nltk.download('stopwords')
url = "https://raw.githubusercontent.com/ashutoshmakone/Twitter-US-Airline-Sentiment-classification/main/Dataset/Tweets.csv"
df = pd.read_csv(url, sep=',')

In [None]:
NB_WORDS = 10000  # Maximum number of words kept in the vocabulary (the most frequent ones)
NB_START_EPOCHS = 10  # Number of epochs used for training
BATCH_SIZE = 512  # Mini-batch size
MAX_LEN = 24  # Maximum sequence length
GLOVE_DIM = 300  # Dimensionality of the GloVe embedding

In [None]:
def deep_model(model, X_train, y_train, X_valid, y_valid):
    """
    Compile a multi-class classifier and fit it while tracking a validation set.

    The model is compiled with the RMSprop optimizer, categorical
    cross-entropy loss and the accuracy metric. The number of epochs and the
    batch size come from the notebook constants NB_START_EPOCHS and BATCH_SIZE.

    Parameters:
        model : model with the chosen architecture (compiled here)
        X_train : training features
        y_train : training target
        X_valid : validation features
        y_valid : validation target
    Output:
        the history object returned by model.fit
    """
    compile_options = {
        'optimizer': 'rmsprop',
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    model.compile(**compile_options)

    # The validation pair is only evaluated after each epoch, not trained on.
    return model.fit(
        X_train,
        y_train,
        epochs=NB_START_EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_valid, y_valid),
        verbose=2,
    )


def eval_metric(history, metric_name):
    '''
    Plot a training metric and its validation counterpart per epoch.

    Parameters:
        history : model training history; its .history dict must contain
                  both metric_name and 'val_' + metric_name
        metric_name : key of the metric to plot, e.g. 'loss' or 'accuracy'
    Output:
        None; shows a line chart with epochs on the x-axis and the metric
        on the y-axis (training as dots, validation as a line)
    '''
    metric = history.history[metric_name]
    val_metric = history.history['val_' + metric_name]

    # Take the epoch count from the recorded values rather than from
    # NB_START_EPOCHS, so a history produced by a different number of epochs
    # still plots instead of failing on mismatched x/y lengths.
    e = range(1, len(metric) + 1)

    plt.plot(e, metric, 'bo', label='Train ' + metric_name)
    plt.plot(e, val_metric, 'b', label='Validation ' + metric_name)
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

def test_model(model, X_train, y_train, X_test, y_test, epoch_stop):
    """
    Fit the model for a given number of epochs, then evaluate it on test data.

    fit() is called on the model object exactly as passed in, so a model
    that has already been trained continues from its current weights.
    The batch size comes from the notebook constant BATCH_SIZE.

    Parameters:
        model : compiled model to fit and evaluate
        X_train : training features
        y_train : training target
        X_test : test features
        y_test : test target
        epoch_stop : number of epochs to run in this fit call
    Output:
        whatever model.evaluate returns for the test data (callers in this
        notebook read index 1 as the accuracy)
    """
    fit_options = dict(epochs=epoch_stop, batch_size=BATCH_SIZE, verbose=2)
    model.fit(X_train, y_train, **fit_options)
    return model.evaluate(X_test, y_test)

def remove_stopwords(input_text):
    '''
    Remove English stopwords and one-character tokens from a single string.

    The text is split on whitespace. A token is dropped when it is one
    character long, or when its lowercased form is an NLTK English stopword
    that is not whitelisted. Matching is case-insensitive because the
    tokenizer used later lowercases the text anyway; kept tokens retain
    their original casing.

    Parameters:
        input_text : text (str) to clean
    Output:
        the kept tokens joined by single spaces
    '''
    # Words which might indicate a certain sentiment are kept via a whitelist
    whitelist = {"n't", "not", "no"}
    # A set gives constant-time membership tests; the raw NLTK list would be
    # scanned once per word.
    blocked = set(stopwords.words('english')) - whitelist
    clean_words = [word for word in input_text.split()
                   if len(word) > 1 and word.lower() not in blocked]
    return " ".join(clean_words)
    
def remove_mentions(input_text):
    """
    Strip @-mentions from a single text string.

    A mention is an '@' immediately followed by one or more word characters.
    Only the mention itself is deleted; surrounding whitespace is left as is.

    Parameters:
        input_text : text (str) to clean
    Output:
        the text with every mention removed
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', input_text)

In [None]:
# df.head()

In [None]:
# Shuffle the rows with a fixed seed: an unseeded shuffle changes the row order
# on every run, which would make a seeded train/test split irreproducible.
df = df.reindex(np.random.RandomState(37).permutation(df.index))
# Keep only the tweet text and the sentiment label; copy so the column
# assignment below writes to an independent frame.
df = df[['text', 'airline_sentiment']].copy()
# Strip mentions first: remove_stopwords re-joins the tokens with single spaces,
# which also cleans up the gaps left where mentions were deleted.
df['text'] = df['text'].apply(remove_mentions).apply(remove_stopwords)

In [None]:
# Train/test split: 10% of the rows are held out for testing, random seed 37
X_train, X_test, y_train, y_test = train_test_split(
    df.text, df.airline_sentiment, test_size=0.1, random_state=37)
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])
# Cap the number of training samples because of Colab resource limits
# (not needed when running locally)
IDX = 1200
# .iloc makes the positional slice explicit; the index holds shuffled row labels
X_train = X_train.iloc[:IDX]
y_train = y_train.iloc[:IDX]
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

In [None]:
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")
# Fit the tokenizer vocabulary on the training text only, so nothing from the
# test set leaks into the word index
tk.fit_on_texts(X_train)

# Apply the fitted tokenizer to the training and test text, turning each text
# into a sequence of word indices
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [None]:
# Token count per training text. split() without an argument ignores leading
# and repeated whitespace, so no empty strings are counted as tokens.
seq_lengths = X_train.apply(lambda text: len(text.split()))
seq_lengths.describe()

In [None]:
# Pad (or truncate) the training and test sequences to exactly MAX_LEN entries
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [None]:
# X_test_seq_trunc[10]


In [None]:
# Encode the sentiment labels as integers and build the target vectors y
le = LabelEncoder()
# Fit the encoder on the training labels only, then apply the same mapping to
# the test labels
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
# One-hot (categorical) targets; the class count is pinned so the training and
# test matrices always have the same width
n_classes = len(le.classes_)
y_train_oh = to_categorical(y_train_le, num_classes=n_classes)
y_test_oh = to_categorical(y_test_le, num_classes=n_classes)

In [None]:
# Sanity check: shape of the training text collection
X_train.shape

## One Hot embeddings

In [None]:
from sklearn.preprocessing import OneHotEncoder  # not used by any cell visible in this notebook

# One-hot encode every padded index sequence: each word index is expanded into
# a vector of NB_WORDS entries, giving one matrix per text.
result_train = [to_categorical(seq, num_classes=NB_WORDS) for seq in X_train_seq_trunc]
result_test = [to_categorical(seq, num_classes=NB_WORDS) for seq in X_test_seq_trunc]

In [None]:
# Stack the per-text one-hot matrices into single arrays for Keras
X_train_oh = np.array(result_train)
X_test_oh = np.array(result_test)
# Display the resulting shape of the training array
X_train_oh.shape

In [None]:
oh_model = models.Sequential()
# Input layer: one (MAX_LEN, NB_WORDS) one-hot matrix per text
oh_model.add(Input(shape=(MAX_LEN, NB_WORDS)))
# Flatten the matrix into a single vector for the dense layer
oh_model.add(layers.Flatten())
# Dense output with one neuron per sentiment class (width of the one-hot
# targets) and softmax, giving a probability for each class
oh_model.add(layers.Dense(y_train_oh.shape[1], activation='softmax'))

oh_model.summary()

In [None]:
# Train the one-hot model; the test arrays are passed as the validation set
oh_history = deep_model(oh_model, X_train_oh, y_train_oh, X_test_oh, y_test_oh)


In [None]:
# Training accuracy recorded for the last epoch
oh_history.history['accuracy'][-1]

In [None]:
# Fit the same one-hot model for 3 more epochs and evaluate it on the test set
oh_results = test_model(oh_model, X_train_oh, y_train_oh, X_test_oh, y_test_oh, 3)
# '\n' (not '/n') is the newline escape; this separates the Keras log from the summary line
print('\n')
print('Test accuracy of one-hot model: {0:.2f}%'.format(oh_results[1]*100))

In [None]:
# Training vs. validation loss per epoch for the one-hot model
eval_metric(oh_history, 'loss')

In [None]:
# Training vs. validation accuracy per epoch for the one-hot model
eval_metric(oh_history, 'accuracy')

## Glove embeddings

In [None]:
# #Glove 6B

!curl -OL http://nlp.stanford.edu/data/glove.6B.zip -o glove.6B.zip
# #wget
# # !wget http://nlp.stanford.edu/data/glove.6B.zip


In [None]:
# !unzip -o glove.6B.zip
!unzip -o /content/glove.6B.zip

In [None]:
# Parse the GloVe text file into {word: float32 vector}. The file name is
# derived from GLOVE_DIM so the loaded vectors always match the embedding
# matrix width used later.
glove_embeddings = {}
# The encoding is explicit: the platform default is not UTF-8 everywhere
with open('glove.6B.{}d.txt'.format(GLOVE_DIM), encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..."; split once and reuse the pieces
        parts = line.split()
        if not parts:
            continue  # tolerate blank lines
        glove_embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')

In [None]:
airline_words = ['airplane', 'airline', 'flight', 'luggage', 'djfhaskdjfasdf']
# Report which probe words have a GloVe vector; words without one print nothing
for probe in airline_words:
    if probe not in glove_embeddings:
        continue
    print('Found the word {} in the dictionary'.format(probe))

In [None]:
# Nearest GloVe neighbours of 'airplane' (default topn=3)
get_closest(glove_embeddings['airplane'], glove_embeddings)

In [None]:
# Build the lookup matrix between words and their vectors: row i holds the
# GloVe vector of the word whose tokenizer index is i
emb_matrix = np.zeros((NB_WORDS, GLOVE_DIM))

for word, idx in tk.word_index.items():
    # word_index has an entry for every word of the training data, so stop at
    # the first index outside the vocabulary cap.
    # NOTE(review): the early exit relies on word_index being ordered by
    # ascending index; confirm for the Tokenizer version in use.
    if idx >= NB_WORDS:
        break
    glove_vector = glove_embeddings.get(word)
    # Words that do not occur in GloVe keep their all-zero row
    if glove_vector is not None:
        emb_matrix[idx] = glove_vector

In [None]:
glove_model = models.Sequential()
# Embedding layer: NB_WORDS rows of GLOVE_DIM values, fed with sequences of
# MAX_LEN word indices
glove_model.add(layers.Embedding(NB_WORDS, GLOVE_DIM, input_length=MAX_LEN))
# Flatten the (MAX_LEN, GLOVE_DIM) output into a single vector
glove_model.add(layers.Flatten())
# Dense output with one neuron per sentiment class (width of the one-hot
# targets) and softmax
glove_model.add(layers.Dense(y_train_oh.shape[1], activation='softmax'))

glove_model.summary()

In [None]:
# NOTE - the layer holding the GloVe embeddings must be frozen so that
# training cannot change it!
# Load the pre-built GloVe matrix as the weights of the first layer
glove_model.layers[0].set_weights([emb_matrix])
glove_model.layers[0].trainable = False
# Print the summary again after freezing the layer
glove_model.summary()

In [None]:
# Train the GloVe model on the padded index sequences; the test arrays are
# passed as the validation set
glove_history = deep_model(glove_model, X_train_seq_trunc, y_train_oh, X_test_seq_trunc, y_test_oh)
# Training accuracy recorded for the last epoch
glove_history.history['accuracy'][-1]

In [None]:
# Fit the same GloVe model for 3 more epochs and evaluate it on the test set
glove_results = test_model(glove_model, X_train_seq_trunc, y_train_oh, X_test_seq_trunc, y_test_oh, 3)
# '\n' (not '/n') is the newline escape; this separates the Keras log from the summary line
print('\n')
print('Test accuracy of word glove model: {0:.2f}%'.format(glove_results[1]*100))

In [None]:
# Training vs. validation loss per epoch for the GloVe model
eval_metric(glove_history, 'loss')

In [None]:
# Training vs. validation accuracy per epoch for the GloVe model
eval_metric(glove_history, 'accuracy')

## Extra section: Trainable Embeddings

In [None]:
emb_model = models.Sequential()
# Trainable embedding layer: NB_WORDS rows of 8 values each, fed with
# sequences of MAX_LEN word indices
emb_model.add(layers.Embedding(NB_WORDS, 8, input_length=MAX_LEN))
# Flatten the (MAX_LEN, 8) output into a single vector
emb_model.add(layers.Flatten())
# Dense output with one neuron per sentiment class (width of the one-hot
# targets) and softmax
emb_model.add(layers.Dense(y_train_oh.shape[1], activation='softmax'))

emb_model.summary()

In [None]:
# Train the trainable-embedding model on the padded index sequences; the test
# arrays are passed as the validation set
emb_history = deep_model(emb_model, X_train_seq_trunc, y_train_oh, X_test_seq_trunc, y_test_oh)
# Training accuracy recorded for the last epoch
emb_history.history['accuracy'][-1]

In [None]:
# Fit the same trainable-embedding model for 3 more epochs and evaluate it on the test set
emb_results = test_model(emb_model, X_train_seq_trunc, y_train_oh, X_test_seq_trunc, y_test_oh, 3)
# '\n' (not '/n') is the newline escape; this separates the Keras log from the summary line
print('\n')
print('Test accuracy of trainable embedding model: {0:.2f}%'.format(emb_results[1]*100))

In [None]:
# Training vs. validation loss per epoch for the trainable-embedding model
eval_metric(emb_history, 'loss')

In [None]:
# Training vs. validation accuracy per epoch for the trainable-embedding model
eval_metric(emb_history, 'accuracy')