In [None]:
import numpy as np
import pandas as pd

import collections
import re
import nltk

from tensorflow.keras.layers import Dense, SpatialDropout1D, Conv1D, Embedding, GlobalMaxPooling1D, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Tab-separated file with no header row; the two columns are named here.
# quoting=3 is csv.QUOTE_NONE: quote characters inside comments stay literal.
train_df = pd.read_table(
    '/content/UMICH_SI650_train_data.txt',
    sep="\t",
    header=None,
    names=['sentiment', 'comment'],
    quoting=3,
)

In [None]:
# Preview the first rows of the loaded frame (columns: sentiment, comment).
train_df.head()

Unnamed: 0,sentiment,comment
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [None]:
from gensim.models import Word2Vec

# NLTK resources: 'punkt' backs word_tokenize, 'stopwords' backs the stop-word list.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

INPUT_FILE = '/content/UMICH_SI650_train_data.txt'

maxlen = 0                       # token count of the longest cleaned sentence
counter = collections.Counter()  # corpus-wide token frequencies
sentences = []                   # one token list per input line, in file order

stop_words = set(nltk.corpus.stopwords.words('english'))

with open(file=INPUT_FILE, mode='r', encoding='utf-8') as fin:
    for line in fin:
        # Each line is "<label>\t<sentence>"; the label is not used in this pass.
        label, sentence = line.strip().split('\t')
        # Replace every character that is not an ASCII letter with a space.
        sentence = re.sub("[^a-zA-Z]", " ", sentence)
        words = [token.lower() for token in nltk.word_tokenize(sentence)
                 if token.lower() not in stop_words]

        sentences.append(words)
        maxlen = max(maxlen, len(words))  # track the longest sentence
        counter.update(words)

# Word2Vec expects an iterable of token lists. Passing a single token list makes
# gensim treat every word as a "sentence" of characters, so train on all sentences.
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
# Kept for backward compatibility: this cell used to expose the Word2Vec instance as `model`.
model = w2v_model

# One row per sentence: the mean of its word vectors. A sentence that is empty
# after stop-word removal has nothing to average and gets a zero vector, which
# also keeps the resulting array rectangular.
embeddings = np.array([
    np.mean([w2v_model.wv[word] for word in tokens], axis=0) if tokens
    else np.zeros(w2v_model.wv.vector_size, dtype=np.float32)
    for tokens in sentences
])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['harry', 'potter', 'dragged', 'draco', 'malfoy', 'trousers', 'past', 'hips', 'sucked', 'throat', 'vigor', 'making', 'whimpering', 'noises', 'panting', 'groaning', 'around', 'blonds', 'rock', 'hard', 'aching', 'cock']


In [None]:
# Upper bound on vocabulary entries, taken from the row count of train_df
# (7086 in the recorded run).
VOCAB_SIZE = len(train_df)

# defaultdict(int): looking up a word that is not in the vocabulary yields 0
# (and inserts that word with value 0).
word2index = collections.defaultdict(int)

# Most frequent words first; ids start at 1.
word2index.update(
    (token, rank)
    for rank, (token, _freq) in enumerate(counter.most_common(VOCAB_SIZE), start=1)
)

# One more than the number of indexed words, accounting for id 0.
vocab_sz = 1 + len(word2index)

# Reverse lookup: id -> word.
index2word = dict(zip(word2index.values(), word2index.keys()))

print(word2index)
print(index2word)

defaultdict(<class 'int'>, {'harry': 1, 'potter': 2, 'code': 3, 'da': 4, 'vinci': 5, 'brokeback': 6, 'mountain': 7, 'love': 8, 'awesome': 9, 'mission': 10, 'impossible': 11, 'like': 12, 'movie': 13, 'sucks': 14, 'sucked': 15, 'hate': 16, 'much': 17, 'really': 18, 'movies': 19, 'stupid': 20, 'one': 21, 'know': 22, 'suck': 23, 'loved': 24, 'want': 25, 'right': 26, 'think': 27, 'depressing': 28, 'people': 29, 'would': 30, 'reading': 31, 'horrible': 32, 'series': 33, 'fucking': 34, 'terrible': 35, 'yeah': 36, 'story': 37, 'oh': 38, 'left': 39, 'ok': 40, 'guy': 41, 'start': 42, 'felicia': 43, 'book': 44, 'beautiful': 45, 'good': 46, 'also': 47, 'went': 48, 'read': 49, 'saw': 50, 'first': 51, 'tom': 52, 'thought': 53, 'liked': 54, 'way': 55, 'absolutely': 56, 'well': 57, 'still': 58, 'time': 59, 'got': 60, 'big': 61, 'film': 62, 'heard': 63, 'ever': 64, 'better': 65, 'watch': 66, 'going': 67, 'great': 68, 'seen': 69, 'things': 70, 'said': 71, 'gay': 72, 'last': 73, 'boring': 74, 'watching': 

In [None]:
# xs: one list of word ids per line; ys: the matching integer labels.
xs, ys = [], []

with open(file=INPUT_FILE, mode='r', encoding='utf-8') as fin:
    stop_words = set(nltk.corpus.stopwords.words('english'))
    for line in fin:
        # Each line is "<label>\t<sentence>".
        label, sentence = line.strip().split('\t')

        # Keep ASCII letters only, then tokenize, lowercase and drop stop words.
        sentence = re.sub("[^a-zA-Z]", " ", sentence)
        lowered = (token.lower() for token in nltk.word_tokenize(sentence))
        words = [token for token in lowered if token not in stop_words]

        # Map each remaining word to its integer id.
        wids = [word2index[word] for word in words]

        ys.append(int(label))
        xs.append(wids)

In [None]:
# One-hot encode the integer labels.
y = to_categorical(ys)

# Turn the list of id sequences into a 2-D matrix; every row has length maxlen.
X = pad_sequences(sequences=xs, maxlen=maxlen)

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# (rows, columns) of the padded training matrix.
X_train.shape

(5668, 22)

In [None]:
# Display the padded id matrix; the zeros at the start of each row are padding.
X_train

array([[   0,    0,    0, ...,   24,    6,    7],
       [   0,    0,    0, ...,   14,   40,  181],
       [   0,    0,    0, ..., 1679,  456, 1680],
       ...,
       [   0,    0,    0, ...,    2,    8,   19],
       [   0,    0,    0, ...,  186,   20,  119],
       [   0,    0,    0, ...,    5,    3,   62]], dtype=int32)

In [None]:
EMBED_SIZE  = 100   # dimensionality of the learned word embeddings
NUM_FILTERS = 256   # number of convolution filters
NUM_WORDS   = 3     # convolution window, in tokens

# Embedding -> dropout over embedding channels -> 1-D convolution ->
# max over the sequence axis -> 2-way softmax.
model = Sequential([
    Embedding(input_dim=vocab_sz, output_dim=EMBED_SIZE, input_length=maxlen),
    SpatialDropout1D(0.1),
    Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(2, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 22, 100)           199400    
                                                                 
 spatial_dropout1d (Spatial  (None, 22, 100)           0         
 Dropout1D)                                                      
                                                                 
 conv1d (Conv1D)             (None, 22, 256)           77056     
                                                                 
 global_max_pooling1d (Glob  (None, 256)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 2)                 514       
                                                                 
Total params: 276970 (1.06 MB)
Trainable params: 276970 

In [None]:
# Train on the training split for 100 epochs.
model.fit(x=X_train, y=y_train, epochs=100)

Epoch 1/100
 15/178 [=>............................] - ETA: 1:16 - loss: 0.6416 - accuracy: 0.5688

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Model outputs for the held-out split: one row of class scores per sample.
y_pred = model.predict(X_test)

# Reduce one-hot targets and predicted score rows to class indices.
y_test_new = y_test.argmax(axis=1)
y_pred_new = y_pred.argmax(axis=1)

accuracy = accuracy_score(y_true=y_test_new, y_pred=y_pred_new)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision/recall/F1, followed by the raw confusion counts.
print(classification_report(y_true=y_test_new, y_pred=y_pred_new))
print(confusion_matrix(y_true=y_test_new, y_pred=y_pred_new))