In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [20]:
df=pd.read_csv("IMDB Dataset.csv")

In [21]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [104]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [22]:
from nltk import word_tokenize

In [23]:
from nltk.corpus import stopwords
import string
import re
stop_words=set(stopwords.words('english'))

In [24]:

def clean_text(text):
    text=str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', ' ', text) 
    tokens = word_tokenize(str(text).lower(), language='english') 
    tokens = [word for word in tokens if word not in stop_words ]
    return tokens


In [25]:
%%time
df['review']=df['review'].apply(clean_text)

CPU times: user 23.7 s, sys: 137 ms, total: 23.9 s
Wall time: 23.9 s


In [26]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['sentiment'])

In [27]:
df_train

Unnamed: 0,review,sentiment
47808,"[caught, little, gem, totally, accident, back,...",positive
20154,"[believe, let, movie, accomplish, favor, frien...",negative
43069,"[spoiler, alert, gets, nerve, people, remake, ...",negative
19413,"[one, thing, learnt, watching, george, romero,...",negative
13673,"[remember, theaters, reviews, said, horrible, ...",negative
...,...,...
31092,"[man, named, walt, disney, mission, satisfy, f...",positive
22917,"[first, time, saw, shades, sneakpreview, even,...",negative
47481,"[waste, time, danger, watch, tempted, tear, dv...",negative
35597,"[far, pathetic, movie, indian, cinema, cinema,...",negative


In [28]:
from gensim.models import Word2Vec

In [29]:

model=Word2Vec(sentences=df_train['review'],vector_size=150, window=5, min_count=5, sg=1,negative=10,              
    epochs=10,
    workers=10)

In [31]:
%time

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.2 µs


In [32]:
print(model)

Word2Vec<vocab=35582, vector_size=150, alpha=0.025>


In [33]:
vector = model.wv['bad']   
print(vector)


[ 0.08061524 -0.10841177  0.04403988 -0.0519916   0.14087656  0.06184833
 -0.24752639  0.26383737 -0.11852185  0.34366798 -0.05538724 -0.09480063
  0.01151957  0.21424372  0.00629685 -0.12441132 -0.20403633 -0.05215696
  0.22864334  0.09753177 -0.10274705 -0.2399808  -0.01001993  0.18332914
 -0.02238299  0.0977257  -0.13734534 -0.1922795   0.45375434 -0.16030376
  0.00164344 -0.17975797 -0.31491736 -0.24374908 -0.39661503 -0.3776931
  0.4699513   0.16188343  0.26698908 -0.25087214 -0.20298553 -0.1170556
 -0.30032513 -0.2508713  -0.14116827  0.08885399 -0.15304635  0.22894646
  0.06510173  0.18533343  0.01730995  0.21190093 -0.47146156  0.04427177
 -0.24490424  0.00508768 -0.1573345   0.29002944 -0.20856445 -0.4389199
  0.00357996  0.12701607 -0.00384624  0.04798952  0.00963961 -0.22681303
  0.08799466  0.02418166 -0.10387127  0.17224848  0.10285519  0.26784584
  0.20115513 -0.5575286   0.20655006 -0.05913524 -0.3402288   0.1262
 -0.05128352  0.06775261  0.09565359 -0.08181308  0.131252

In [37]:
similar = model.wv.most_similar('cat', topn=10)
print(similar)

[('fraidy', 0.6252961754798889), ('mouse', 0.6228405237197876), ('meow', 0.603987991809845), ('ev', 0.6021479368209839), ('kittens', 0.5708582997322083), ('dog', 0.5674090385437012), ('pyewacket', 0.5596545934677124), ('skunk', 0.5587746500968933), ('malley', 0.5562594532966614), ('himalayan', 0.5372576117515564)]


In [38]:
max_len = 100
vector_size = 150
X = []


In [39]:
def tokens_to_vectors(tokens, model, max_len, vector_size):
    seq = []
    for word in tokens[:max_len]:
        if word in model.wv:
            seq.append(model.wv[word])
        else:
            seq.append(np.zeros(vector_size))
    while len(seq) < max_len:
        seq.append(np.zeros(vector_size))
    return seq

In [42]:
X_train = np.array([tokens_to_vectors(tokens, model, max_len, vector_size) for tokens in df_train['review']])
X_test  = np.array([tokens_to_vectors(tokens, model, max_len, vector_size) for tokens in df_test['review']])


In [46]:
y_train

47808    1
20154    0
43069    0
19413    0
13673    0
        ..
31092    1
22917    0
47481    0
35597    0
27491    0
Name: sentiment, Length: 40000, dtype: int64

In [60]:
y_train=df_train['sentiment'].apply(lambda x: 1 if x=='positive' else 0)

In [61]:
y_test=df_test['sentiment'].apply(lambda x: 1 if x=='positive' else 0)

In [62]:
y_train

47808    1
20154    0
43069    0
19413    0
13673    0
        ..
31092    1
22917    0
47481    0
35597    0
27491    0
Name: sentiment, Length: 40000, dtype: int64

In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [56]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [64]:
X_train_tensor = torch.tensor(X_train,dtype=torch.float32)
X_test_tensor = torch.tensor(X_test,dtype=torch.float32)
y_train_tensor=torch.tensor(y_train.values,dtype=torch.long)
y_test_tensor=torch.tensor(y_test.values,dtype=torch.long)

In [65]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [99]:
class SentimentLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.5):
        super(SentimentLSTM, self).__init__()
        self.lstm=nn.LSTM(input_size,hidden_size,num_layers,batch_first=True,dropout=dropout)
        self.fc=nn.Linear(hidden_size,num_classes)
        self.dropout=nn.Dropout(dropout)
    def forward(self, x):
        out, (h_n, c_n) = self.lstm(x)
        out = h_n[-1]
        out = self.dropout(out)
        out = self.fc(out)
  # сюда пойдут "чистые" логиты
        return out

In [107]:
input_size = vector_size     
hidden_size = 128         
num_layers = 2               
num_classes = 2              
learning_rate = 0.001
num_epochs = 10

In [108]:
model2 = SentimentLSTM(input_size, hidden_size, num_layers, num_classes).to(device)


In [109]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=learning_rate)

In [110]:
for epoch in range(num_epochs):
    model2.train()
    total_loss = 0
    for X_batch,y_batch in train_loader:
        optimizer.zero_grad()
        outputs=model2(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward() 
        optimizer.step() 
        total_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

Epoch [1/10], Loss: 0.6890
Epoch [2/10], Loss: 0.6906
Epoch [3/10], Loss: 0.6783
Epoch [4/10], Loss: 0.4090
Epoch [5/10], Loss: 0.3296
Epoch [6/10], Loss: 0.3182
Epoch [7/10], Loss: 0.3078
Epoch [8/10], Loss: 0.3004
Epoch [9/10], Loss: 0.2903
Epoch [10/10], Loss: 0.2848


In [111]:
model2.eval()
correct = 0
total = 0

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model2(X_batch)
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 88.18%


In [1]:
text = "worst news "
text=clean_text(text)
X_input = tokens_to_vectors(text,model, max_len, vector_size)  # model — это твой Word2Vec
X_input = torch.tensor(X_input, dtype=torch.float32)
X_input = X_input.unsqueeze(0)



model2.eval()
with torch.no_grad():
    outputs = model2(X_input)
    _, predicted = torch.max(outputs, 1)
    print("Предсказанный класс:", predicted.item())

NameError: name 'clean_text' is not defined