In [16]:
import os

import spacy
import pandas as pd
import numpy as np
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Location of the IMDB reviews CSV. The default is the original local path; set the
# IMDB_CSV_PATH environment variable to run the notebook on another machine without
# editing this cell.
DATA_PATH = os.environ.get('IMDB_CSV_PATH', r'C:\Users\Admin\ML.COURSE\IMDB Dataset.csv')

# Load the reviews into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Print the column names, non-null counts and dtypes of the loaded DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
# Load the spaCy "en_core_web_md" pipeline; `nlp` is used later to lemmatize the reviews.
nlp = spacy.load("en_core_web_md")

In [6]:

# Replace every review with the space-joined lemmas of its tokens.
# The texts are streamed through nlp.pipe (batched processing) instead of calling
# nlp(text) once per row, and the dependency parser and NER are skipped because only
# token lemmas are read here.
# NOTE(review): the raw reviews still contain HTML such as "<br />"; consider stripping
# it before lemmatization so it does not end up in the TF-IDF vocabulary.
df['lemmatized_text'] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(df['review'], disable=['parser', 'ner'])
]



In [11]:
# Hold out 20% of the lemmatized reviews for evaluation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['lemmatized_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)


In [14]:
# TF-IDF features capped at 5000 terms. The vocabulary and IDF weights are learned
# from the training reviews only; the test reviews are merely transformed.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf.transform(raw_documents=X_test)


In [17]:
# Linear-kernel support vector classifier trained on the TF-IDF training matrix.
model = SVC(
    kernel='linear',
    C=1.0,
    random_state=7,
)
model.fit(X=X_train_tfidf, y=y_train)


SVC(kernel='linear', random_state=7)

In [18]:
# Predict labels for the held-out reviews and report the fraction classified correctly.
y_pred = model.predict(X=X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8924


In [19]:
# Per-class precision, recall and F1 on the held-out reviews.
# accuracy_score / classification_report are already imported in the notebook's
# import cell, so they are not re-imported here.
# target_names follows the sorted label order ('negative' before 'positive').
class_report = classification_report(y_test, y_pred, target_names=['negative', 'positive'])

# The report was previously computed but never shown; print it so the cell has output.
print(class_report)

#### Перевіримо класифікатор

In [20]:
# Two hand-written reviews used as a quick sanity check of the trained classifier.
example_reviews = [
    "This movie was fantastic! I loved every minute of it.",
    "Terrible movie, I wouldn't recommend it to anyone."
]

# The vectorizer was fitted on spaCy-lemmatized text, so new reviews must go through
# the same lemmatization before being vectorized; feeding raw text would present the
# model with tokens in a different form than it was trained on.
example_reviews_lemmatized = [
    ' '.join(token.lemma_ for token in nlp(text))
    for text in example_reviews
]

example_reviews_tfidf = tfidf.transform(example_reviews_lemmatized)

predicted_sentiments = model.predict(example_reviews_tfidf)

In [21]:
# Show each example review next to its predicted label, separated by a blank line.
for review, sentiment in zip(example_reviews, predicted_sentiments):
    print(f'Review: "{review}"')
    # Any label other than 'positive' is reported as negative.
    print('Sentiment: Positive' if sentiment == 'positive' else 'Sentiment: Negative')
    print()

Review: "This movie was fantastic! I loved every minute of it."
Sentiment: Positive

Review: "Terrible movie, I wouldn't recommend it to anyone."
Sentiment: Negative

