In [1]:
# Импортируем необходимые библиотеки
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [2]:
# Загружаем датасет
df = pd.read_csv("tripadvisor_hotel_reviews.csv")

In [3]:
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [4]:
#Добавить разбиение данных на положительные и отрицательные
df.loc[(df.Rating <= 3), 'Rating'] = 0
df.loc[(df.Rating > 3), 'Rating'] = 1

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Misha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:

# Предобработка текстов
# Удаляем пустые строки
df.dropna(inplace=True)
# Приводим тексты к нижнему регистру
df["Review"] = df["Review"].str.lower()
# Удаляем пунктуацию
df["Review"] = df["Review"].str.replace("[^\w\s]", "")
# Удаляем стоп-слова
stop_words = stopwords.words("english")
df["Review"] = df["Review"].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

  df["Review"] = df["Review"].str.replace("[^\w\s]", "")


In [7]:
stemmer = SnowballStemmer("english")
df["Review"] = df["Review"].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))

In [8]:
df

Unnamed: 0,Review,Rating
0,nice hotel expens park got good deal stay hote...,1
1,ok noth special charg diamond member hilton de...,0
2,nice room 4 experi hotel monaco seattl good ho...,0
3,uniqu great stay wonder time hotel monaco loca...,1
4,great stay great stay went seahawk game awesom...,1
...,...,...
20486,best kept secret 3rd time stay charm 5star ca ...,1
20487,great locat price view hotel great quick place...,1
20488,ok look nice modern outsid desk staff nt parti...,0
20489,hotel theft ruin vacat hotel open sept 17 2007...,0


In [9]:
# Векторизация текстов с помощью TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Review"])
y = df["Rating"]

In [10]:
from sklearn.model_selection import train_test_split
# Разбиваем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.model_selection import cross_val_score
#Обучаем модель логистической регрессии
model = LogisticRegression(max_iter=30,n_jobs=-1)
scores = cross_val_score(model, X, y, cv=5)

In [12]:
model.fit(X_train, y_train)

In [13]:
# Предсказываем рейтинги для тестовых данных
y_pred = model.predict(X_test)

In [14]:
# Оцениваем качество модели с помощью метрик и выводим результаты на экран
print("Accuracy:", np.mean(scores))
print("Precision:", np.mean(precision_score(y_test, y_pred, average="macro")))
print("Recall:", np.mean(recall_score(y_test, y_pred, average="macro")))
print("F1-score:", np.mean(f1_score(y_test, y_pred, average="macro")))
#НОВОЕ!!! ОТ 14.05

Accuracy: 0.8940510315041903
Precision: 0.8908621537929345
Recall: 0.8345367628352856
F1-score: 0.8573146284220767


In [15]:
# Оцениваем качество модели с помощью метрик
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="macro"))
print("Recall:", recall_score(y_test, y_pred, average="macro"))
print("F1-score:", f1_score(y_test, y_pred, average="macro"))

Accuracy: 0.8982678702122469
Precision: 0.8908621537929345
Recall: 0.8345367628352856
F1-score: 0.8573146284220767
