In [50]:
import pandas as pd
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Пользователь\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [51]:
df = pd.read_csv('DisneylandReviews.csv')
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [4]:
df.columns

Index(['Review_ID', 'Rating', 'Year_Month', 'Reviewer_Location', 'Review_Text',
       'Branch'],
      dtype='object')

In [5]:
df.rename(columns={'Review_Text': 'review'}, inplace=True)
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,review,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [20]:
positive_words = ['good', 'great', 'amazing', 'recommend', 'creative', 'fine', 'comfortable', 'pretty', 
                 'as well', 'fantastically', 'terrific', 'love', 'like', 'easy', 'winner', 'perfect', 'nice',
                 'flawlessly', 'happy', 'fast', 'marvel', 'powerful', 'solve', 'bright', 'surprisingly',
                 'excellent', 'outstanding', 'wonderful', 'superb', 'delightful', 'success', 'impressive', 
                 'brilliant', 'charming', 'genius', 'pleasure', 'vibrant', 'thrilling', 'exceptional', 'joyful',
                 'breathtaking', 'captivating', 'sweet', 'magical', 'gorgeous', 'exquisite', 'remarkable']

In [23]:
#разметим на основании экранных правил
def get_sentiment_by_rule(review):
    for word in positive_words:
        if word in review:
            return 'Positive'
    return 'Negative'

In [25]:
#сохраним файл по правилам
labeled_data = df.sample(frac=0.2)
labeled_data['Sentiment'] = labeled_data['review'].apply(get_sentiment_by_rule)
labeled_data.to_csv('DisneylandRule.csv')

In [67]:
#Объединим три датасета
data_1 = pd.read_csv('DisneylandRule.csv')
data_2 = pd.read_csv('LabelStudio.csv')
columns_to_drop = ["Year_Month","annotation_id","annotator","created_at","id","lead_time","updated_at"]
data_2 = data_2.drop(columns=columns_to_drop, axis=1, inplace=True)
data = pd.concat([data_1, data_2])

In [68]:
data.head()

Unnamed: 0.1,Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,review,Branch,Sentiment
0,26911,127669887,3,2012-4,United States,Our bi annual family trip to Disneyland notice...,Disneyland_California,Negative
1,37126,245217045,4,2014-12,United Kingdom,"If you're inti Disney, this is the place to be...",Disneyland_Paris,Negative
2,15345,381269158,3,2015-11,United States,There is no place like Disneyland especially d...,Disneyland_California,Positive
3,32573,458998868,4,2017-2,United Kingdom,Two families we are taking a trip in the winte...,Disneyland_Paris,Negative
4,41063,132540624,5,2011-8,United Kingdom,"Excellent park, busy during summer, lots to do...",Disneyland_Paris,Positive


In [71]:
#проведем обучение модели
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['Sentiment'], test_size=0.25, random_state=42)

In [72]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

In [73]:
pipeline.fit(X_train, y_train)

In [74]:
predictions = pipeline.predict(X_test)

In [75]:
accuracy = accuracy_score(y_test, predictions)
print(f'Точность модели равна {accuracy}')

Точность модели равна 0.8471636193155181


In [76]:
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

    Negative       0.93      0.24      0.38       417
    Positive       0.84      1.00      0.91      1716

    accuracy                           0.85      2133
   macro avg       0.89      0.62      0.64      2133
weighted avg       0.86      0.85      0.81      2133

