# Easy

In [69]:
import pandas as pd
import numpy as np
import re

In [70]:
data = pd.read_csv("../singapore_airlines_reviews.csv")
data.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,Desktop,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [71]:
# Соединим между собой заголовок и текст отзывов, чтобы обработать всю информацию
data['comment'] = data['title'] + ' ' + data['text']

# Удалим ненужные нам столбцы
data = data.drop(columns=['published_date', 'published_platform', 'type', 'title', 'text', 'helpful_votes'])

data.head()

Unnamed: 0,rating,comment
0,3,Ok We used this airline to go from Singapore t...
1,5,The service in Suites Class makes one feel lik...
2,1,"Don’t give them your money Booked, paid and re..."
3,5,Best Airline in the World Best airline in the ...
4,2,Premium Economy Seating on Singapore Airlines ...


In [72]:
# Будем делить отзывы по рейтингу на положительные (4-5) и отрицательные (1-3)
data['mark'] = data['rating'].apply(lambda x: 0 if x <= 3 else 1)
data = data.drop(columns='rating')
data.head()

Unnamed: 0,comment,mark
0,Ok We used this airline to go from Singapore t...,0
1,The service in Suites Class makes one feel lik...,1
2,"Don’t give them your money Booked, paid and re...",0
3,Best Airline in the World Best airline in the ...,1
4,Premium Economy Seating on Singapore Airlines ...,0


In [73]:
# Уберем из текста пунктуацию и приведём к нижнему регистру
def preprocess_text(text):
    text = re.sub(r"[^\w\s]", '', text.lower())
    return text

data['comment'] = data['comment'].apply(preprocess_text)
data.head()

Unnamed: 0,comment,mark
0,ok we used this airline to go from singapore t...,0
1,the service in suites class makes one feel lik...,1
2,dont give them your money booked paid and rece...,0
3,best airline in the world best airline in the ...,1
4,premium economy seating on singapore airlines ...,0


In [74]:
!pip install nltk



In [75]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

def preprocess_sentence(text):
    return ' '.join([stemmer.stem(word) for word in preprocess_text(text).split()])

data['comment'] = data['comment'].apply(preprocess_sentence)
data.head()

Unnamed: 0,comment,mark
0,ok we use thi airlin to go from singapor to lo...,0
1,the servic in suit class make one feel like vi...,1
2,dont give them your money book paid and receiv...,0
3,best airlin in the world best airlin in the wo...,1
4,premium economi seat on singapor airlin not wo...,0


In [76]:
# Воспользуемся BOW
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
train, test = train_test_split(data)
bow = CountVectorizer()
train_x = bow.fit_transform(train['comment']).toarray()
test_x = bow.transform(test['comment']).toarray()

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
model = LogisticRegression()
train_y = train['mark']
test_y = test['mark']
model.fit(train_x, train_y)
test_pred = model.predict(test_x)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
f1_met = f1_score(test_pred, test_y)
print(f"f1_score = {f1_met}")

f1_score = 0.9415566927013197


# Medium

In [79]:
# Удалим стоп-слова
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def delete_stop_words(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

data['comment'] = data['comment'].apply(delete_stop_words)
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timurabdulkadirov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,comment,mark
0,ok use thi airlin go singapor london heathrow ...,0
1,servic suit class make one feel like vip servi...,1
2,dont give money book paid receiv email confirm...,0
3,best airlin world best airlin world seat food ...,1
4,premium economi seat singapor airlin worth mon...,0


In [80]:
train, test = train_test_split(data)
bow = CountVectorizer()
train_x = bow.fit_transform(train['comment']).toarray()
test_x = bow.transform(test['comment']).toarray()
train_y = train['mark']
test_y = test['mark']

# Попробуем несколько разных моделей:

In [81]:
model_lr = LogisticRegression()
model_lr.fit(train_x, train_y)
test_pred = model_lr.predict(test_x)
f1_met = f1_score(test_pred, test_y)
print(f"f1_score = {f1_met}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


f1_score = 0.9389721627408993


In [82]:
from sklearn.tree import DecisionTreeClassifier
model_dtc = DecisionTreeClassifier()
model_dtc.fit(train_x, train_y)
test_pred = model_dtc.predict(test_x)
f1_met = f1_score(test_pred, test_y)
print(f"f1_score = {f1_met}")

f1_score = 0.8812834224598931


In [83]:
from sklearn.svm import SVC
model_svm = SVC(max_iter=50)
model_svm.fit(train_x, train_y)
test_pred = model_svm.predict(test_x)
f1_met = f1_score(test_pred, test_y)
print(f"f1_score = {f1_met}")



f1_score = 0.8427773343974463


In [84]:
# Лучше всего предсказывает LogisticRegression, протестируем её на моих собственных отзывах

bad_review = 'I used the services of this airline last year and was disappointed by such unacceptable behavior of the staff'
good_review = 'When choosing this airline, I initially treated it with distrust, but later I was pleasantly surprised by the good service and the absence of accidents'


def test_predictions(text):
    text = preprocess_text(text)
    text = preprocess_sentence(text)
    text = delete_stop_words(text)
    df = pd.DataFrame()
    df['comment'] = [text]
    bow_2 = CountVectorizer()
    train_x = bow_2.fit_transform(train['comment']).toarray()
    test_x = bow_2.transform(df['comment']).toarray()
    test_pred = model_lr.predict(test_x)
    return 'good comment' if test_pred[0] else 'bad comment'

print(test_predictions(good_review))
print(test_predictions(bad_review)) 

good comment
bad comment
