In [45]:
# %pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [46]:
import pandas as pd
import tensorflow as tf
import re
import string

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#### Get the data

In [47]:
df = pd.read_csv('threads_reviews.csv')

In [48]:
df.shape

(32910, 4)

In [49]:
df.head()

Unnamed: 0,source,review_description,rating,review_date
0,Google Play,Meh. Not the greatest experience on a Chromebo...,2,2023-07-08 14:18:24
1,Google Play,Pretty good for a first launch!! Its easy to u...,3,2023-07-19 20:52:48
2,Google Play,"For a brand new app, it's very well optimized....",3,2023-07-06 23:03:11
3,Google Play,"Great app with a lot of potential! However, th...",3,2023-07-10 00:53:25
4,Google Play,"The app is good, but it needs a lot of functio...",3,2023-07-06 16:57:43


#### Preposition Data

In [50]:
Data = df[['source', 'review_description']]

In [51]:
Data.shape

(32910, 2)

In [52]:
Data.columns

Index(['source', 'review_description'], dtype='object')

In [53]:
Data.head()

Unnamed: 0,source,review_description
0,Google Play,Meh. Not the greatest experience on a Chromebo...
1,Google Play,Pretty good for a first launch!! Its easy to u...
2,Google Play,"For a brand new app, it's very well optimized...."
3,Google Play,"Great app with a lot of potential! However, th..."
4,Google Play,"The app is good, but it needs a lot of functio..."


In [54]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32910 entries, 0 to 32909
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   source              32910 non-null  object
 1   review_description  32910 non-null  object
dtypes: object(2)
memory usage: 514.3+ KB


In [55]:
Data["source"] = Data["source"].replace({'Google Play':0, 'App Store':1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Data["source"] = Data["source"].replace({'Google Play':0, 'App Store':1})


In [56]:
Data.head()

Unnamed: 0,source,review_description
0,0,Meh. Not the greatest experience on a Chromebo...
1,0,Pretty good for a first launch!! Its easy to u...
2,0,"For a brand new app, it's very well optimized...."
3,0,"Great app with a lot of potential! However, th..."
4,0,"The app is good, but it needs a lot of functio..."


In [57]:
Data = Data[['source','review_description']]
print(Data['review_description'][0])

Meh. Not the greatest experience on a Chromebook. Seems to be customized for phones only. Opens in a little screen that you can't expand or resize - for reasons that are a complete mystery to me. Judging from the fact that every other app I know of is resizeable, this seems like it was a conscious choice by the developers . Why you'd do something like this is beyond understanding and suggests a control freak approach. Not a great way to make a first impression.


#### Tokenization

In [58]:
words = word_tokenize(Data['review_description'][0])
words

['Meh',
 '.',
 'Not',
 'the',
 'greatest',
 'experience',
 'on',
 'a',
 'Chromebook',
 '.',
 'Seems',
 'to',
 'be',
 'customized',
 'for',
 'phones',
 'only',
 '.',
 'Opens',
 'in',
 'a',
 'little',
 'screen',
 'that',
 'you',
 'ca',
 "n't",
 'expand',
 'or',
 'resize',
 '-',
 'for',
 'reasons',
 'that',
 'are',
 'a',
 'complete',
 'mystery',
 'to',
 'me',
 '.',
 'Judging',
 'from',
 'the',
 'fact',
 'that',
 'every',
 'other',
 'app',
 'I',
 'know',
 'of',
 'is',
 'resizeable',
 ',',
 'this',
 'seems',
 'like',
 'it',
 'was',
 'a',
 'conscious',
 'choice',
 'by',
 'the',
 'developers',
 '.',
 'Why',
 'you',
 "'d",
 'do',
 'something',
 'like',
 'this',
 'is',
 'beyond',
 'understanding',
 'and',
 'suggests',
 'a',
 'control',
 'freak',
 'approach',
 '.',
 'Not',
 'a',
 'great',
 'way',
 'to',
 'make',
 'a',
 'first',
 'impression',
 '.']

#### Stop words filtration & remove punctuations

In [59]:
def clean_text(text): 
    text = str(text) # Конвертація вхідного тексту у строку
    text = text.lower() #Перетворення тексту у нижній регістр
    text = re.sub("\d", " ", text) #Видалення всіх цифр з тексту та заміна їх на пробіли
    text = re.sub("@\S+", " ", text) #Видалення слів, які починаються з символу "@" та заміна їх на пробіли
    text = re.sub("https*\S+", " ", text) #Видалення посилань та заміна на пробіли
    text = re.sub("#\S+", " ", text) #Видалення слів, які починаються з символу "#"
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text) #Видалення всіх знаків пунктуації
    text = re.sub('\n', ' ', text) #Видалення символів нового рядка
    text = re.sub('\s{2,}',' ', text)  #Заміна двох або більше пробілів на один
    stop_words = stopwords.words("english") #Створення списку стоп-слів для англ. мови за допомогою бібліотеки nltk
    text = ' '.join([word for word in text.split(' ') if word not in stop_words]) #Розбиття тексту на слова, видалення стоп-слів і з'єднання залишених слів назад в текст
    return text

In [60]:
Data['review_description'] = [clean_text(review) for review in Data['review_description']]
Data

Unnamed: 0,source,review_description
0,0,meh greatest experience chromebook seems custo...
1,0,pretty good first launch easy use self explana...
2,0,brand new app well optimized however missing q...
3,0,great app lot potential however lot needs fixe...
4,0,app good needs lot functionality example searc...
...,...,...
32905,1,killed dog mark zuckerburg strangled dog gone
32906,1,add search hashtag like twitter
32907,1,bad twister
32908,1,yet another trash meta


### RNN (classification of texts (with which I worked in laboratory No. 2) using a recurrent neural network)

In [61]:
X_train, X_test, y_train, y_test = train_test_split(Data['review_description'], Data['source'], test_size=0.30)

In [62]:
vocab_size = 10000
oov_token = ""
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

In [63]:
X_train_text_sequences = tokenizer.texts_to_sequences(X_train)
X_test_text_sequences = tokenizer.texts_to_sequences(X_test)

In [64]:
max_length = 100
padding_type = "post"
trunction_type="post"
X_train = keras.utils.pad_sequences(X_train_text_sequences,maxlen=max_length, padding=padding_type, truncating=trunction_type)
X_test = keras.utils.pad_sequences(X_test_text_sequences,maxlen=max_length, padding=padding_type, truncating=trunction_type)

In [65]:
X_train.shape

(23037, 100)

In [66]:
emb_dim = 128

model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, emb_dim, input_length = max_length))
model.add(LSTM(64))
model.add(Dense(1, activation='selu'))

2024-01-05 00:13:26.947956: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-01-05 00:13:26.950144: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-01-05 00:13:26.953327: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [67]:
model.compile(optimizer='adam',
              loss = tf.keras.losses.binary_crossentropy,
              metrics=['accuracy']
)

In [68]:
model.fit(X_train, y_train, epochs=2, validation_data=(X_test, y_test))

Epoch 1/2


2024-01-05 00:13:27.680336: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-01-05 00:13:27.683437: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-01-05 00:13:27.686420: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-01-05 00:14:29.704115: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-01-05 00:14:29.708519: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-01-05 00:14:29.710386: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


<keras.callbacks.History at 0x139993970>

In [69]:
loss, accuracy = model.evaluate(X_test, y_test)
print(loss)
print(accuracy)

0.4351247251033783
0.9199838042259216
