<a href="https://colab.research.google.com/github/Jack0karev/konoha/blob/master/Jet_Brains.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install (or upgrade) the Kaggle command-line client quietly and make sure
# its configuration directory exists in the home directory.
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

In [0]:
from google.colab import files

# Prompt for kaggle.json. The returned mapping of file names to file contents
# is bound to a name rather than left as the cell's last expression, so the
# API token is not echoed into the saved notebook output.
uploaded = files.upload()

In [0]:
!cp kaggle.json  ~/.kaggle/

In [0]:
# Download the Reddit comment score dataset archive into the working directory,
# then list the directory to confirm the archive arrived.
!kaggle datasets download -d ehallmar/reddit-comment-score-prediction
!ls

In [0]:
!unzip reddit-comment-score-prediction.zip

In [0]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Single seed shared by NumPy's global random state and the train/test split
# performed later in the notebook.
SEED = 43
np.random.seed(SEED)

In [0]:
# Read only the columns used by the notebook from both CSV files and stack them
# into one frame with a fresh index. na_filter=False keeps empty cells as empty
# strings instead of converting them to NaN.
columns = ["text", "parent_text", "score"]
csv_paths = ("comments_positive.csv", "comments_negative.csv")
df = pd.concat(
    [pd.read_csv(csv_path, usecols=columns, na_filter=False) for csv_path in csv_paths],
    ignore_index=True,
)

In [0]:
# Separate the regression target from the text features.
# A non-mutating drop keeps `df` intact, so this cell can be re-run without a
# KeyError (the in-place variant removed 'score' from `df` on the first run).
y = df['score']
X = df.drop(columns='score')

In [0]:
# Preview the first rows of the feature frame.
X.head()

In [0]:
# Concatenate the comment and its parent into one space-separated document.
# Column-wise string concatenation yields the same text as joining each row,
# but avoids calling a Python lambda once per row.
cols = ['text', 'parent_text']
X['combined'] = X[cols[0]].astype(str) + ' ' + X[cols[1]].astype(str)

In [0]:
# Only the combined text column is used as model input from here on.
X_total = X.combined

In [0]:
# Hold out 5% of the rows for testing; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_total, y, test_size=0.05, random_state=SEED
)

# To be sure we don't use indices to predict something:
# give every split a fresh 0..n-1 index in place of the original row labels.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

print("Train shape: {}".format(X_train.shape))
print("Test shape: {}".format(X_test.shape))

# Check

In [0]:
# Constant baseline: predict the mean training score for every test row.
y_pred = np.full(y_test.shape, y_train.mean())


In [0]:
# Mean squared error of the constant baseline on the test split.
mean_squared_error(y_test, y_pred)


# Code

In [0]:
# Keep only the first 1,600,000 training documents. y_train is left untouched
# here; the labels are cut to the same length where the model is fitted.
X_train = X_train.iloc[:1600000]

In [0]:
import string
import re


from IPython.display import Image
from IPython.core.display import HTML 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [0]:
# Translation table built once: every ASCII punctuation character and the
# newline map to a single space.
_PUNCTUATION_TO_SPACE = str.maketrans({ch: ' ' for ch in string.punctuation + '\n'})


def delete_punctuation(x):
    """Replace each punctuation character and newline in ``x`` with a space.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        A string of the same length as ``x`` in which every character from
        ``string.punctuation`` and every ``'\\n'`` is replaced by ``' '``;
        all other characters are kept unchanged.
    """
    # str.translate does the per-character lookup in C, instead of rebuilding
    # a punctuation list and scanning it linearly for every character.
    return x.translate(_PUNCTUATION_TO_SPACE)

In [0]:
# Convert every document to lower case.
X_train = X_train.str.lower()
X_test = X_test.str.lower()
# Replace punctuation and newlines with spaces.
X_train = X_train.map(delete_punctuation)
X_test = X_test.map(delete_punctuation)

In [0]:
# Download the GloVe 42B-token, 300-dimensional word vector archive.
!wget http://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip

In [0]:
# Pin the TensorFlow GPU build before the Keras imports below.
!pip install tensorflow-gpu==1.15
from keras.layers import Dense, Input, GRU, Embedding, Dropout, Bidirectional
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, LearningRateScheduler
from tqdm import tqdm_notebook

In [0]:
import os, zipfile

file_name = os.path.abspath('./glove.42B.300d.zip')  # absolute path of the downloaded archive
# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile(file_name) as zip_ref:
    zip_ref.extractall('./')  # unpack into the working directory
os.remove(file_name)  # delete the archive once its contents are extracted

In [0]:
# Build a word -> vector lookup from the GloVe text file.
# Each line holds a token followed by its space-separated float components.
embeddings_index = dict()
# `with` guarantees the file is closed even if parsing raises, and the explicit
# encoding avoids depending on the platform's default locale.
with open('./glove.42B.300d.txt', encoding='utf-8') as f:
    for line in tqdm_notebook(f):
        values = line.rstrip('\n').split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [0]:
embed_size = 300       # dimensionality of each word vector
max_features = 100000  # vocabulary size cap (number of rows in the embedding matrix)
maxlen = 150           # number of tokens each document is padded/truncated to

# Raw arrays for Keras. Note that `y` is rebound here to the training labels.
list_sentences_train = X_train.values
list_sentences_test = X_test.values
y = y_train.values

# The vocabulary is fitted on the training documents only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training documents -> integer id sequences -> fixed-length matrix.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
x_train_pad = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test documents go through the same fitted tokenizer and padding length.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
x_test_pad = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [0]:
# Global mean and standard deviation over all pretrained vector components.
# np.stack needs a real sequence: a dict view is not one, and current NumPy
# releases reject it, so the values are materialised into a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

In [0]:
# Build the embedding matrix: row i holds the pretrained vector of the word
# whose tokenizer id is i. Rows without a pretrained vector keep a random draw
# with the same mean/std as the pretrained vectors.
word_index = tokenizer.word_index
unknown_words = set()
nb_words = min(max_features, len(word_index))
# Allocate max_features rows rather than nb_words. Tokenizer ids start at 1, so
# with a vocabulary smaller than max_features the id len(word_index) would fall
# outside an nb_words-row matrix; the Embedding layer built later in this
# notebook is also declared with max_features rows and needs matching weights.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue  # id is beyond the vocabulary cap
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        unknown_words.add(word)  # remember in-vocabulary words missing from GloVe

In [0]:
# Same pinned TensorFlow install as in the Keras import cell earlier in the notebook.
!pip install tensorflow-gpu==1.15

In [0]:
# Architecture: frozen pretrained embeddings -> two stacked bidirectional GRUs
# (with dropout between them) -> dense ReLU layer -> single linear output.
input_layer = Input((maxlen,), name = 'comment_text')
embedding_layer = Embedding(max_features, embed_size, input_length=maxlen, 
                            weights=[embedding_matrix], 
                            trainable = False)(input_layer)
x = Bidirectional(GRU(128, return_sequences=True))(embedding_layer)
x = Dropout(0.3)(x)
x = Bidirectional(GRU(128, return_sequences=False))(x)
x = Dense(64, activation="relu")(x)
output_layer = Dense(1, activation="linear")(x)

model = Model(inputs=input_layer, outputs=output_layer)
# The target is a continuous score, so classification accuracy is not a
# meaningful metric; mean absolute error is reported next to the MSE loss.
model.compile(loss='mean_squared_error',
                  optimizer=Adam(clipvalue=1, clipnorm=1),
                  metrics=['mae'])
# summary() prints the table itself; wrapping it in print() added a stray "None".
model.summary()

def schedule(ind):
    """Return the learning rate to use for epoch number ``ind`` (0-based).

    The rate is read from a fixed step-down table. Epochs beyond the end of
    the table reuse the last (smallest) rate instead of raising IndexError,
    so the schedule stays valid if the number of epochs is increased.
    """
    a = [0.001, 0.001, 0.0001, 0.0001, 0.00001, 0.00001, 0.000001]
    return a[min(ind, len(a) - 1)]

# Per-epoch learning rate taken from schedule().
lr = LearningRateScheduler(schedule)

# Stop when the validation loss has not improved by at least 1e-4 for 4 epochs.
early_stop = EarlyStopping(monitor='val_loss',
                           patience=4,
                           verbose=1,
                           min_delta=1e-4)

# The labels are cut to the number of padded training rows rather than to a
# repeated literal, so features and targets always have the same length.
# early_stop is now actually passed to fit(); with epochs=3 and patience=4 it
# cannot trigger, but it takes effect as soon as the epoch count is raised.
history = model.fit(x_train_pad, y[:len(x_train_pad)], batch_size=256, epochs = 3, 
                    validation_split = 0.1, verbose = 1, callbacks = [lr, early_stop])

In [0]:
# Predict scores for the padded test sequences with the trained model.
predict = model.predict(x_test_pad)

In [0]:
# Test-set mean squared error of the model; compare with the constant-baseline MSE computed earlier.
mean_squared_error(y_test, predict)