In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import os
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error
!pip install lightgbm



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import re
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model

In [4]:
tf.keras.backend.clear_session()
max_seq_length = 150
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=False)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
bert_model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=pooled_output)

In [5]:
bert_model.summary()
#bert_model.output

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 150)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [8]:
#getting Vocab file
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

In [6]:
pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)
[K     |▎                               | 10kB 17.1MB/s eta 0:00:01[K     |▌                               | 20kB 24.3MB/s eta 0:00:01[K     |▉                               | 30kB 29.0MB/s eta 0:00:01[K     |█                               | 40kB 31.1MB/s eta 0:00:01[K     |█▍                              | 51kB 27.7MB/s eta 0:00:01[K     |█▋                              | 61kB 29.7MB/s eta 0:00:01[K     |██                              | 71kB 23.0MB/s eta 0:00:01[K     |██▏                             | 81kB 24.1MB/s eta 0:00:01[K     |██▍                             | 92kB 24.5MB/s eta 0:00:01[K     |██▊                             | 102kB 25.6MB/s eta 0:00:01[K     |███                             | 112kB 25.6MB/s eta 0:00:01

In [9]:
import tokenization
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [18]:
def tokenize(data):
  data_tokens=[]
  data_mask=[]
  data_segment=[]
  max_len_limit=max_seq_length-2
  N=0
  for i in range(len(data.values)):
    tokens=tokenizer.tokenize(data.values[i])
    if len(tokens)>max_len_limit or len(tokens)==max_len_limit:
      tokens=tokens[0:max_seq_length-2]
      tokens=['[CLS]',*tokens,'[SEP]']
      data_mask.append(np.array([1]*len(tokens)))
    if len(tokens)<max_len_limit:
      data_mask.append(np.array([1]*(len(tokens)+2)+[0]*(max_seq_length-len(tokens)-2)))
      for i in range(max_seq_length-len(tokens)-2):
        N=N+1
        tokens.append('[PAD]')
      tokens=['[CLS]',*tokens,'[SEP]']
    data_tokens.append(tokenizer.convert_tokens_to_ids(tokens))
    data_segment.append(np.array([0]*max_seq_length))
  return np.asarray(data_tokens),np.asarray(data_mask),np.asarray(data_segment)

In [11]:
train_data = pd.read_csv('/content/drive/MyDrive/CS2/train.csv/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/CS2/test.csv')

# removing unused columns
train_data.drop(['url_legal', 'license','standard_error'], axis=1, inplace=True)
test_data.drop(['url_legal', 'license'], axis=1, inplace=True)

for i in range(len(train_data)):
  train_data['excerpt'][i]=re.sub('[^A-Za-z0-9]+', ' ', train_data['excerpt'][i]).strip()
for i in range(len(test_data)):
  test_data['excerpt'][i]=re.sub('[^A-Za-z0-9]+', ' ', test_data['excerpt'][i]).strip()

from sklearn.model_selection import train_test_split
X=train_data['excerpt']
y=train_data['target']
#x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15,random_state=20)
#x_train, x_cv, y_train, y_cv = train_test_split(x_train, y_train, test_size=0.15,random_state=20)




In [19]:
x_td=test_data['excerpt']
X_td_tokens,X_td_mask,X_td_segment=tokenize(x_td)
X_td_pooled_output=bert_model.predict([X_td_tokens, X_td_mask, X_td_segment])

In [14]:
X_td_pooled_output.shape

(7, 768)

In [21]:
import numpy as np
from sklearn.model_selection import KFold
models_list=[]
kf=KFold(n_splits=5, random_state=None, shuffle=False)
for train_index, test_index in kf.split(X):
  x_train, x_cv = X[train_index], X[test_index]
  y_train, y_cv = y[train_index], y[test_index]
  X_train_tokens,X_train_mask,X_train_segment=tokenize(x_train)
  X_cv_tokens,X_cv_mask,X_cv_segment=tokenize(x_cv)
  X_train_pooled_output=bert_model.predict([X_train_tokens, X_train_mask, X_train_segment])
  X_cv_pooled_output=bert_model.predict([X_cv_tokens, X_cv_mask, X_cv_segment])
  from tensorflow.keras.layers import Input, Dense, Activation, Dropout
  from tensorflow.keras.models import Model
  from keras.callbacks import EarlyStopping,TensorBoard
  early_stop_1=EarlyStopping(monitor='val_root_mean_squared_error',patience=10,restore_best_weights=True)
  input_layer = Input(shape=(768,))
  layer1 = Dense(256,activation='relu',kernel_initializer=tf.keras.initializers.glorot_normal(seed=30))(input_layer)
  Dropout_layer_1=Dropout(0.)(layer1)
  layer2 = Dense(128,activation='relu',kernel_initializer=tf.keras.initializers.glorot_normal(seed=30))(Dropout_layer_1)
  Dropout_layer_2=Dropout(0.1)(layer2)
  layer3 = Dense(32,activation='relu',kernel_initializer=tf.keras.initializers.glorot_normal(seed=30))(Dropout_layer_2)
  output = Dense(1,activation='linear',kernel_initializer=tf.keras.initializers.glorot_normal(seed=0))(layer3)
  Model=Model(inputs=input_layer,outputs=output)
  rmse=tf.keras.metrics.RootMeanSquaredError()
  mse = tf.keras.losses.MeanSquaredError()
  Model.compile(optimizer='adam', loss=mse,metrics=[rmse])
  Model.fit(X_train_pooled_output,y_train,epochs=500, validation_data=(X_cv_pooled_output,y_cv), batch_size=10,verbose=2,callbacks=[early_stop_1])
  models_list.append(Model)

Epoch 1/500
227/227 - 1s - loss: 0.8248 - root_mean_squared_error: 0.9082 - val_loss: 0.7676 - val_root_mean_squared_error: 0.8762
Epoch 2/500
227/227 - 1s - loss: 0.5518 - root_mean_squared_error: 0.7429 - val_loss: 0.5767 - val_root_mean_squared_error: 0.7594
Epoch 3/500
227/227 - 1s - loss: 0.5482 - root_mean_squared_error: 0.7404 - val_loss: 0.5172 - val_root_mean_squared_error: 0.7192
Epoch 4/500
227/227 - 1s - loss: 0.5283 - root_mean_squared_error: 0.7268 - val_loss: 0.4987 - val_root_mean_squared_error: 0.7062
Epoch 5/500
227/227 - 1s - loss: 0.4833 - root_mean_squared_error: 0.6952 - val_loss: 0.5537 - val_root_mean_squared_error: 0.7441
Epoch 6/500
227/227 - 1s - loss: 0.4894 - root_mean_squared_error: 0.6996 - val_loss: 0.5509 - val_root_mean_squared_error: 0.7422
Epoch 7/500
227/227 - 1s - loss: 0.4805 - root_mean_squared_error: 0.6931 - val_loss: 0.5154 - val_root_mean_squared_error: 0.7179
Epoch 8/500
227/227 - 1s - loss: 0.4788 - root_mean_squared_error: 0.6920 - val_los

#Predicting Test data

In [24]:
y_pred=np.zeros((X_td_pooled_output.shape[0],1))
for i in range(len(models_list)):
  y_pred=y_pred+models_list[i].predict(X_td_pooled_output)
y_pred=y_pred/len(models_list)
y_pred

array([[-1.25707645],
       [-0.1196163 ],
       [-0.61221824],
       [-2.43381987],
       [-1.55498238],
       [-0.45163122],
       [ 0.13068568]])

In [None]:
test_dat = pd.read_csv('../input/commonlitreadabilityprize/test.csv',index_col='id')
y_pred_pd=pd.DataFrame(y_pred_td,index=test_dat.index,columns=['target'])
y_pred_pd

#saving test data predicted
y_pred_pd.to_csv('./submission.csv')

In [None]:
#Score:0.645