## Introduction

In [1]:
!python -m pip install "../input/roberta/sentence-transformers/"
!pip install Pyphen --no-index --find-links=file:///kaggle/input/roberta/Pyphen-0.9.5-py2.py3-none-any.whl
!pip install repoze.lru --no-index --find-links=file:///kaggle/input/roberta/repoze.lru-0.7-py3-none-any.whl
!pip install textstat --no-index --find-links=file:///kaggle/input/roberta/textstat-0.7.0-py3-none-any.whl

Processing /kaggle/input/roberta/sentence-transformers
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-1.1.0-py3-none-any.whl size=126496 sha256=e092021b9ff273f97d52ccfc3cb8fef7d0e387af33e3b7c8b483c5931ccbca8d
  Stored in directory: /root/.cache/pip/wheels/86/8c/ce/e33b1f3855d444c071cfabd0fc7880faef2b93e5dea2208f90
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-1.1.0


In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
#from sklearn.preprocessing import StandardScaler, MinMaxScaler
import textstat

In [12]:
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [13]:
ind = np.where(train_df.standard_error == train_df.standard_error.min())[0]
train_df.loc[ind]

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
106,436ce79fe,,,"The sun was shining in a cloudless sky, and no...",0.0,0.0


This row has a target which looks like an integer, and a 0 standard error.  
We'll remove it for now, as the standard error is largely out of distribution, which could affect dimensionality reduction

In [14]:
train_df.drop(ind, inplace = True)
train_df.reset_index(inplace = True,drop = True)

## Obtaining Sentence Representations

In [22]:
from sentence_transformers import SentenceTransformer, models

In [27]:
model_path = "../input/roberta/roberta-base/roberta-base"
word_embedding_model = models.Transformer(model_path, max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
roberta = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [28]:
# bert = SentenceTransformer('bert-base-uncased')
# roberta = SentenceTransformer('../input/roberta/roberta-base/roberta-base')
vects = roberta.encode(train_df.excerpt)

Batches:   0%|          | 0/89 [00:00<?, ?it/s]

In [22]:
#mpnet = SentenceTransformer("stsb-mpnet-base-v2")
#vects = mpnet.encode(train_df.excerpt)

In [56]:
# Probably isn't neccesary to scale these vectors
#scaler = StandardScaler()
#vects = scaler.fit_transform(vects)

##  Feature Engineering
I used the same textstat augmentations from this excellent EDA notebook https://www.kaggle.com/gunesevitan/commonlit-readability-prize-eda  
For reference, the augmentations are defined below:
* `character_count` - number of characters in the text
* `digit_count` - number of digits in the text
* `word_count` - number of words in the text
* `unique_word_count` - number of unique words in the text
* `mean_word_length` - average number of character that the words have in the text
* `syllable_count` - number of syllables in the text
* `sentence_count` - number of sentences in the text
* `flesch_reading_ease` - [flesch reading ease score](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease) of the text
* `flesch_kincaid_grade` - [flesch-kincaid grade level](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level) of the text
* `smog_index` - [smog index](https://en.wikipedia.org/wiki/SMOG) of the text
* `automated_readability_index` - [automated readability index](https://en.wikipedia.org/wiki/Automated_readability_index) of the text
* `coleman_liau_index` - [coleman–liau index](https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index) of the text
* `linsear_write_formula` - [linsear write grade](hhttps://en.wikipedia.org/wiki/Linsear_Write) of the text

In [29]:
# label encoder on license?
#train_df['is_licensed'] = train_df.license.notna()*1 # might be interesting to look at?

train_df['character_count'] = train_df['excerpt'].apply(lambda x: len(str(x)))
train_df['digit_count'] = train_df['excerpt'].apply(lambda x: np.sum(([int(word.isdigit()) for word in str(x).split()])))
train_df['word_count'] = train_df['excerpt'].apply(textstat.lexicon_count)
train_df['unique_word_count'] = train_df['excerpt'].apply(lambda x: len(set(str(x).split())))
train_df['mean_word_length'] = train_df['excerpt'].apply(lambda x: np.mean([len(word) for word in str(x).split()]))
train_df['syllable_count'] = train_df['excerpt'].apply(textstat.syllable_count)
train_df['sentence_count'] = train_df['excerpt'].apply(textstat.sentence_count)
train_df['flesch_reading_ease'] = train_df['excerpt'].apply(textstat.flesch_reading_ease)
train_df['flesch_kincaid_grade'] = train_df['excerpt'].apply(textstat.flesch_kincaid_grade)
train_df['smog_index'] = train_df['excerpt'].apply(textstat.smog_index)
train_df['automated_readability_index'] = train_df['excerpt'].apply(textstat.automated_readability_index)
train_df['coleman_liau_index'] = train_df['excerpt'].apply(textstat.coleman_liau_index)
train_df['linsear_write_formula'] = train_df['excerpt'].apply(textstat.linsear_write_formula)

In [30]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error as mse

In [31]:
vect_df = pd.DataFrame(vects)
final_df = pd.concat([train_df.iloc[:,6:], vect_df], axis=1)

In [32]:
X_train = final_df.values
y_train = train_df["target"].values
#X_train, X_val, y_train, y_val = train_test_split(vects, train_df["target"].values , test_size=0.10, random_state=42)

In [40]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.1,
                max_depth = 2, alpha = 10, n_estimators = 1000, verbosity = 1)

xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=2, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

In [41]:
#preds = xg_reg.predict(X_val)

In [42]:
#np.sqrt(mse(y_val, preds))

0.5704950188952478

# TEST

In [34]:
#vects_test = mpnet.encode(test_df.excerpt)
vects_test = roberta.encode(test_df.excerpt)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [35]:
#test_df['is_licensed'] = test_df.license.notna()*1 # might be interesting to look at?
test_df['character_count'] = test_df['excerpt'].apply(lambda x: len(str(x)))
test_df['digit_count'] = test_df['excerpt'].apply(lambda x: np.sum(([int(word.isdigit()) for word in str(x).split()])))
test_df['word_count'] = test_df['excerpt'].apply(textstat.lexicon_count)
test_df['unique_word_count'] = test_df['excerpt'].apply(lambda x: len(set(str(x).split())))
test_df['mean_word_length'] = test_df['excerpt'].apply(lambda x: np.mean([len(word) for word in str(x).split()]))
test_df['syllable_count'] = test_df['excerpt'].apply(textstat.syllable_count)
test_df['sentence_count'] = test_df['excerpt'].apply(textstat.sentence_count)
test_df['flesch_reading_ease'] = test_df['excerpt'].apply(textstat.flesch_reading_ease)
test_df['flesch_kincaid_grade'] = test_df['excerpt'].apply(textstat.flesch_kincaid_grade)
test_df['smog_index'] = test_df['excerpt'].apply(textstat.smog_index)
test_df['automated_readability_index'] = test_df['excerpt'].apply(textstat.automated_readability_index)
test_df['coleman_liau_index'] = test_df['excerpt'].apply(textstat.coleman_liau_index)
test_df['linsear_write_formula'] = test_df['excerpt'].apply(textstat.linsear_write_formula)

In [36]:
vect_df_test = pd.DataFrame(vects_test)
final_df_test = pd.concat([test_df.iloc[:,4:].head(), vect_df_test], axis=1)

In [37]:
X_test = final_df_test.values

In [41]:
y_test = xg_reg.predict(X_test)

  "because it will generate extra copies and increase " +


In [42]:
predictions = pd.DataFrame()
predictions['id'] = test_df['id']
predictions['target'] = y_test
predictions.to_csv("submission.csv", index=False)
predictions

Unnamed: 0,id,target
0,c0f722661,-0.670114
1,f0953f0a5,-0.224144
2,0df072751,-0.437058
3,04caf4e0c,-2.376087
4,0e63f8bea,-1.75882
5,12537fe78,-0.24813
6,965e592c0,0.196202
