In [None]:
#!pip install Pyphen --no-index --find-links=file:///kaggle/input/roberta/Pyphen-0.9.5-py2.py3-none-any.whl
#!pip install repoze.lru --no-index --find-links=file:///kaggle/input/roberta/repoze.lru-0.7-py3-none-any.whl
#!pip install textstat --no-index --find-links=file:///kaggle/input/roberta/textstat-0.7.0-py3-none-any.whl

In [31]:
import torch
#import textstat
import xgboost as xgb
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [32]:
class Dataset:
    def __init__(self, excerpt, tokenizer, max_len):
        self.excerpt = excerpt
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, item):
        text = str(self.excerpt[item])
        inputs = self.tokenizer(
            text, 
            max_length=self.max_len, 
            padding="max_length", 
            truncation=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
        }

In [33]:
def generate_predictions(model_path, max_len):
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to("cuda")
    model.eval()
    
    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
    
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=4, pin_memory=True, shuffle=False
    )

    final_output = []

    for b_idx, data in enumerate(data_loader):
        with torch.no_grad():
            for key, value in data.items():
                data[key] = value.to("cuda")
            output = model(**data)
            output = output.logits.detach().cpu().numpy().ravel().tolist()
            final_output.extend(output)
    
    torch.cuda.empty_cache()
    return np.array(final_output)

In [44]:
def generate_predictions_train(model_path, max_len):
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to("cuda")
    model.eval()
    
    df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
    
    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=4, pin_memory=True, shuffle=False
    )

    final_output = []

    for b_idx, data in enumerate(data_loader):
        with torch.no_grad():
            for key, value in data.items():
                data[key] = value.to("cuda")
            output = model(**data)
            output = output.logits.detach().cpu().numpy().ravel().tolist()
            final_output.extend(output)
    
    torch.cuda.empty_cache()
    return np.array(final_output)

In [45]:
preds1_train = generate_predictions_train("../input/a81653/", max_len=256)
preds2_train = generate_predictions_train("../input/a81656/", max_len=256)
preds3_train = generate_predictions_train("../input/a81657/", max_len=256)
preds4_train = generate_predictions_train("../input/a81660/", max_len=256)
preds5_train = generate_predictions_train("../input/a81675/", max_len=192)
preds6_train = generate_predictions_train("../input/a87832/", max_len=256)

In [None]:
preds1_test = generate_predictions("../input/a81653/", max_len=256)
preds2_test = generate_predictions("../input/a81656/", max_len=256)
preds3_test = generate_predictions("../input/a81657/", max_len=256)
preds4_test = generate_predictions("../input/a81660/", max_len=256)
preds5_test = generate_predictions("../input/a81675/", max_len=192)
preds6_test = generate_predictions("../input/a87832/", max_len=256)

In [None]:
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [35]:
#linguistic features
"""
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

ind = np.where(train_df.standard_error == train_df.standard_error.min())[0]
train_df.loc[ind]
train_df.drop(ind, inplace = True)
train_df.reset_index(inplace = True,drop = True)

train_df['character_count'] = train_df['excerpt'].apply(lambda x: len(str(x)))
train_df['digit_count'] = train_df['excerpt'].apply(lambda x: np.sum(([int(word.isdigit()) for word in str(x).split()])))
train_df['word_count'] = train_df['excerpt'].apply(textstat.lexicon_count)
train_df['unique_word_count'] = train_df['excerpt'].apply(lambda x: len(set(str(x).split())))
train_df['mean_word_length'] = train_df['excerpt'].apply(lambda x: np.mean([len(word) for word in str(x).split()]))
train_df['syllable_count'] = train_df['excerpt'].apply(textstat.syllable_count)
train_df['sentence_count'] = train_df['excerpt'].apply(textstat.sentence_count)
train_df['flesch_reading_ease'] = train_df['excerpt'].apply(textstat.flesch_reading_ease)
train_df['flesch_kincaid_grade'] = train_df['excerpt'].apply(textstat.flesch_kincaid_grade)
train_df['smog_index'] = train_df['excerpt'].apply(textstat.smog_index)
train_df['automated_readability_index'] = train_df['excerpt'].apply(textstat.automated_readability_index)
train_df['coleman_liau_index'] = train_df['excerpt'].apply(textstat.coleman_liau_index)
train_df['linsear_write_formula'] = train_df['excerpt'].apply(textstat.linsear_write_formula)


X_train = train_df.iloc[:,6:]
y_train = train_df["target"].values

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 1000, verbosity = 1)

xg_reg.fit(X_train,y_train)

test_df['character_count'] = test_df['excerpt'].apply(lambda x: len(str(x)))
test_df['digit_count'] = test_df['excerpt'].apply(lambda x: np.sum(([int(word.isdigit()) for word in str(x).split()])))
test_df['word_count'] = test_df['excerpt'].apply(textstat.lexicon_count)
test_df['unique_word_count'] = test_df['excerpt'].apply(lambda x: len(set(str(x).split())))
test_df['mean_word_length'] = test_df['excerpt'].apply(lambda x: np.mean([len(word) for word in str(x).split()]))
test_df['syllable_count'] = test_df['excerpt'].apply(textstat.syllable_count)
test_df['sentence_count'] = test_df['excerpt'].apply(textstat.sentence_count)
test_df['flesch_reading_ease'] = test_df['excerpt'].apply(textstat.flesch_reading_ease)
test_df['flesch_kincaid_grade'] = test_df['excerpt'].apply(textstat.flesch_kincaid_grade)
test_df['smog_index'] = test_df['excerpt'].apply(textstat.smog_index)
test_df['automated_readability_index'] = test_df['excerpt'].apply(textstat.automated_readability_index)
test_df['coleman_liau_index'] = test_df['excerpt'].apply(textstat.coleman_liau_index)
test_df['linsear_write_formula'] = test_df['excerpt'].apply(textstat.linsear_write_formula)


X_test = test_df.iloc[:,4:]
preds7 = xg_reg.predict(X_test)
""";

In [None]:
preds_final_train = np.vstack((preds1_train, preds2_train, preds3_train, preds4_train, preds5_train, preds6_train))
y_train = train_df["target"].values

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 1000, verbosity = 1)

xg_reg.fit(preds_final_train,y_train)

In [42]:
preds_final_test = np.vstack((preds1_test, preds2_test, preds3_test, preds4_test, preds5_test, preds6_test))

array([[-0.16711852, -0.54188776, -0.18852767, -2.72694254, -1.76201248,
        -1.16781425,  0.36099541],
       [-0.07859137, -0.62694353, -0.20510463, -2.31090212, -1.67234659,
        -1.19319785,  0.40452987],
       [ 0.03635367, -0.47886458, -0.33608192, -2.22210574, -1.74552488,
        -0.90777624,  0.37602738],
       [-0.36222428, -0.48514423, -0.55559903, -2.10066557, -1.9371829 ,
        -1.40488291,  0.08838883],
       [-0.19421875, -0.18657285, -0.49220958, -1.73313713, -1.7733413 ,
        -1.59317338, -0.1030577 ],
       [-0.31367698, -0.23227143, -0.2733309 , -2.76938272, -1.93409562,
        -0.99660009,  0.697528  ]])

In [None]:
preds = xg_reg.predict(preds_final_test)

In [None]:
#preds = (0.9*(preds1 + preds2 + preds3 + preds4 + preds5 + preds6) + 0.1*preds7) / 7


In [None]:
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
submission.target = preds
submission.to_csv("submission.csv", index=False)