## Feature augmentation: Universal Sentence Encoder


In [3]:
# Importing necessary libraries
import gc
import lightgbm as lgb
from sklearn.ensemble import VotingRegressor
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import spacy
import string
import random
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier,BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.metrics import cohen_kappa_score
from lightgbm import log_evaluation, early_stopping
from sklearn.linear_model import SGDClassifier
import polars as pl
import joblib

In [4]:
# Folder holding the competition files (relative to the current working directory).
PATH = "kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
# Training essays; later cells read the 'full_text' and 'essay_id' columns.
train_csv = PATH + "train.csv"
train = pd.read_csv(train_csv)

### New feature 1: sentences and words

In [14]:
#Features engineering
#Preprocessing

def removeHTML(x):
    """Return ``x`` with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept. ``.`` does not match a newline here, so a
    ``<`` and ``>`` on different lines are not treated as one tag.

    The earlier pattern required a newline immediately before the closing
    ``>``, which meant ordinary tags such as ``<br>`` were never stripped.

    Args:
        x: text to clean.

    Returns:
        The text without tags.
    """
    html = re.compile(r'<.*?>')
    return html.sub('', x)

def dataPreprocessing(x):
    """Normalise one essay string before sentence and word statistics are computed.

    Steps, in order: lowercase, strip HTML tags (via ``removeHTML``), drop
    ``@mentions``, drop digit runs (with an optional leading apostrophe),
    drop URLs, collapse whitespace runs to one space, collapse repeated
    periods / commas to a single character, and trim both ends.

    All patterns are raw strings: ``\\w`` and ``\\d`` inside a normal string
    literal are invalid escape sequences and raise warnings on current
    Python versions.

    Args:
        x: raw essay text.

    Returns:
        The cleaned text.
    """
    # lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub(r"@\w+", '', x)
    # Delete numbers (including a leading apostrophe, as in '90)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Delete URLs. The former pattern "http\w+" could not match "http://..."
    # at all (':' is not a word character) and left "://host/..." behind
    # for "https://...".
    x = re.sub(r"https?://\S+|www\.\S+", '', x)
    # Replace consecutive whitespace with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive periods / commas with a single period / comma
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove whitespace at the beginning and end
    x = x.strip()
    return x

# Cleaned copy of every essay, used by the count and statistics features below
train['paragraph_processed'] = train['full_text'].map(dataPreprocessing)

# Sentence count: number of pieces obtained by splitting the cleaned text on '.'
train['sentence_cnt'] = train['paragraph_processed'].map(lambda text: len(text.split('.')))

# Word count: number of pieces obtained by splitting the cleaned text on a single space
train['word_cnt'] = train['paragraph_processed'].map(lambda text: len(text.split(' ')))

In [15]:
# Function to Calculate statistical parameters of paragraphs, sentences, and words
import statistics


def _length_summary(lengths):
    """Return ``(min, max, mean, median, sd, deciles)`` for one list of unit lengths."""
    if not lengths:
        # No unit passed the length filter: report zeros instead of letting
        # min()/max()/mean() raise on an empty list.
        return 0, 0, 0, 0, 0, [0] * 9

    if len(lengths) >= 2:
        spread = statistics.stdev(lengths)
        # 'exclusive' deciles of a short list can fall below zero; clamp them to 0.
        deciles = [
            0 if q < 0 else q
            for q in statistics.quantiles(lengths, n=10, method='exclusive')
        ]
    else:
        # A single unit has no sample deviation; by this notebook's convention
        # the unit length itself is stored as "sd" and the deciles are zero.
        spread = lengths[0]
        deciles = [0] * 9

    return (
        min(lengths),
        max(lengths),
        statistics.mean(lengths),
        statistics.median(lengths),
        spread,
        deciles,
    )


def corpus_satistics(data, col, heading_len, split_str, corp_unit):
    """Add per-text length statistics of the units found in ``data[col]``.

    Every text is split on ``split_str``. When ``corp_unit == 'word'`` the
    length of a unit is its number of space-separated tokens and only units
    with more than 3 tokens are kept. For any other ``corp_unit`` the length
    is the character count and only units longer than ``heading_len``
    characters are kept (short units are treated as headings).

    The following columns are written to ``data`` (prefix = ``corp_unit``):
    ``_len_min``, ``_len_max``, ``_len_mean``, ``_len_median``, ``_len_sd``
    and ``_len_qua0`` .. ``_len_qua8`` (the nine deciles).

    Special cases:
        * exactly one unit kept: ``_len_sd`` holds that unit's length and
          the deciles are 0;
        * no unit kept: every statistic is 0 (this used to raise).

    Args:
        data: mapping of column name to values (e.g. a DataFrame); modified
            in place.
        col: name of the text column to analyse.
        heading_len: minimum character length (exclusive) for a unit to be
            counted; not read when ``corp_unit == 'word'``.
        split_str: separator used to cut each text into units.
        corp_unit: label used as the column prefix; the value ``'word'``
            switches to token counting.

    Returns:
        ``data`` itself, with the new columns added.
    """
    summaries = []
    for text in data[col]:
        units = text.split(split_str)
        if corp_unit == 'word':
            token_counts = (len(unit.split(' ')) for unit in units)
            lengths = [count for count in token_counts if count > 3]
        else:
            lengths = [len(unit) for unit in units if len(unit) > heading_len]
        summaries.append(_length_summary(lengths))

    # Scalar statistics, written in the same column order as before.
    for position, stat in enumerate(('min', 'max', 'mean', 'median', 'sd')):
        data[corp_unit + '_len_' + stat] = [row[position] for row in summaries]
    # One column per decile.
    for k in range(9):
        data[corp_unit + '_len_qua' + str(k)] = [row[5][k] for row in summaries]

    return data

# Length statistics at three granularities, computed in this order:
#   paragraph - raw 'full_text' split on blank lines, units longer than 20 characters
#   sentence  - cleaned text split on '.', units longer than 15 characters
#   word      - cleaned text split on '.', token count of each piece
#               (heading_len is not read in 'word' mode; 15 is simply carried over)
for col, heading_len, split_str, corp_unit in (
    ('full_text', 20, '\n\n', 'paragraph'),
    ('paragraph_processed', 15, '.', 'sentence'),
    ('paragraph_processed', 15, '.', 'word'),
):
    data = train
    train = corpus_satistics(data, col, heading_len, split_str, corp_unit)

In [1]:
import tensorflow_hub as hub

# Universal Sentence Encoder handle on Kaggle Models
USE_HANDLE = "https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2"

# Load the saved model, then wrap it as a Keras layer that is called on a list of strings
embed = hub.load(USE_HANDLE)
sentence_encoder = hub.KerasLayer(embed)

2024-06-18 23:19:26.010874: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-18 23:19:26.050342: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-18 23:19:26.050373: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-18 23:19:26.051871: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-18 23:19:26.059165: I tensorflow/core/platform/cpu_feature_guar

tf.Tensor(
[[-0.03133019 -0.06338634 -0.016075   ... -0.0324278  -0.0457574
   0.05370456]
 [ 0.05080861 -0.01652431  0.01573778 ...  0.00976659  0.03170121
   0.01788118]], shape=(2, 512), dtype=float32)


In [None]:
## To use the model here, download it first and then load it with the code below
##https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2

# sentence_encoder = hub.KerasLayer(
#     #"https://tfhub.dev/google/universal-sentence-encoder/4"
#     '/kaggle/input/universal-sentence-encoder/tensorflow2/universal-sentence-encoder/2'
# )

In [10]:
# universal sentence encoder function
import tensorflow as tf
import math

def use_function(corpus, column_name, encoder=None, embedding_dim=512):
    """Build one sentence-encoder feature vector per text in ``corpus[column_name]``.

    Each text is split on ``'.'``; the pieces are encoded in one call, the
    embedding of the last piece is dropped, the remaining embeddings are
    summed and the sum is divided by the square root of the number of pieces.
    A text without any ``'.'`` (fewer than two pieces) gets a zero vector.

    NOTE(review): the divisor counts every piece, including the last one
    whose embedding is dropped, and the last piece is dropped even when the
    text does not end with a period. Both are kept as they were so the
    feature values do not change; confirm whether this is intended.

    Args:
        corpus: mapping of column name to texts (e.g. a DataFrame).
        column_name: name of the text column.
        encoder: callable that maps a list of strings to a 2-D array-like
            (one row per string). Defaults to the notebook-level
            ``sentence_encoder``.
        embedding_dim: length of the zero vector used for texts with fewer
            than two pieces; should equal the encoder's output width.

    Returns:
        A list with one vector per text (a NumPy array, or a list of zeros
        for texts with fewer than two pieces).
    """
    if encoder is None:
        # Resolved at call time so the notebook-level layer is still the default.
        encoder = sentence_encoder

    sencode_corpus = []
    for text in corpus[column_name]:
        # Split once; the result is needed for the check, the encoding and the divisor.
        fragments = text.split('.')

        if len(fragments) < 2:
            sencode_essay = [0.] * embedding_dim
        else:
            # np.asarray also accepts the eager tensor returned by the Keras layer.
            embeddings = np.asarray(encoder(fragments))[:-1]
            sencode_essay = embeddings.sum(axis=0) / math.sqrt(len(fragments))

        sencode_corpus.append(sencode_essay)

    return sencode_corpus

In [11]:
# train: one sentence-encoder vector per essay, computed from the raw text
corpus, column_name = train, 'full_text'
sencode_corpus = use_function(corpus, column_name)

# One row per essay, one column per embedding dimension, named sencode_<i>
sencode = pd.DataFrame(sencode_corpus)
sencode_columns = [f'sencode_{i}' for i in range(sencode.shape[1])]
sencode.columns = sencode_columns

# Attach the essay key, then join the new columns onto the existing features
sencode['essay_id'] = train['essay_id']
train = train.merge(sencode, on='essay_id', how='left')

### inference


In [None]:
# test
# NOTE(review): `test` is not created in any of the cells shown here - confirm where it is loaded.
# Cleaned copy of every essay, used by the count and statistics features below
test['paragraph_processed'] = test['full_text'].map(dataPreprocessing)
# Sentence count: number of pieces obtained by splitting the cleaned text on '.'
test['sentence_cnt'] = test['paragraph_processed'].map(lambda text: len(text.split('.')))
# Word count: number of pieces obtained by splitting the cleaned text on a single space
test['word_cnt'] = test['paragraph_processed'].map(lambda text: len(text.split(' ')))

# Same three statistics passes as for the training data, in the same order:
#   paragraph - raw 'full_text' split on blank lines, units longer than 20 characters
#   sentence  - cleaned text split on '.', units longer than 15 characters
#   word      - cleaned text split on '.', token count of each piece
#               (heading_len is not read in 'word' mode; 15 is simply carried over)
for col, heading_len, split_str, corp_unit in (
    ('full_text', 20, '\n\n', 'paragraph'),
    ('paragraph_processed', 15, '.', 'sentence'),
    ('paragraph_processed', 15, '.', 'word'),
):
    data = test
    test = corpus_satistics(data, col, heading_len, split_str, corp_unit)
# Tfidf and merge
# NOTE(review): `vectorizer` is not defined in the cells shown here - presumably the fitted training vectorizer; confirm.
test_tfid = vectorizer.transform(list(test['full_text']))
dense_matrix = test_tfid.toarray()
# One column per vectorizer feature, named tfid_<i>
tfid_columns = [f'tfid_{i}' for i in range(dense_matrix.shape[1])]
df = pd.DataFrame(dense_matrix, columns=tfid_columns)
# Attach the essay key, then join the new columns onto the existing features
df['essay_id'] = test['essay_id']
test = test.merge(df, on='essay_id', how='left')
# universal sentence encoder
# test: one sentence-encoder vector per essay, computed from the raw text
corpus, column_name = test, 'full_text'
sencode_corpus = use_function(corpus, column_name)

# One row per essay, one column per embedding dimension, named sencode_<i>
sencode = pd.DataFrame(sencode_corpus)
sencode_columns = [f'sencode_{i}' for i in range(sencode.shape[1])]
sencode.columns = sencode_columns

# Attach the essay key, then join the new columns onto the existing features
sencode['essay_id'] = test['essay_id']
test = test.merge(sencode, on='essay_id', how='left')
# deberta: copy the first six columns of `predicted_score` into deberta_oof_0 .. deberta_oof_5
# NOTE(review): `predicted_score` is not defined in the cells shown here - confirm its origin and row order.
for i, deberta_column in enumerate([f'deberta_oof_{i}' for i in range(6)]):
    test[deberta_column] = predicted_score[:, i]
test.shape

In [None]:
# prediction
prediction = test[['essay_id']].copy()
prediction['score'] = 0

# Every model scores the same feature columns; each raw output is shifted by `a`.
# NOTE(review): `models`, `feature_names` and `a` are not defined in the cells shown here.
model_inputs = test[feature_names]
pred_test = models[0].predict(model_inputs) + a
for i in range(1, 5):
    pred_now = models[i].predict(model_inputs) + a
    pred_test = np.add(pred_test, pred_now)

# Five model outputs were summed above, so divide by 5 to get their mean
pred_test = pred_test / 5
print(pred_test)

In [None]:
# Limit the prediction to the score range 1-6, round it, and cast it to integer labels.
# round() alone keeps the float dtype, so the submission would contain float scores
# rather than integer ones.
pred_test = pred_test.clip(1, 6).round().astype(int)
prediction['score'] = pred_test
prediction.to_csv('submission.csv', index=False)
prediction.head(3)