#### I would like to thank @ye11725 and @davidjlochner for their strong baselines and ideas. I have restructured the code and added an ensemble of 4 LGBM models. If you find my work useful, please feel free to give it an upvote. Thank you!
#### Here are my references:
https://www.kaggle.com/code/davidjlochner/base-tfidf-lgbm

https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-with-code-comments

In [1]:
import re
import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.ensemble import VotingClassifier,VotingRegressor
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from tqdm.auto import tqdm,trange
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [2]:
class FeatureEngineering():
    """Turn raw essays into one feature row per essay.

    The feature table combines paragraph / sentence / word length statistics
    (computed with polars and returned as pandas) with a dense TF-IDF block.

    ``process()`` builds the training table and fits the TF-IDF vectorizer;
    ``process_test()`` reuses the fitted vectorizer, so it must be called
    after ``process()``.
    """

    # Default Kaggle input locations; override through the constructor.
    TRAIN_CSV = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
    TEST_CSV = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'

    # Compiled once here instead of on every removeHTML() call.
    _HTML_TAG = re.compile(r'<.*?>')

    # Minimum character lengths used for the "<unit>_<n>_cnt" count features.
    _PARAGRAPH_LEN_THRESHOLDS = (25, 100, 200, 300, 400, 500, 600, 700)
    _SENTENCE_LEN_THRESHOLDS = (15, 50, 100, 150, 200, 250, 300)

    # Columns of the feature table that are not model inputs.
    _ID_AND_LABEL = ('essay_id', 'score')

    def __init__(self, train_path=None, test_path=None):
        """Load both CSV files and configure the TF-IDF vectorizer.

        Args:
            train_path: CSV with ``essay_id``, ``full_text`` and ``score``
                columns. Defaults to ``TRAIN_CSV``.
            test_path: CSV with ``essay_id`` and ``full_text`` columns.
                Defaults to ``TEST_CSV``.
        """
        train_path = self.TRAIN_CSV if train_path is None else train_path
        test_path = self.TEST_CSV if test_path is None else test_path

        # Every essay gets a list column with its blank-line separated paragraphs.
        self.columns = [
            (pl.col("full_text").str.split(by="\n\n").alias("paragraph"))
        ]
        self.train_dataset = pl.read_csv(train_path).with_columns(self.columns)
        self.test_dataset = pl.read_csv(test_path).with_columns(self.columns)
        # Per-sentence measurements that get summarised per essay.
        self.sentence_fea = ['sentence_len','sentence_word_cnt']
        # Per-paragraph measurements that get summarised per essay.
        self.paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
        # NOTE(review): the vectorizer is fed raw strings, and the identity
        # tokenizer hands each string back unchanged, so the "tokens" that the
        # n-gram step iterates over are single characters (character 2-3
        # grams). A callable preprocessor also replaces the built-in
        # preprocessing stage, so `strip_accents` looks like it has no effect.
        # Left untouched because changing it changes every TF-IDF feature;
        # confirm this is intended.
        self.vectorizer = TfidfVectorizer(tokenizer=lambda x: x,
                                          preprocessor=lambda x: x,
                                          token_pattern=None,
                                          strip_accents='unicode',
                                          analyzer = 'word',
                                          ngram_range=(2,3),
                                          min_df=0.05,
                                          max_df=0.9,
                                          sublinear_tf=True
        )

    def removeHTML(self,x):
        """Return ``x`` with every ``<...>`` tag removed."""
        return self._HTML_TAG.sub(r'', x)

    def dataPreprocessing(self,x):
        """Lower-case ``x`` and strip markup, mentions, digits and repeated punctuation.

        Patterns are raw strings: the previous plain literals (``"\\d+"`` ...)
        relied on invalid escape sequences, which newer Python versions warn
        about.
        """
        x = x.lower()
        x = self.removeHTML(x)
        x = re.sub(r"@\w+", '', x)    # @mentions
        x = re.sub(r"'\d+", '', x)    # apostrophe followed by digits, e.g. '90
        x = re.sub(r"\d+", '', x)     # any remaining digit runs
        # NOTE(review): only matches "http" directly followed by word
        # characters, so "http://..." URLs are NOT removed. Kept as-is to
        # leave the features unchanged; confirm whether URL stripping was intended.
        x = re.sub(r"http\w+", '', x)
        x = re.sub(r"\s+", " ", x)    # collapse whitespace runs to one space
        x = re.sub(r"\.+", ".", x)    # collapse runs of periods
        x = re.sub(r"\,+", ",", x)    # collapse runs of commas
        return x.strip()

    def _summary_aggs(self, features):
        """Return max/mean/min/first/last aggregations for each column in ``features``.

        Ordered statistic-major (all ``_max`` columns, then all ``_mean`` ...)
        so the resulting column order is stable.
        """
        return [
            getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}")
            for stat in ('max', 'mean', 'min', 'first', 'last')
            for fea in features
        ]

    def _aggregate(self, exploded, aggs, label):
        """Aggregate ``exploded`` per essay, sort by ``essay_id`` and return pandas."""
        df = exploded.group_by(["essay_id"], maintain_order=True).agg(aggs).sort("essay_id")
        df = df.to_pandas()
        print(f"done {label} +", len(df.columns), "features")
        return df

    def Paragraph_Preprocess(self,tmp):
        """Explode essays into cleaned paragraphs of at least 25 characters.

        Adds ``paragraph_len``, ``paragraph_sentence_cnt`` and
        ``paragraph_word_cnt`` columns (one row per kept paragraph).
        """
        tmp = tmp.explode('paragraph')
        tmp = tmp.with_columns(
            pl.col('paragraph').map_elements(self.dataPreprocessing, return_dtype=pl.Utf8)
        )
        tmp = tmp.with_columns(
            pl.col('paragraph')
            .map_elements(lambda x: len(x), return_dtype=pl.Int64)
            .alias("paragraph_len")
        )
        # Drop very short fragments (headings, stray lines).
        tmp = tmp.filter(pl.col('paragraph_len') >= 25)
        tmp = tmp.with_columns(
            pl.col('paragraph')
            .map_elements(lambda x: len(x.split(".")), return_dtype=pl.Int64)
            .alias("paragraph_sentence_cnt"),
            pl.col('paragraph')
            .map_elements(lambda x: len(x.split(" ")), return_dtype=pl.Int64)
            .alias("paragraph_word_cnt"),
        )
        return tmp

    def Paragraph_Eng(self,train_tmp):
        """Summarise the output of ``Paragraph_Preprocess`` into one row per essay."""
        aggs = [
            # number of paragraphs with at least i characters
            *[pl.col('paragraph').filter(pl.col('paragraph_len') >= i)
              .count().alias(f'paragraph_{i}_cnt') for i in self._PARAGRAPH_LEN_THRESHOLDS],
            *self._summary_aggs(self.paragraph_fea),
        ]
        return self._aggregate(train_tmp, aggs, "Paragraph_Eng")

    def Sentence_Preprocess(self,tmp):
        """Explode essays into period-separated sentences of at least 15 characters.

        Adds ``sentence_len`` and ``sentence_word_cnt`` columns.
        """
        tmp = tmp.with_columns(
            pl.col('full_text')
            .map_elements(self.dataPreprocessing, return_dtype=pl.Utf8)
            .str.split(by=".")
            .alias("sentence")
        )
        tmp = tmp.explode('sentence')
        tmp = tmp.with_columns(
            pl.col('sentence')
            .map_elements(lambda x: len(x), return_dtype=pl.Int64)
            .alias("sentence_len")
        )
        tmp = tmp.filter(pl.col('sentence_len') >= 15)
        tmp = tmp.with_columns(
            pl.col('sentence')
            .map_elements(lambda x: len(x.split(' ')), return_dtype=pl.Int64)
            .alias("sentence_word_cnt")
        )
        return tmp

    def Sentence_Eng(self,train_tmp):
        """Summarise the output of ``Sentence_Preprocess`` into one row per essay."""
        aggs = [
            # number of sentences with at least i characters
            *[pl.col('sentence').filter(pl.col('sentence_len') >= i)
              .count().alias(f"sentence_{i}_cnt") for i in self._SENTENCE_LEN_THRESHOLDS],
            *self._summary_aggs(self.sentence_fea),
        ]
        return self._aggregate(train_tmp, aggs, "Sentence_Eng")

    def Word_Preprocess(self,tmp):
        """Explode essays into space-separated, non-empty words with a ``word_len`` column."""
        tmp = tmp.with_columns(
            pl.col('full_text')
            .map_elements(self.dataPreprocessing, return_dtype=pl.Utf8)
            .str.split(by=" ")
            .alias("word")
        )
        tmp = tmp.explode('word')
        tmp = tmp.with_columns(
            pl.col('word')
            .map_elements(lambda x: len(x), return_dtype=pl.Int64)
            .alias("word_len")
        )
        tmp = tmp.filter(pl.col('word_len') != 0)
        return tmp

    def Word_Eng(self,train_tmp):
        """Summarise the output of ``Word_Preprocess`` into one row per essay."""
        aggs = [
            # number of words with at least n characters, n = 1..15
            *[pl.col('word').filter(pl.col('word_len') >= n)
              .count().alias(f"word_{n}_cnt") for n in range(1, 16)],
            pl.col('word_len').max().alias("word_len_max"),
            pl.col('word_len').mean().alias("word_len_mean"),
            pl.col('word_len').std().alias("word_len_std"),
            pl.col('word_len').quantile(0.25).alias("word_len_q1"),
            pl.col('word_len').quantile(0.50).alias("word_len_q2"),
            pl.col('word_len').quantile(0.75).alias("word_len_q3"),
        ]
        return self._aggregate(train_tmp, aggs, "Word_Eng")

    def _length_features(self, dataset, with_score):
        """Build the paragraph/sentence/word feature table for ``dataset``.

        When ``with_score`` is true the ``score`` label is attached by joining
        on ``essay_id``. The aggregated table is sorted by ``essay_id`` and
        may have lost essays to the length filters, so assigning the raw
        ``score`` column by position (as was done before) is only correct
        when the CSV happens to be sorted and nothing was filtered out.
        """
        feats = self.Paragraph_Eng(self.Paragraph_Preprocess(dataset))
        if with_score:
            labels = pd.DataFrame({
                'essay_id': dataset['essay_id'].to_list(),
                'score': dataset['score'].to_list(),
            })
            feats = feats.merge(labels, on='essay_id', how='left')
        feats = feats.merge(self.Sentence_Eng(self.Sentence_Preprocess(dataset)),
                            on='essay_id', how='left')
        feats = feats.merge(self.Word_Eng(self.Word_Preprocess(dataset)),
                            on='essay_id', how='left')
        return feats

    @staticmethod
    def _attach_tfidf(feats, essay_ids, matrix):
        """Join a dense TF-IDF ``matrix`` onto ``feats`` by ``essay_id``.

        Args:
            feats: pandas feature table containing an ``essay_id`` column.
            essay_ids: ids of the essays in the SAME row order as ``matrix``
                (i.e. the order the texts were given to the vectorizer).
            matrix: 2-D array, one row per essay id.

        Returns:
            ``feats`` with ``tfid_0 .. tfid_{k-1}`` columns appended.
        """
        tfidf = pd.DataFrame(matrix)
        tfidf.columns = [f'tfid_{i}' for i in range(tfidf.shape[1])]
        # Label rows with the ids of the texts that produced them, not with the
        # (sorted, possibly shorter) id column of the feature table.
        tfidf['essay_id'] = list(essay_ids)
        return feats.merge(tfidf, on='essay_id', how='left')

    def _feature_count(self, feats):
        """Number of model-input columns (everything but id and label)."""
        return sum(1 for col in feats.columns if col not in self._ID_AND_LABEL)

    def process(self):
        """Build the training feature table and fit the TF-IDF vectorizer.

        Returns:
            pandas DataFrame with ``essay_id``, ``score`` and all features.
        """
        train_feats = self._length_features(self.train_dataset, with_score=True)
        train_tfid = self.vectorizer.fit_transform(self.train_dataset['full_text'].to_list())
        train_feats = self._attach_tfidf(
            train_feats,
            self.train_dataset['essay_id'].to_list(),
            train_tfid.toarray(),
        )
        print('feature_num: ', self._feature_count(train_feats))
        return train_feats

    def process_test(self):
        """Build the test feature table with the vectorizer fitted by ``process()``.

        Returns:
            pandas DataFrame with ``essay_id`` and all features (no ``score``).
        """
        test_feats = self._length_features(self.test_dataset, with_score=False)
        test_tfid = self.vectorizer.transform(self.test_dataset['full_text'].to_list())
        test_feats = self._attach_tfidf(
            test_feats,
            self.test_dataset['essay_id'].to_list(),
            test_tfid.toarray(),
        )
        print('feature_num: ', self._feature_count(test_feats))
        return test_feats

In [3]:
class LGBM():
    """Averaging ensemble of LightGBM regressors trained with a QWK-style objective.

    Scores are shifted by ``-a`` before training and predictions are shifted
    back by ``+a``, clipped to ``[1, 6]`` and rounded.

    Note that ``fit`` refits the same ensemble on every fold, so after ``fit``
    returns the ensemble holds the fit from the final fold's training split.
    """

    # Columns of a feature table that must never be fed to the model.
    _NON_FEATURE_COLUMNS = ('essay_id', 'score')

    def __init__(self):
        """Load the competition CSVs and build the (unfitted) voting ensemble."""
        self.data_train = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
        self.data_test = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
        self.num_models = 3
        self.acc_metrics = []
        self.cohen_metrics = []

        # `a` is the offset removed from the scores before training and added
        # back to predictions; `b` is the constant added inside the
        # denominator term of qwk_obj.
        # NOTE(review): both look like constants tuned on the training scores
        # - confirm their origin before changing them.
        self.a = 2.948
        self.b = 1.092

        self.lgb_parameters = {
                                 'metrics': 'None',
                                 'objective': self.qwk_obj,
                                 'learning_rate': 0.1,
                                 'max_depth': 5,
                                 'num_leaves': 15, # keep at or below 2^max_depth (32 here)
                                 'colsample_bytree': 0.5,
                                 'min_data_in_leaf': 100,
                                 'reg_alpha': 0.8,
                                 'n_estimators': 256,
                                 'verbosity': -1,
        }

        # Identical regressors that differ only in their random seed.
        self.model = VotingRegressor(
            estimators=[
                (f"lgb_{i}", lgb.LGBMRegressor(**self.lgb_parameters, random_state=i+40))
                for i in range(self.num_models)
            ],
            n_jobs=-1,
        )

    def _feature_columns(self, df):
        """Return the columns of ``df`` that are model inputs."""
        return [col for col in df.columns if col not in self._NON_FEATURE_COLUMNS]

    def quadratic_weighted_kappa(self,y_true,y_pred):
        """Quadratic weighted kappa between shifted targets and raw predictions.

        Args:
            y_true: targets in the shifted space (``score - a``).
            y_pred: raw model outputs in the shifted space.

        Returns:
            ``("QWK", value, True)``.
        """
        # Rounding turns the labels back into exact integers: (score - a) + a
        # is not guaranteed to reproduce `score` bit-for-bit in floating point,
        # and near-integer floats would be treated as separate classes.
        y_true = np.rint(np.asarray(y_true) + self.a).astype(int)
        y_pred = np.rint((np.asarray(y_pred) + self.a).clip(1, 6)).astype(int)
        qwk = cohen_kappa_score(y_true, y_pred, weights='quadratic')

        return "QWK",qwk,True

    def qwk_obj(self,y_true,y_pred):
        """Custom LightGBM objective: gradient and hessian of a QWK surrogate.

        The loss is ``f / g`` with ``f = 1/2 * sum((pred - label)^2)`` and
        ``g = 1/2 * sum((pred - a)^2 + b)``, evaluated on un-shifted, clipped
        predictions. The hessian is fixed to ones.

        Returns:
            ``(grad, hess)`` arrays with one entry per sample.
        """
        labels = y_true + self.a
        preds = (y_pred + self.a).clip(1,6)
        f = 1/2 * np.sum((preds-labels)**2)
        g = 1/2 * np.sum((preds-self.a)**2+self.b)
        df = preds - labels
        dg = preds - self.a
        # Quotient rule, scaled by the number of samples.
        grad = (df/g - f*dg/g**2)*len(labels)
        hess = np.ones(len(labels))

        return grad,hess

    def fit(self,df,fold):
        """Run 5-fold cross-validation on ``df`` and print the mean QWK.

        Args:
            df: pandas feature table with ``essay_id``, ``score`` and features.
            fold: unused; kept so existing callers keep working.
        """
        feature_names = self._feature_columns(df)
        kfold = KFold(n_splits=5, random_state=44, shuffle=True)
        # Start from a clean slate so a second fit() call does not average in
        # the folds of an earlier run.
        self.cohen_metrics = []

        # KFold only needs the number of rows, so df itself is enough here.
        folds = enumerate(kfold.split(df))
        for fold_id, (trn_idx, val_idx) in tqdm(folds, total=kfold.get_n_splits()):
            train_part = df.iloc[trn_idx]
            val_part = df.iloc[val_idx]
            print('\nFold_{} Training ================================\n'.format(fold_id))

            self.model.fit(train_part[feature_names], train_part['score'] - self.a)
            pred_val = self.model.predict(val_part[feature_names])
            _, qwk, _ = self.quadratic_weighted_kappa(
                (val_part['score'] - self.a).values, pred_val
            )
            self.cohen_metrics.append(qwk)

        average_cohen = np.mean(self.cohen_metrics)
        print(f'Average Cohen all fold: {average_cohen:.4f}')

    def predict(self,df):
        """Return rounded essay scores in ``[1, 6]`` for the rows of ``df``.

        The offset ``a`` is added to the model OUTPUT; it was previously added
        to the feature matrix, which shifted every feature and left the
        predictions in the shifted space.
        """
        raw = self.model.predict(df[self._feature_columns(df)])
        return (raw + self.a).clip(1, 6).round()

    def submit(self,df):
        """Build the submission frame (``essay_id``, ``score``) for the test set.

        Predictions are matched to ``data_test`` by ``essay_id`` because the
        feature table is sorted by id and may lack essays removed by the
        feature filters; such essays get a null score. If ``df`` has no
        ``essay_id`` column the rows are assumed to be in ``data_test`` order.
        """
        scores = self.predict(df)
        submission = self.data_test.select('essay_id')
        if 'essay_id' not in df.columns:
            return submission.with_columns(score = scores)

        by_id = dict(zip(df['essay_id'].tolist(), (float(s) for s in scores)))
        aligned = [by_id.get(essay_id) for essay_id in submission['essay_id'].to_list()]
        return submission.with_columns(pl.Series('score', aligned, dtype=pl.Float64))

In [4]:
# Build the feature tables. process() has to run before process_test():
# it fits the TF-IDF vectorizer that process_test() then only applies.
FE = FeatureEngineering()
train_feature = FE.process()
test_feature = FE.process_test()

done Paragraph_Eng + 24 features
done Sentence_Eng + 18 features
done Word_Eng + 22 features
feature_num:  3235
done Paragraph_Eng + 24 features
done Sentence_Eng + 18 features
done Word_Eng + 22 features
feature_num:  3234


In [5]:
# Cross-validate the ensemble on the training features; fit() prints the
# average QWK over its 5 folds. The `fold` argument is not used by fit().
model = LGBM()
model.fit(df=train_feature,fold=0)

0it [00:00, ?it/s]





[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function


[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function


[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function


[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
Average Cohen all fold: 0.8002


In [6]:
# Predict the test essays, show the result and write the submission file.
# `display` is presumably the notebook's rich-output helper (not imported here).
submission = model.submit(test_feature)
display(submission)
submission.write_csv('submission.csv')

essay_id,score
str,f64
"""000d118""",2.0
"""000fe60""",3.0
"""001ab80""",4.0
