In [10]:
# Decomment this line of code if you already have those downloaded
!pip install --no-index --no-deps /kaggle/input/aes-whls/aes_whls/pyphen-0.15.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/aes-whls/aes_whls/textstat-0.7.3-py3-none-any.whl

Processing /kaggle/input/aes-whls/aes_whls/pyphen-0.15.0-py3-none-any.whl
pyphen is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/aes-whls/aes_whls/textstat-0.7.3-py3-none-any.whl
textstat is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Basic libary
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
import torch
import re
import optuna
import textstat
from optuna.samplers import TPESampler
# cmap = plt.cm.get_cmap('coolwarm')
import warnings
warnings.filterwarnings('ignore')


# Use for pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import cohen_kappa_score, f1_score, make_scorer

# Use for training model
from scipy.stats import randint
from nltk.tokenize import word_tokenize
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier 
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [12]:
df_train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
df_test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')

In [13]:
essay_id_dropped = df_train['essay_id']
df_train = df_train.drop('essay_id', axis = 1)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   full_text  17307 non-null  object
 1   score      17307 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 270.5+ KB


In [14]:
df_train.head(5)

Unnamed: 0,full_text,score
0,Many people have car where they live. The thin...,3
1,I am a scientist at NASA that is discussing th...,3
2,People always wish they had the same technolog...,4
3,"We all heard about Venus, the planet without a...",4
4,"Dear, State Senator\n\nThis is a letter to arg...",3


In [15]:
def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)

def dataPreprocessing(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    x = re.sub("'\d+", '',x)
    x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = re.sub(r'\xa0', '', x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return x

In [16]:
def textstat_features(text):
    features = {}
    features['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    features['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    features['smog_index'] = textstat.smog_index(text)
    features['coleman_liau_index'] = textstat.coleman_liau_index(text)
    features['automated_readability_index'] = textstat.automated_readability_index(text)
    features['dale_chall_readability_score'] = textstat.dale_chall_readability_score(text)
    features['difficult_words'] = textstat.difficult_words(text)
    features['linsear_write_formula'] = textstat.linsear_write_formula(text)
    features['gunning_fog'] = textstat.gunning_fog(text)
    features['text_standard'] = textstat.text_standard(text, float_output=True)
    features['spache_readability'] = textstat.spache_readability(text)
    features['mcalpine_efg_time'] = textstat.reading_time(text)
    features['syllablaw'] = textstat.mcalpine_eflaw(text)
    features['readinle_count'] = textstat.syllable_count(text)
    features['lexicon_count'] = textstat.lexicon_count(text)
    features['monosyllabcount'] = textstat.monosyllabcount(text)
    return features

In [17]:
# Number of words
import string
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    df['word_count'] = df['full_text'].apply(lambda x: len(x.split()))

    # Length
    df['essay_length'] = df['full_text'].str.len()

    # Sentences count
    # Adding a new column 'sentences_count' that counts the sentences in 'full_text'
    df['sentences_count'] = df['full_text'].str.count(r'\.')

    # Paragraph count
    # Adding a new column 'paragraph_count' that counts the paragraphs in 'full_text'
    df['paragraph_count'] = df['full_text'].str.count(r'\n') + 1
    
    df["text_tokens"] = df["full_text"].apply(lambda x: word_tokenize(x))
    df["word_count"] = df["text_tokens"].apply(lambda x: len(x))
    df["unique_word_count"] = df["text_tokens"].apply(lambda x: len(set(x)))
    df.drop(columns=["text_tokens"], inplace=True)
    
    df["processed_text"] = df["full_text"].apply(lambda x: dataPreprocessing(x))
    df["text_tokens"] = df["processed_text"].apply(lambda x: word_tokenize(x))
    df["text_length_p"] = df["processed_text"].apply(lambda x: len(x))
    df["word_count_p"] = df["text_tokens"].apply(lambda x: len(x))
    df["unique_word_count_p"] = df["text_tokens"].apply(lambda x: len(set(x)))
    
    # Applying textstat features
    df['textstat_features'] = df['processed_text'].apply(textstat_features)
    textstat_df = pd.DataFrame(df['textstat_features'].tolist())
    df = pd.concat([df, textstat_df], axis=1)

    df.drop(columns=["processed_text", "text_tokens", "textstat_features"], inplace=True)
    
    return df

def clean_text(text):
    text = re.sub(r'\n', ' ', text)  # Loại bỏ xuống dòng
    text = re.sub(r'[^\w\s]', '', text)  # Loại bỏ ký tự đặc biệt
    text = re.sub(r'\xa0', '', text)
    text = text.lower()  # Chuyển thành chữ thường
    return text

In [18]:
df_train = preprocess_df(df_train)
df_train['clean_text'] = df_train['full_text'].apply(clean_text)
df_train = df_train.drop('full_text', axis = 1)

In [19]:
df_train.head(5)

Unnamed: 0,score,word_count,essay_length,sentences_count,paragraph_count,unique_word_count,text_length_p,word_count_p,unique_word_count_p,flesch_reading_ease,...,linsear_write_formula,gunning_fog,text_standard,spache_readability,mcalpine_efg_time,syllablaw,readinle_count,lexicon_count,monosyllabcount,clean_text
0,3,545,2677,13,1,248,2640,539,227,58.69,...,13.0,17.08,12.0,7.2,31.58,53.4,624,489,396,many people have car where they live the thing...
1,3,371,1669,19,9,168,1663,371,152,87.55,...,6.714286,7.48,7.0,3.92,19.57,25.7,398,332,275,i am a scientist at nasa that is discussing th...
2,4,605,3077,24,7,243,3065,605,231,65.15,...,15.5,11.49,12.0,5.12,36.96,32.6,767,550,417,people always wish they had the same technolog...
3,4,511,2701,23,9,241,2674,502,223,58.62,...,15.75,11.85,13.0,5.32,32.74,28.9,678,441,284,we all heard about venus the planet without al...
4,3,418,2208,15,11,156,2184,417,148,54.76,...,19.666667,12.61,13.0,5.61,26.62,35.6,561,372,240,dear state senator this is a letter to argue ...


In [20]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   score                         17307 non-null  int64  
 1   word_count                    17307 non-null  int64  
 2   essay_length                  17307 non-null  int64  
 3   sentences_count               17307 non-null  int64  
 4   paragraph_count               17307 non-null  int64  
 5   unique_word_count             17307 non-null  int64  
 6   text_length_p                 17307 non-null  int64  
 7   word_count_p                  17307 non-null  int64  
 8   unique_word_count_p           17307 non-null  int64  
 9   flesch_reading_ease           17307 non-null  float64
 10  flesch_kincaid_grade          17307 non-null  float64
 11  smog_index                    17307 non-null  float64
 12  coleman_liau_index            17307 non-null  float64
 13  a

In [21]:
# rows_to_drop = df_train.query('sentences_count > 60 | (sentences_count > 50 & score == 1)').index

# # Xóa các dòng có index tương ứng

# df_train.drop(rows_to_drop, inplace=True)

In [22]:
# rows_to_drop = df_train.query('paragraph_count > 80 | (paragraph_count > 55 & score == 1 ) | (paragraph_count >60 & score == 2 ) | (paragraph_count > 30  & score == 5) | (paragraph_count > 25  & score == 6)').index

# # Xóa các dòng có index tương ứng

# df_train.drop(rows_to_drop, inplace=True)

In [23]:
# Decomment if want to use Pipeline again
print("Start running...")
# Define transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse = False))
])

# Add text vectorization step
text_vectorizer = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=(1, 3),
    strip_accents='unicode',
    analyzer='word',
    min_df=0.05,
    max_df=0.95,
    sublinear_tf=True
)
text_transformer = Pipeline(steps=[
    ('vectorizer', text_vectorizer)
])

# Update categorical and numerical columns
numerical_columns = df_train.select_dtypes('int64').columns
categorical_columns = df_train.select_dtypes('object').columns

# Remove target variable from numerical columns
numerical_columns = numerical_columns.drop('score')

# Combine transformers using ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('text', text_transformer, 'clean_text')  # Include the 'clean_text' column
    ],remainder = 'passthrough')

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)])

# Apply the pipeline to your dataset
X = df_train.drop('score', axis=1)
y = np.log(df_train['score'])
X_preprocessed = pipeline.fit_transform(X)
print("This Pipeline is done")

Start running...
This Pipeline is done


In [24]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, 
                                                    test_size=0.2, random_state=42)

In [25]:
# # idea from https://www.kaggle.com/code/rsakata/optimize-qwk-by-lgb/notebook#QWK-objective
def quadratic_weighted_kappa(y_pred, dtrain):
    y_true = dtrain.get_label()
    y_true = y_true + a
    y_pred = (y_pred + a).clip(1, 6).round()
    qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    return 'QWK', qwk

# metric and objective based on public notebooks
def qwk_obj(y_true, y_pred):
    labels = y_true + a
    preds = y_pred + a
    preds = preds.clip(1, 6)
    f = 1/2*np.sum((preds-labels)**2)
    g = 1/2*np.sum((preds-a)**2+b)
    df = preds - labels
    dg = preds - a
    grad = (df/g - f*dg/g**2)*len(labels)
    hess = np.ones(len(labels))
    return grad, hess

a = 2.998
b = 1.092

In [26]:
# f1_scores = []
# kappa_scores = []
# predictions = []
# # Define the models
# models = {
#     'XGBoost': XGBRegressor(random_state=42)
# }

# # Define the hyperparameter grids for each model
# param_grids = {
#     'XGBoost': {
#         'n_estimators': [1024],
#         'learning_rate': [0.1],
#         'max_depth': [8],
#         'subsample': [0.5],
# #         'colsample_bytree': [0.5]
#     }
# }

# # 3-fold cross-validation
# cv = KFold(n_splits=3, shuffle=True, random_state=42)

# # Train and tune the models
# grids = {}
# for model_name, model in models.items():
#     grids[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], 
#                                cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
#     grids[model_name].fit(X_train, y_train, eval_metric=quadratic_weighted_kappa)
#     best_params = grids[model_name].best_params_
#     best_score = np.sqrt(-1 * grids[model_name].best_score_)
    
#     print(f'Best parameters for {model_name}: {best_params}')
#     print(f'Best RMSE for {model_name}: {best_score}\n')

In [27]:
print("Start running...")

# # Define the hyperparameter grids for each model
# param_grids = {
#     "n_estimators": randint(100, 500),
#     "learning_rate": round(np.random.random(),1),
#     "max_depth": randint(1, 9),
#     "subsample": round(np.random.random(),1),
#     "max_features": randint(1, 9),
# }

xgb_callbacks = [
    xgb.callback.EvaluationMonitor(period=25),
    xgb.callback.EarlyStopping(75, metric_name="QWK", maximize=True, save_best=True)
]

xgb_regressor = xgb.XGBRegressor(
    objective=qwk_obj,  # Use custom QWK objective function
    n_estimators= 207,
    learning_rate= 0.10704240620854825,
    min_split_loss = 1,
    max_depth= 6,
    subsample= 0.6389081081835488,
    max_bin = 337,
    random_state=42,
    num_leaves = 10,
    extra_trees=True,
    class_weight='balanced',
    tree_method="hist"
)

# Train the model
xgb_regressor.fit(X_train, y_train)
#                 eval_set=[(X_train, y_train), (X_test, y_test)],
#                 eval_metric=quadratic_weighted_kappa,
#                 callbacks=xgb_callbacks)


# # Make predictions on the validation set
# y_test_pred = xgb_regressor.predict(X_test)

# # Convert predictions back to the original scale
# # y_test_pred_original = np.exp(y_test_pred)
# y_test = y_test + a
# y_test_pred = (y_test_pred + a).clip(1, 6).round()


# # Calculate QWK on the validation set
# qwk_score = cohen_kappa_score(y_test, y_test_pred_original.round(), weights="quadratic")
# print(f"Validation QWK Score: {qwk_score:.4f}")

# score = quadratic_weighted_kappa(y_test, y_pred)
# print(f"Train QWK: {score}")

Start running...


In [28]:
y_pred_log = xgb_regressor.predict(X_test)
print(y_test)
y_pred = np.exp(y_pred_log)
print(y_pred.round())

12696    1.098612
4625     1.098612
733      1.098612
16885    1.098612
3334     1.386294
           ...   
16145    0.000000
4229     0.693147
4313     0.693147
934      0.693147
5058     0.693147
Name: score, Length: 3462, dtype: float64
[4. 4. 2. ... 3. 2. 2.]


In [29]:
# Make predictions on the validation set
y_pred_log = xgb_regressor.predict(X_test)
y_pred = np.exp(y_pred_log)
# y_pred = y_pred + a
y_pred = y_pred.clip(1, 6).round()
y_test_exp = np.exp(y_test)

# Calculate the Cohen's kappa score
score = cohen_kappa_score(y_test_exp.round(), y_pred, weights='quadratic')
print(f"Train QWK: {score}")

Train QWK: 0.7725529001484701


In [30]:
# def objective(trial):
#     param_grid = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 300),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'min_split_loss': trial.suggest_int('min_split_loss', 0, 9),
#         'max_depth': trial.suggest_int('max_depth', 6, 12),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'max_bin': trial.suggest_int('max_bin', 256, 512)
#     }

#     xgb_regressor = xgb.XGBRegressor(
#         objective=qwk_obj,
#         random_state=42,
#         tree_method="hist",
#         **param_grid
#     )
    
#     xgb_regressor.fit(X_train, y_train)
    
#     y_pred_log = xgb_regressor.predict(X_test)
#     y_pred = np.exp(y_pred_log)
#     # y_pred = y_pred + a
#     y_pred = y_pred.clip(1, 6).round()
#     y_test_exp = np.exp(y_test)

#     # Calculate the Cohen's kappa score
#     score = cohen_kappa_score(y_test_exp.round(), y_pred, weights='quadratic')
#     return np.mean(score)

In [31]:
# # Set up the Optuna study
# study = optuna.create_study(direction='maximize', sampler=TPESampler())
# study.optimize(objective, n_trials=40, n_jobs=-1)

In [32]:
# # Get the best hyperparameters
# best_params = study.best_params
# print("Best Hyperparameters:", best_params)

In [33]:
# # Train the final model with the best hyperparameters
# xgb_regressor = xgb.XGBRegressor(
#     objective=qwk_obj,
#     random_state=42,
#     **best_params
# )

# xgb_regressor.fit(
#     X_train, y_train,
#     early_stopping_rounds=10,
#     eval_set=[(X_test, y_test)],
#     verbose=True
# )

# y_pred_log = xgb_regressor.predict(X_test)
# y_pred = np.exp(y_pred_log)
# # y_pred = y_pred + a
# y_pred = y_pred.clip(1, 6).round()
# y_test_exp = np.exp(y_test)

# # Calculate the Cohen's kappa score
# score = cohen_kappa_score(y_test_exp.round(), y_pred, weights='quadratic')
# print(f"Train QWK: {score}")

In [34]:
# df_test = pd.read_csv('/kaggle/input/test-processed-csv/test_processed.csv')
# df_test

df_test = preprocess_df(df_test)
df_test['clean_text'] = df_test['full_text'].apply(clean_text)
df_test = df_test.drop('full_text', axis = 1)
df_test.head()

Unnamed: 0,essay_id,word_count,essay_length,sentences_count,paragraph_count,unique_word_count,text_length_p,word_count_p,unique_word_count_p,flesch_reading_ease,...,linsear_write_formula,gunning_fog,text_standard,spache_readability,mcalpine_efg_time,syllablaw,readinle_count,lexicon_count,monosyllabcount,clean_text
0,000d118,545,2677,13,1,248,2640,539,227,58.69,...,13.0,17.08,12.0,7.2,31.58,53.4,624,489,396,many people have car where they live the thing...
1,000fe60,371,1669,19,9,168,1663,371,152,87.55,...,6.714286,7.48,7.0,3.92,19.57,25.7,398,332,275,i am a scientist at nasa that is discussing th...
2,001ab80,605,3077,24,7,243,3065,605,231,65.15,...,15.5,11.49,12.0,5.12,36.96,32.6,767,550,417,people always wish they had the same technolog...


In [35]:
# Lưu lại cột essay_id để sử dụng sau này
essay_ids = df_test['essay_id']

# Xóa cột essay_id trước khi tiền xử lý
X_test = df_test.drop('essay_id', axis=1)

# Áp dụng pipeline đã huấn luyện để tiền xử lý dữ liệu test
X_test_preprocessed = pipeline.transform(X_test)

# Dự đoán điểm số trên dữ liệu test đã tiền xử lý
y_pred_log = xgb_regressor.predict(X_test_preprocessed)

# Chuyển đổi ngược từ log-transform
y_pred = np.exp(y_pred_log)

# Tạo dataframe chứa essay_id và dự đoán điểm số
results = pd.DataFrame({'essay_id': essay_ids, 'score': y_pred})

# Làm tròn điểm số
results['score'] = round(results['score'])
results['score'] = results['score'].astype(int)

# Lưu kết quả ra file csv
submission = results.to_csv('submission.csv', index=False)

In [36]:
results

Unnamed: 0,essay_id,score
0,000d118,3
1,000fe60,3
2,001ab80,4
