In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
import torch
import re
# Diverging colour map for plots.  `plt.cm.get_cmap` was deprecated in
# Matplotlib 3.7 and removed in 3.9; `plt.get_cmap` is available on both old and
# new releases and returns the same colormap object.
cmap = plt.get_cmap('coolwarm')
import warnings
warnings.filterwarnings('ignore')


# Use for pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import cohen_kappa_score, f1_score

from nltk.tokenize import word_tokenize
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier 
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


In [None]:
# Read the training and test essays into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv',
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv',
    )
)

In [None]:
# Keep the ids in their own Series, then continue with a frame that no longer
# carries the `essay_id` column, and print its column/dtype summary.
essay_id_dropped = df_train['essay_id']
df_train = df_train.drop(columns='essay_id')
df_train.info()

In [None]:
# Preview the first five rows of the training frame.
df_train.head(5)

In [None]:
def removeHTML(x):
    """Return ``x`` with every ``<...>`` span (shortest match) removed.

    The pattern's ``.`` does not match a newline, so a ``<`` and ``>`` that sit
    on different lines are left untouched.
    """
    return re.sub(r'<.*?>', r'', x)


# Contraction -> expansion table.  Ambiguous "'d" forms (had / would) are left
# commented out on purpose so they are not expanded.
cList = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not",
    # "he'd": "he would",  ## --> he had or he would
    "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    # "I'd": "I would",   ## --> I had or I would
    "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not",
    # "it'd": "it had",   ## --> it had or it would
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    # "she'd": "she would",   ## --> she had or she would
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so is",
    # "that'd": "that would",
    "that'd've": "that would have", "that's": "that is",
    # "there'd": "there had",
    "there'd've": "there would have", "there's": "there is",
    # "they'd": "they would",
    "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "weren't": "were not",
    # "we'd": "we had",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have",
    "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
    "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'alls": "you alls", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are",
    # "you'll" / "you'll've" previously expanded to "you you will ..." (typo).
    "y'all've": "you all have", "you'd": "you had", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
    "you're": "you are", "you've": "you have"
}

# Lower-cased lookup so that text which was lower-cased before expansion
# (e.g. "i'm") still finds keys that are capitalised in cList ("I'm").
_CONTRACTIONS_LOWER = {key.lower(): value for key, value in cList.items()}

# Longest alternatives first: the regex engine takes the first alternative that
# matches, so "can't" listed before "can't've" would leave a dangling "'ve".
# re.ASCII keeps case-insensitive matching to ASCII letters, which guarantees
# that the lower-cased match is always a key of _CONTRACTIONS_LOWER.
c_re = re.compile(
    '(%s)' % '|'.join(
        re.escape(key)
        for key in sorted(_CONTRACTIONS_LOWER, key=len, reverse=True)
    ),
    re.IGNORECASE | re.ASCII,
)


def expandContractions(text):
    """Replace every contraction listed in ``cList`` with its expansion.

    Matching is case-insensitive.  The casing of the replacement follows the
    matched text: an all-lower-case match yields a lower-case expansion, a
    match starting with a capital yields an expansion starting with a capital,
    anything else yields the expansion exactly as written in ``cList``.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with known contractions expanded.
    """
    def replace(match):
        found = match.group(0)
        expansion = _CONTRACTIONS_LOWER[found.lower()]
        if found.islower():
            return expansion.lower()
        if found[0].isupper():
            return expansion[0].upper() + expansion[1:]
        return expansion
    return c_re.sub(replace, text)

def remove_punctuation(text):
    """
    Remove all punctuation from the input text.

    "Punctuation" means the ASCII characters in ``string.punctuation``.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with punctuation removed.
    """
    # Imported here because the notebook only imports `string` in a later cell;
    # without this the function raises NameError if called before that cell ran.
    import string

    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def _normalize_text(x, expand_contractions=False, strip_punctuation=False):
    """Shared cleaning steps behind the four ``dataPreprocessing*`` variants.

    Steps, in order: lower-case, remove HTML tags, remove ``@word`` tokens,
    remove digits, remove URLs, collapse whitespace, optionally expand
    contractions, collapse repeated periods/commas, optionally remove
    punctuation, strip leading/trailing whitespace.

    Args:
    - x (str): The input text.
    - expand_contractions (bool): Run ``expandContractions`` after whitespace
      has been collapsed.
    - strip_punctuation (bool): Run ``remove_punctuation`` as the last step
      before stripping.

    Returns:
    - str: The cleaned text.
    """
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub(r"@\w+", '', x)
    # Delete numbers (an apostrophe directly in front of digits goes with them)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Delete URLs.  The previous pattern `http\w+` stopped at the ':' and left
    # "://host/path" in the text; `\S+` consumes the URL up to the next space.
    x = re.sub(r"http\S+", '', x)
    # Replace consecutive whitespace with a single space character
    x = re.sub(r"\s+", " ", x)
    if expand_contractions:
        x = expandContractions(x)
    # Replace consecutive periods and commas with a single period / comma
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r",+", ",", x)
    if strip_punctuation:
        x = remove_punctuation(x)
    # Remove whitespace at the beginning and end
    return x.strip()


def dataPreprocessing(x):
    """Clean ``x`` with the base steps only (see ``_normalize_text``)."""
    return _normalize_text(x)


def dataPreprocessing_w_contract(x):
    """Clean ``x`` and additionally expand contractions."""
    return _normalize_text(x, expand_contractions=True)


def dataPreprocessing_w_punct_remove(x):
    """Clean ``x`` and additionally remove punctuation."""
    return _normalize_text(x, strip_punctuation=True)


def dataPreprocessing_w_contract_punct_remove(x):
    """Clean ``x``, expand contractions, then remove punctuation."""
    return _normalize_text(x, expand_contractions=True, strip_punctuation=True)

In [None]:
# Number of words
import string
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Add simple length/count features derived from the ``full_text`` column.

    Columns added (in this order):
    - ``word_count``: number of tokens returned by ``word_tokenize``.
    - ``essay_length``: number of characters.
    - ``sentences_count``: number of literal ``.`` characters.
    - ``paragraph_count``: number of newline characters plus one.
    - ``text_tokens``: the token list returned by ``word_tokenize``.
    - ``unique_word_count``: number of distinct tokens.

    Args:
    - df (pd.DataFrame): Frame with a string column ``full_text``.

    Returns:
    - pd.DataFrame: A copy of ``df`` with the feature columns added; the frame
      passed in is left unmodified, so re-running a cell that calls this
      function gives the same result.
    """
    df = df.copy()
    essays = df['full_text']

    # Tokenise once; the earlier whitespace-split word count was always
    # overwritten by the token count, so only the token count is computed.
    tokens = essays.apply(word_tokenize)

    df['word_count'] = tokens.apply(len)
    df['essay_length'] = essays.str.len()
    df['sentences_count'] = essays.str.count(r'\.')
    df['paragraph_count'] = essays.str.count(r'\n') + 1
    df['text_tokens'] = tokens
    df['unique_word_count'] = tokens.apply(lambda toks: len(set(toks)))
    return df

def clean_text(text):
    """Flatten ``text`` to lower-case words separated by whitespace.

    Newlines become spaces, every character that is neither a word character
    nor whitespace is removed, and the result is lower-cased.
    """
    without_breaks = re.sub(r'\n', ' ', text)             # drop line breaks
    without_symbols = re.sub(r'[^\w\s]', '', without_breaks)  # drop special characters
    return without_symbols.lower()                         # lower-case

In [None]:
# Add the count features and the cleaned text to the training frame, then drop
# the raw essay text.
df_train = (
    preprocess_df(df_train)
    .assign(clean_text=lambda frame: frame['full_text'].apply(clean_text))
    .drop(columns='full_text')
)

In [None]:
# Preview the first five rows after feature engineering.
df_train.head(5)

In [None]:
# Column names, non-null counts and dtypes of the engineered training frame.
df_train.info()

In [None]:
# rows_to_drop = df_train.query('sentences_count > 60 | (sentences_count > 50 & score == 1)').index

# # Drop the rows with the matching index

# df_train.drop(rows_to_drop, inplace=True)

In [None]:
# rows_to_drop = df_train.query('paragraph_count > 80 | (paragraph_count > 55 & score == 1 ) | (paragraph_count >60 & score == 2 ) | (paragraph_count > 30  & score == 5) | (paragraph_count > 25  & score == 6)').index

# # Drop the rows with the matching index

# df_train.drop(rows_to_drop, inplace=True)

In [None]:
# Feature pipeline: scaled numeric features + TF-IDF of the cleaned essay text.
# NOTE(review): the pipeline is fitted on every training row before the
# train/validation split, so the TF-IDF vocabulary and the scaler statistics
# also see the validation rows.

# Numeric columns: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode.  The dense-output keyword is `sparse_output` from
# scikit-learn 1.2 on; the old `sparse` keyword was removed in 1.4, so try the
# new name first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', onehot_encoder)
])

# Text column: word-level TF-IDF over 1- to 3-grams that occur in at least 5%
# and at most 95% of the essays.
text_vectorizer = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=(1, 3),
    strip_accents='unicode',
    analyzer='word',
    min_df=0.05,
    max_df=0.95,
    sublinear_tf=True
)
text_transformer = Pipeline(steps=[
    ('vectorizer', text_vectorizer)
])

# Features and target.  The target is modelled in log space; predictions must
# be mapped back with np.exp.
X = df_train.drop('score', axis=1)
y = np.log(df_train['score'])

# Free-text columns must not be one-hot encoded: 'clean_text' is handled by the
# TF-IDF step and 'text_tokens' holds Python lists.
text_columns = ['clean_text', 'text_tokens']
numerical_columns = X.select_dtypes(include='number').columns
categorical_columns = X.select_dtypes('object').columns.drop(text_columns, errors='ignore')

# Combine the transformers.  Columns not named below (e.g. 'text_tokens') are
# dropped rather than passed through, because they are not numeric.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('text', text_transformer, 'clean_text')  # Include the 'clean_text' column
    ], remainder='drop')

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)])

# Fit the pipeline and build the feature matrix
X_preprocessed = pipeline.fit_transform(X)

In [None]:
# vectorizer = TfidfVectorizer(
#     encoding='utf-8',
#     ngram_range=(1, 3),
#     strip_accents='unicode',
#     analyzer='word',
#     min_df=0.05,
#     max_df=0.95,
#     sublinear_tf=True
# )

# train_vectorized = pd.DataFrame(
#     vectorizer.fit_transform(df_train['full_text']).toarray(),
#     columns=[f"tfidf_{str(f)}" for f in vectorizer.get_feature_names_out()],
# )

# train_vectorized.head()

In [None]:
# X = pd.concat([df_train, train_vectorized], axis=1).drop(columns=["full_text", "score", 'clean_text'], axis=1)
# y = df_train["score"]

In [None]:
# Split data into train and test sets
# Hold out 20% of the preprocessed rows for validation (fixed seed).
# To split the hand-built TF-IDF frame instead, pass `X` in place of
# `X_preprocessed`.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# idea from https://www.kaggle.com/code/rsakata/optimize-qwk-by-lgb/notebook#QWK-objective
def quadratic_weighted_kappa(y_true, y_pred):
    """Evaluation metric: quadratic weighted kappa on rounded scores.

    Both inputs are shifted by the module-level constant ``a`` and rounded;
    predictions are additionally clipped to the range 1..6 before rounding.

    Args:
    - y_true: Labels as an array-like, or predictions when ``y_pred`` is an
      xgboost ``DMatrix`` (the two are swapped in that case).
    - y_pred: Predictions as an array-like, or an xgboost ``DMatrix`` that
      carries the labels.

    Returns:
    - tuple: ``('QWK', score)``.
    """
    if isinstance(y_pred, xgb.DMatrix):
        # Called as (predictions, DMatrix): swap and read the labels from the
        # matrix.  Checking the DMatrix base class covers QuantileDMatrix too.
        y_true, y_pred = y_pred.get_label(), y_true

    # Previously everything below lived inside the `if`, so calling the
    # function with plain arrays silently returned None.
    y_true = (np.asarray(y_true) + a).round()
    y_pred = (np.asarray(y_pred) + a).clip(1, 6).round()
    qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    return 'QWK', qwk

# Constants used by the QWK objective and metric: `a` is the shift applied to
# labels and predictions, `b` is added inside the denominator term.
# NOTE(review): values are taken as given; presumably tuned in the referenced
# Kaggle notebook — confirm they suit this notebook's target.
a = 2.998
b = 1.092


def qwk_obj(y_true, y_pred):
    """Custom objective: gradient and hessian of a QWK surrogate loss.

    Labels and predictions are shifted by ``a``; shifted predictions are
    clipped to 1..6.  With ``f`` half the summed squared error and ``g`` half
    the sum of ``(pred - a)**2 + b``, the gradient is that of ``f / g``
    multiplied by the number of samples.  The hessian is a vector of ones.

    Args:
    - y_true: Array of labels.
    - y_pred: Array of raw predictions, same length as ``y_true``.

    Returns:
    - tuple: ``(grad, hess)``, each with one entry per sample.
    """
    shifted_labels = y_true + a
    shifted_preds = (y_pred + a).clip(1, 6)

    residual = shifted_preds - shifted_labels   # derivative of f
    centred = shifted_preds - a                 # derivative of g
    f = 1/2*np.sum(residual**2)
    g = 1/2*np.sum(centred**2+b)

    n_samples = len(shifted_labels)
    grad = (residual/g - f*centred/g**2)*n_samples
    hess = np.ones(n_samples)
    return grad, hess

In [None]:
# f1_scores = []
# kappa_scores = []
# predictions = []
# # Define the models
# models = {
#     'XGBoost': XGBRegressor(random_state=42)
# }

# # Define the hyperparameter grids for each model
# param_grids = {
#     'XGBoost': {
#         'n_estimators': [1024],
#         'learning_rate': [0.1],
#         'max_depth': [8],
#         'subsample': [0.5],
# #         'colsample_bytree': [0.5]
#     }
# }

# # 3-fold cross-validation
# cv = KFold(n_splits=3, shuffle=True, random_state=42)

# # Train and tune the models
# grids = {}
# for model_name, model in models.items():
#     grids[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
#     grids[model_name].fit(X_train, y_train, eval_metric=quadratic_weighted_kappa)
#     best_params = grids[model_name].best_params_
#     best_score = np.sqrt(-1 * grids[model_name].best_score_)
    
#     print(f'Best parameters for {model_name}: {best_params}')
#     print(f'Best RMSE for {model_name}: {best_score}\n')

In [None]:
# # Initialise the lists that collect the results
# kappa_scores = []
# predictions = []

# # Initialise the model with fixed hyperparameters
# model = XGBRegressor(
#     n_estimators=1024,
#     learning_rate=0.1,
#     max_depth=8,
#     subsample=0.5,
#     random_state=42
# )

# # 3-fold cross-validation
# cv = KFold(n_splits=3, shuffle=True, random_state=42)

# # Train and evaluate the model
# for train_index, test_index in cv.split(X_train):
#     X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
#     y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    
#     # Train the model
#     model.fit(X_train_fold, y_train_fold)
    
#     # Predict on the held-out fold
#     y_pred = model.predict(X_test_fold)
    
#     # Compute the Kappa score
#     kappa = quadratic_weighted_kappa(y_test_fold, y_pred)
#     kappa_scores.append(kappa)
    
#     # Store the predictions
#     predictions.extend(y_pred)
    
#     print(f'Kappa score for this fold: {kappa}')

# # Print the final result
# print(f'Average Kappa score: {np.mean(kappa_scores)}')

In [None]:
print("Start running...")
# Gradient-boosted regressor trained on the log-score target with the custom
# QWK objective.
# NOTE(review): qwk_obj shifts labels by `a`, while the target here is
# log(score) — confirm the objective's constants suit a log-space target.
xgb_regressor = xgb.XGBRegressor(
    objective=qwk_obj,  # Use custom QWK objective function
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

# Train the model
xgb_regressor.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = xgb_regressor.predict(X_test)

# Convert predictions AND labels back to the original score scale.  The labels
# were previously rounded while still in log space, so the reported QWK
# compared log-scores against scores.
y_val_pred_original = np.exp(y_val_pred)
y_val_true_original = np.exp(y_test)

# Calculate QWK on the validation set; predictions are clipped to 1..6, the
# same range the QWK objective and metric clip to, before rounding.
qwk_score = cohen_kappa_score(
    y_val_true_original.round().astype(int),
    np.clip(y_val_pred_original, 1, 6).round().astype(int),
    weights="quadratic",
)
print(f"Validation QWK Score: {qwk_score:.4f}")

In [None]:
# df_test = pd.read_csv('/kaggle/input/test-processed-csv/test_processed.csv')
# df_test

# Apply the same feature engineering to the test essays as to the training
# essays, drop the raw text, and preview the result.
df_test = (
    preprocess_df(df_test)
    .assign(clean_text=lambda frame: frame['full_text'].apply(clean_text))
    .drop(columns='full_text')
)
df_test.head()

In [None]:
# # Get the best model
# best_model = grids['XGBoost'].best_estimator_
# best_model

In [None]:
# Keep the essay_id column for the submission file
essay_ids = df_test['essay_id']

# Drop essay_id before preprocessing.  A dedicated name is used so the
# validation matrix `X_test` from the train/validation split is not overwritten.
X_submission = df_test.drop('essay_id', axis=1)

# Apply the already-fitted pipeline to the test data
X_test_preprocessed = pipeline.transform(X_submission)

# Predict (log-space) scores for the preprocessed test data
y_pred_log = xgb_regressor.predict(X_test_preprocessed)
# Invert the log transform
y_pred = np.exp(y_pred_log)

# Clip to the 1..6 range used by the QWK objective and metric, round to the
# nearest integer, and pair each score with its essay_id.
results = pd.DataFrame({
    'essay_id': essay_ids,
    'score': np.clip(y_pred, 1, 6).round().astype(int),
})

# Write the submission file (to_csv returns None when given a path, so the
# result is not assigned to a variable).
results.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame (essay_id, score).
results