# Feature Extraction

In [30]:
import huggingface_hub
import os
from datasets import load_dataset, Dataset

# Hugging Face Hub setup: organisation that owns the dataset repo and the competition slug.
# The two are combined later to build the dataset repository id.
huggingface_username = 'HSLU-AICOMP-LearningAgencyLab'
competition = 'learning-agency-lab-automated-essay-scoring-2'

# Log in to the Hugging Face Hub only (this cell performs no Weights & Biases login,
# so the log message no longer claims one). The token is read from the
# HUGGINGFACE_TOKEN environment variable; os.getenv returns None if it is unset.
print("Logging in to Hugging Face Hub...")
huggingface_hub.login(token=os.getenv('HUGGINGFACE_TOKEN'))
print("Login successful.")

Logging in to Hugging Face Hub and W&B...
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/leonkrug/.cache/huggingface/token
Login successful
Login successful.


In [2]:
# Load the entire dataset from Hugging Face
# The repository id is "<huggingface_username>/<competition>"; no split is requested,
# so all splits of the repository are loaded together.
print("Loading the entire dataset from Hugging Face...")
dataset = load_dataset(f"{huggingface_username}/{competition}")
print("Dataset loaded successfully.")

# Inspect the dataset
# Print the split/column summary, then the first training record as a sanity check.
print("Inspecting the dataset...")
print(dataset)
print(dataset['train'][0])


Loading the entire dataset from Hugging Face...
Dataset loaded successfully.
Inspecting the dataset...
DatasetDict({
    train: Dataset({
        features: ['essay_id', 'full_text', 'score', 'unique_mistakes', 'repeated_mistakes_count', 'max_repeated_mistake', 'word_count', 'flesch_reading_ease', 'flesch_kincaid_grade'],
        num_rows: 13845
    })
    test: Dataset({
        features: ['essay_id', 'full_text', 'score', 'unique_mistakes', 'repeated_mistakes_count', 'max_repeated_mistake', 'word_count', 'flesch_reading_ease', 'flesch_kincaid_grade'],
        num_rows: 3
    })
    eval: Dataset({
        features: ['essay_id', 'full_text', 'score', 'unique_mistakes', 'repeated_mistakes_count', 'max_repeated_mistake', 'word_count', 'flesch_reading_ease', 'flesch_kincaid_grade'],
        num_rows: 3462
    })
})
{'essay_id': 'ea26dc4', 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral Colle

## Linguistic and lexical feature extraction

In [3]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from collections import Counter

# Download every NLTK resource this notebook relies on, so that a fresh environment
# can run it top to bottom without a LookupError:
#   - punkt_tab / averaged_perceptron_tagger_eng: sentence + word tokenizers and POS tagger
#   - stopwords: nltk.corpus.stopwords (semantic, stylistic, visual, vocabulary sections)
#   - wordnet:   nltk.corpus.wordnet   (imagery / visual word checks)
#   - brown:     nltk.corpus.brown     (word-frequency table in the vocabulary section)
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('brown')

def extract_linguistic_lexical_features(example):
    """Add sentence-level and part-of-speech count features to one essay record.

    Args:
        example: mapping with a ``'full_text'`` string (one dataset row).

    Returns:
        The same ``example`` with these keys added:
        ``sentence_count``, ``average_sentence_length`` (tokens per sentence,
        0.0 when no sentence is found), ``pos_noun_count``, ``pos_verb_count``,
        ``pos_adj_count`` and ``pos_adv_count`` (Penn Treebank tag groups).
    """
    text = example['full_text']

    # Sentence count
    sentence_count = len(sent_tokenize(text))

    # Tokenize once and reuse the tokens for both the sentence-length ratio and
    # POS tagging (the essay used to be tokenized twice).
    words = word_tokenize(text)
    avg_sentence_length = len(words) / sentence_count if sentence_count > 0 else 0.0

    # Tally POS tags; Counter returns 0 for tags that never occur
    pos_counts = Counter(tag for _, tag in pos_tag(words))

    # Penn Treebank tags grouped into the four coarse classes we report
    tag_groups = {
        'pos_noun_count': ('NN', 'NNS', 'NNP', 'NNPS'),
        'pos_verb_count': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'pos_adj_count': ('JJ', 'JJR', 'JJS'),
        'pos_adv_count': ('RB', 'RBR', 'RBS'),
    }

    example['sentence_count'] = sentence_count
    example['average_sentence_length'] = avg_sentence_length
    for feature_name, tags in tag_groups.items():
        example[feature_name] = sum(pos_counts[tag] for tag in tags)

    return example

# Run the extractor on every record of every split. `dataset` is re-bound to the
# result, so each section of the notebook adds its columns on top of the previous ones.
dataset = dataset.map(extract_linguistic_lexical_features)

# Spot-check: first training record with the new columns
print(dataset['train'][0])


{'essay_id': 'ea26dc4', 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral College consists of 530 electors, tey choose the president by a vote in congress or "qualified" citizens. A majoriy of 270 electoral votes are required to elect a president, therefore in my opinion a popula vote would have mor meaning, beacause it\'s more people the vote would be chosen fr the mayority making the rest of the people satisfied. The number of citizens all togete versus the number of the electors at the Electoral College is superior; the people are te ones that follow the law might as well let them choose he president thats going to run the country and establish laws.\n\nThe electoral college system prevents us from voting for the presidet directly, instead they make us vote for a slate of electors, who in turn elect the pesident. For example if you were to live in Texas and wanted to vote for a president

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/leonkrug/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/leonkrug/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


## Grammar and Readability

In [4]:
import spacy
import language_tool_python
from textblob import TextBlob

# RUN python -m spacy download en_core_web_sm first to download the model
# spaCy pipeline: used below for sentence segmentation and dependency labels
nlp = spacy.load('en_core_web_sm')
# LanguageTool checker for US English: used below to count grammar matches
tool = language_tool_python.LanguageTool('en-US')

def extract_error_based_features(example):
    """Add grammar, spelling and sentence-size features to one essay record.

    Reads ``example['full_text']`` and writes four keys back onto ``example``:

    * ``grammar_error_count`` - number of matches reported by ``tool.check``.
    * ``syntactic_complexity`` - mean number of non-punctuation tokens per spaCy
      sentence (a length-based proxy rather than a parse depth).
    * ``spelling_mistake_count`` - words whose TextBlob ``correct()`` differs
      from the word itself.
    * ``error_density`` - (grammar + spelling errors) / TextBlob word count.

    Both ratios fall back to 0 when there is nothing to divide by.
    Returns the same ``example`` mapping.
    """
    essay = example['full_text']

    # Grammar: every LanguageTool match counts as one error
    n_grammar = len(tool.check(essay))

    # Sentence size: tokens per sentence, skipping tokens labelled 'punct'
    tokens_per_sentence = []
    for sent in nlp(essay).sents:
        tokens_per_sentence.append(sum(1 for tok in sent if tok.dep_ != 'punct'))
    if tokens_per_sentence:
        mean_sentence_size = sum(tokens_per_sentence) / len(tokens_per_sentence)
    else:
        mean_sentence_size = 0

    # Spelling: a word is a mistake when TextBlob would change it
    essay_words = TextBlob(essay).words
    n_misspelled = 0
    for candidate in essay_words:
        if candidate.correct() != candidate:
            n_misspelled += 1

    # Density: all errors relative to the number of words
    n_words = len(essay_words)
    if n_words > 0:
        density = (n_grammar + n_misspelled) / n_words
    else:
        density = 0

    example['grammar_error_count'] = n_grammar
    example['syntactic_complexity'] = mean_sentence_size
    example['spelling_mistake_count'] = n_misspelled
    example['error_density'] = density

    return example

# Run the extractor on every record of every split. The LanguageTool instance created
# above in this cell is closed afterwards, even if the map fails, so its backend does
# not stay alive for the rest of the notebook.
try:
    dataset = dataset.map(extract_error_based_features)
finally:
    tool.close()

# Spot-check: first training record with the new columns
print(dataset['train'][0])



Map:   0%|          | 0/13845 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

{'essay_id': 'ea26dc4', 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral College consists of 530 electors, tey choose the president by a vote in congress or "qualified" citizens. A majoriy of 270 electoral votes are required to elect a president, therefore in my opinion a popula vote would have mor meaning, beacause it\'s more people the vote would be chosen fr the mayority making the rest of the people satisfied. The number of citizens all togete versus the number of the electors at the Electoral College is superior; the people are te ones that follow the law might as well let them choose he president thats going to run the country and establish laws.\n\nThe electoral college system prevents us from voting for the presidet directly, instead they make us vote for a slate of electors, who in turn elect the pesident. For example if you were to live in Texas and wanted to vote for a president

## Semantic and content-based features

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Shared seed so that both stochastic models below give the same result on every run
RANDOM_STATE = 42

stop_words = set(stopwords.words('english'))

# Models are fitted on the training split only; eval/test essays are only transformed.
corpus = list(dataset['train']['full_text'])

# TF-IDF Vectorizer restricted to the TOP_N_KEYWORDS highest-frequency non-stop-word terms
TOP_N_KEYWORDS = 100
vectorizer = TfidfVectorizer(stop_words=list(stop_words), max_features=TOP_N_KEYWORDS)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Reduce dimensionality of the TF-IDF matrix to 50 dimensions
svd = TruncatedSVD(n_components=50, random_state=RANDOM_STATE)
tfidf_reduced = svd.fit_transform(tfidf_matrix)
tfidf_keywords_vectors = tfidf_reduced.tolist()

# Tokenize and prepare corpus for LDA: lower-cased alphabetic tokens without stop words
tokenized_corpus = [
    [word for word in word_tokenize(doc.lower()) if word.isalpha() and word not in stop_words]
    for doc in corpus
]
dictionary = Dictionary(tokenized_corpus)
bow_corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]

# LDA is randomly initialised; without a fixed random_state the topic features
# would differ between runs, unlike the seeded SVD above.
NUM_TOPICS = 15
lda_model = LdaModel(
    bow_corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=10,
    random_state=RANDOM_STATE,
)

# Function to turn one essay into its list of LDA topic probabilities
def lda_topic_vector(text):
    """Return the LDA topic probabilities for ``text`` as a plain list.

    The text is lower-cased and tokenized, non-alphabetic tokens are dropped,
    and the result is converted to a bag of words with the fitted ``dictionary``.
    Topics are requested with ``minimum_probability=0.0``; the probabilities are
    returned in the order the model reports them.
    """
    alpha_tokens = []
    for token in word_tokenize(text.lower()):
        if token.isalpha():
            alpha_tokens.append(token)

    per_topic = lda_model.get_document_topics(
        dictionary.doc2bow(alpha_tokens),
        minimum_probability=0.0,
    )
    return [weight for _topic_id, weight in per_topic]

# Define high-frequency keywords based on TF-IDF
# (the feature names kept by the fitted vectorizer, stored as a set for intersection)
top_keywords = set(vectorizer.get_feature_names_out())

def keyword_coverage_ratio(text):
    """Return the fraction of ``top_keywords`` that occur in ``text``.

    The text is lower-cased and tokenized; each keyword counts at most once,
    no matter how often it appears.
    """
    keywords_present = top_keywords.intersection(word_tokenize(text.lower()))
    return len(keywords_present) / len(top_keywords)

def extract_semantic_features(example):
    """Add TF-IDF, LDA-topic and keyword-coverage features to one essay record.

    Uses the already-fitted ``vectorizer`` and ``svd`` to project the essay, then
    delegates to ``lda_topic_vector`` and ``keyword_coverage_ratio``.

    Keys written (in this order): ``tfidf_keywords_vector`` (first row of the
    SVD-projected TF-IDF matrix as a list), ``lda_topic_vector`` and
    ``keyword_coverage``. Returns the same ``example`` mapping.
    """
    essay = example['full_text']

    # Project the single-document TF-IDF row with the fitted SVD
    reduced_tfidf = svd.transform(vectorizer.transform([essay]))

    topic_probs = lda_topic_vector(essay)
    coverage = keyword_coverage_ratio(essay)

    # Only one document was transformed, so keep its single row
    example['tfidf_keywords_vector'] = reduced_tfidf.tolist()[0]
    example['lda_topic_vector'] = topic_probs
    example['keyword_coverage'] = coverage

    return example

# Apply the feature extraction to the dataset
# (every split is transformed with the models fitted above; `dataset` is re-bound to the result)
dataset = dataset.map(extract_semantic_features)

# Check results
# Spot-check: first training record with the new columns
print(dataset['train'][0])

Map:   0%|          | 0/13845 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

{'essay_id': 'ea26dc4', 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral College consists of 530 electors, tey choose the president by a vote in congress or "qualified" citizens. A majoriy of 270 electoral votes are required to elect a president, therefore in my opinion a popula vote would have mor meaning, beacause it\'s more people the vote would be chosen fr the mayority making the rest of the people satisfied. The number of citizens all togete versus the number of the electors at the Electoral College is superior; the people are te ones that follow the law might as well let them choose he president thats going to run the country and establish laws.\n\nThe electoral college system prevents us from voting for the presidet directly, instead they make us vote for a slate of electors, who in turn elect the pesident. For example if you were to live in Texas and wanted to vote for a president

## Stylistic Features

In [7]:
from nltk.corpus import stopwords, wordnet
from textblob import TextBlob
import syllapy

# NLTK English stop-word list as a set, used below to filter tokens by membership
stop_words = set(stopwords.words('english'))

def is_long_word(word):
    """Return True when ``syllapy`` counts at least three syllables in ``word``."""
    syllable_total = syllapy.count(word)
    return 3 <= syllable_total

def is_imagery_word(word):
    """Return True if any WordNet synset of ``word`` has a concrete-noun lexname.

    A word qualifies when one of its synsets' lexnames contains
    ``noun.artifact``, ``noun.object``, ``noun.plant``, ``noun.animal`` or
    ``noun.body``. Words with no synsets return False.
    """
    imagery_tags = ('noun.artifact', 'noun.object', 'noun.plant', 'noun.animal', 'noun.body')

    for synset in wordnet.synsets(word):
        lexname = str(synset.lexname())
        for tag in imagery_tags:
            if tag in lexname:
                return True
    return False

def extract_stylistic_features(example):
    """Add stylistic ratios and sentiment scores to one essay record.

    Args:
        example: mapping with a ``'full_text'`` string (one dataset row).

    Returns:
        The same ``example`` with these float-valued keys added:

        * ``pronoun_usage`` - pronoun-tagged tokens (PRP, PRP$, WP, WP$) divided
          by all alphabetic tokens of the essay.
        * ``unique_word_proportion`` - distinct content words / content words.
        * ``type_token_ratio`` - distinct spaCy tokens of the re-joined content
          words / content words.
        * ``long_word_proportion`` / ``imagery_word_proportion`` - share of
          content words accepted by ``is_long_word`` / ``is_imagery_word``.
        * ``positive_sentiment_score`` / ``negative_sentiment_score`` - TextBlob
          polarity split into its positive and (sign-flipped) negative part.

        "Content words" are lower-cased alphabetic tokens that are not in
        ``stop_words``. Every ratio is 0.0 when its denominator is empty.
    """
    text = example['full_text']

    # Content words: lower-cased alphabetic tokens with stop words removed
    words = [word for word in word_tokenize(text.lower()) if word.isalpha() and word not in stop_words]
    n_words = len(words)

    # Pronouns have to be counted BEFORE stop-word removal: the English stop-word
    # list contains the pronouns themselves, so counting them on `words` always
    # gave ~0. Tag the full original-case token sequence so the tagger sees the
    # real sentence context, then keep only alphabetic tokens.
    pronoun_tags = ('PRP', 'PRP$', 'WP', 'WP$')
    alpha_tags = [tag for token, tag in pos_tag(word_tokenize(text)) if token.isalpha()]
    pronoun_count = sum(1 for tag in alpha_tags if tag in pronoun_tags)
    example['pronoun_usage'] = pronoun_count / len(alpha_tags) if alpha_tags else 0.0

    long_words = [word for word in words if is_long_word(word)]
    imagery_words = [word for word in words if is_imagery_word(word)]

    example['unique_word_proportion'] = len(set(words)) / n_words if n_words else 0.0

    # Type-Token Ratio (TTR) via spaCy: spaCy may split a word into several tokens,
    # so this can differ slightly from unique_word_proportion.
    doc = nlp(" ".join(words))
    unique_tokens = set(token.text for token in doc)
    example['type_token_ratio'] = len(unique_tokens) / n_words if n_words else 0.0

    example['long_word_proportion'] = len(long_words) / n_words if n_words else 0.0
    example['imagery_word_proportion'] = len(imagery_words) / n_words if n_words else 0.0

    # Read the polarity once. Fallbacks are 0.0 (not int 0) so a split whose essays
    # all share one polarity sign still gets a float column, matching the others.
    polarity = TextBlob(text).sentiment.polarity
    example['positive_sentiment_score'] = polarity if polarity > 0 else 0.0
    example['negative_sentiment_score'] = -polarity if polarity < 0 else 0.0

    return example

# Run the extractor on every record of every split; `dataset` is re-bound to the result
dataset = dataset.map(extract_stylistic_features)
# Spot-check: first training record with the new columns
print(dataset['train'][0])


Map:   0%|          | 0/13845 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

{'essay_id': 'ea26dc4', 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral College consists of 530 electors, tey choose the president by a vote in congress or "qualified" citizens. A majoriy of 270 electoral votes are required to elect a president, therefore in my opinion a popula vote would have mor meaning, beacause it\'s more people the vote would be chosen fr the mayority making the rest of the people satisfied. The number of citizens all togete versus the number of the electors at the Electoral College is superior; the people are te ones that follow the law might as well let them choose he president thats going to run the country and establish laws.\n\nThe electoral college system prevents us from voting for the presidet directly, instead they make us vote for a slate of electors, who in turn elect the pesident. For example if you were to live in Texas and wanted to vote for a president

## Visual and Descriptive Words

In [8]:
# NLTK English stop-word list as a set (rebuilt here; same expression as in earlier sections)
stop_words = set(stopwords.words('english'))

def is_visual_word(word):
    """Return True if any WordNet synset of ``word`` has a "visual" lexname.

    The accepted lexnames are the concrete-noun categories ``noun.artifact``,
    ``noun.object``, ``noun.plant``, ``noun.animal`` and ``noun.body`` plus
    ``adj.all``. Words with no synsets return False.
    """
    visual_tags = ('noun.artifact', 'noun.object', 'noun.plant', 'noun.animal', 'noun.body', 'adj.all')

    for synset in wordnet.synsets(word):
        lexname = str(synset.lexname())
        for tag in visual_tags:
            if tag in lexname:
                return True
    return False

def extract_visual_descriptive_features(example):
    """Add visual-word features to one essay record.

    Content words are the lower-cased alphabetic tokens of ``example['full_text']``
    that are not in ``stop_words``. Keys written:

    * ``visual_word_proportion`` - visual words / content words.
    * ``unique_visual_word_proportion`` - distinct visual words / content words.
    * ``average_imagery_score`` - visual words / distinct visual words, i.e. how
      often each visual word is repeated on average.

    Each value is 0 when its denominator is empty. Returns the same ``example``.
    """
    content_words = []
    for token in word_tokenize(example['full_text'].lower()):
        if token.isalpha() and token not in stop_words:
            content_words.append(token)

    visual_hits = list(filter(is_visual_word, content_words))
    distinct_hits = set(visual_hits)

    if content_words:
        n_content = len(content_words)
        example['visual_word_proportion'] = len(visual_hits) / n_content
        example['unique_visual_word_proportion'] = len(distinct_hits) / n_content
    else:
        example['visual_word_proportion'] = 0
        example['unique_visual_word_proportion'] = 0

    if distinct_hits:
        example['average_imagery_score'] = len(visual_hits) / len(distinct_hits)
    else:
        example['average_imagery_score'] = 0

    return example

# Run the extractor on every record of every split; `dataset` is re-bound to the result
dataset = dataset.map(extract_visual_descriptive_features)
# Spot-check: first training record with the new columns
print(dataset['train'][0])


Map:   0%|          | 0/13845 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

{'essay_id': 'ea26dc4', 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral College consists of 530 electors, tey choose the president by a vote in congress or "qualified" citizens. A majoriy of 270 electoral votes are required to elect a president, therefore in my opinion a popula vote would have mor meaning, beacause it\'s more people the vote would be chosen fr the mayority making the rest of the people satisfied. The number of citizens all togete versus the number of the electors at the Electoral College is superior; the people are te ones that follow the law might as well let them choose he president thats going to run the country and establish laws.\n\nThe electoral college system prevents us from voting for the presidet directly, instead they make us vote for a slate of electors, who in turn elect the pesident. For example if you were to live in Texas and wanted to vote for a president

## Cohesion and Coherence Features

In [9]:
from sentence_transformers import SentenceTransformer
import spacy
import numpy as np
from nltk.tokenize import sent_tokenize

# RUN python -m spacy download en_core_web_sm first to download the model

# spaCy pipeline (same model name as loaded in the grammar section), used for dependency labels
nlp = spacy.load("en_core_web_sm")
# Sentence-embedding model used below for the adjacent-sentence similarity score
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_cohesion_features(example):
    """Add discourse-marker density and a neural coherence score to one essay record.

    Args:
        example: mapping with a ``'full_text'`` string (one dataset row).

    Returns:
        The same ``example`` with two float-valued keys added:

        * ``discourse_marker_count`` - tokens whose spaCy dependency label is
          ``mark``, ``cc`` or ``advmod``, divided by the number of alphabetic
          tokens (a ratio despite the name; 0.0 for an essay without words).
        * ``neural_coherence_score`` - mean cosine similarity between the
          embeddings of consecutive sentences; 0.0 when the essay has fewer
          than two sentences.
    """
    text = example['full_text']

    # Discourse coherence via dependency labels
    doc = nlp(text)
    marker_labels = {"mark", "cc", "advmod"}
    marker_total = sum(1 for token in doc if token.dep_ in marker_labels)
    alpha_total = sum(1 for token in doc if token.is_alpha)
    example['discourse_marker_count'] = marker_total / alpha_total if alpha_total else 0.0

    # Neural coherence: cosine similarity of each sentence with the next one.
    # The value is always a plain Python float (never int 0 or np.float64), so
    # every split infers the same column dtype.
    coherence = 0.0
    sentences = sent_tokenize(text)
    if len(sentences) > 1:
        sentence_embeddings = embedding_model.encode(sentences)
        similarities = [
            np.dot(sentence_embeddings[i], sentence_embeddings[i + 1])
            / (np.linalg.norm(sentence_embeddings[i]) * np.linalg.norm(sentence_embeddings[i + 1]))
            for i in range(len(sentence_embeddings) - 1)
        ]
        coherence = float(np.mean(similarities))
    example['neural_coherence_score'] = coherence

    return example

# Run the extractor on every record of every split; `dataset` is re-bound to the result
dataset = dataset.map(extract_cohesion_features)
# Spot-check: first training record with the new columns
print(dataset['train'][0])


Map:   0%|          | 0/13845 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

{'essay_id': 'ea26dc4', 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral College consists of 530 electors, tey choose the president by a vote in congress or "qualified" citizens. A majoriy of 270 electoral votes are required to elect a president, therefore in my opinion a popula vote would have mor meaning, beacause it\'s more people the vote would be chosen fr the mayority making the rest of the people satisfied. The number of citizens all togete versus the number of the electors at the Electoral College is superior; the people are te ones that follow the law might as well let them choose he president thats going to run the country and establish laws.\n\nThe electoral college system prevents us from voting for the presidet directly, instead they make us vote for a slate of electors, who in turn elect the pesident. For example if you were to live in Texas and wanted to vote for a president

## Neural-network-based features

In [13]:
from transformers import LongformerModel, LongformerTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load Longformer model and tokenizer, supporting sequences up to 4096 tokens
# (both come from the same "allenai/longformer-large-4096" checkpoint)
longformer_model = LongformerModel.from_pretrained("allenai/longformer-large-4096")
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096")

# Set the device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the selected device
# (inputs are moved to the same device inside get_longformer_embedding)
longformer_model.to(device)

# Mean-pooled Longformer embedding for a piece of text
def get_longformer_embedding(text):
    """Return the mean of Longformer's last hidden states for ``text`` as a NumPy array.

    The text is tokenized with truncation at 4096 tokens, run through
    ``longformer_model`` on ``device`` without gradient tracking, averaged over
    the token dimension, squeezed and copied back to the CPU.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=4096)
    encoded = encoded.to(device)  # inputs must live on the same device as the model

    with torch.no_grad():
        model_output = longformer_model(**encoded)

    # Average over dim 1 (the token axis of last_hidden_state), then drop size-1 axes
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().cpu().numpy()  # NumPy needs a CPU tensor

# Function to calculate coherence score using cosine similarity between sentence embeddings
def calculate_coherence_score(text):
    """Return the mean cosine similarity between consecutive sentence embeddings.

    Sentences come from ``sent_tokenize`` (the same splitter the other sections
    use) rather than ``text.split('.')``, which broke on decimals and
    abbreviations and ignored '?' and '!'. Each non-empty sentence is embedded
    with ``get_longformer_embedding``.

    Args:
        text: the essay text.

    Returns:
        A plain Python float: the mean similarity of each sentence with the
        next one, or 0.0 when fewer than two non-empty sentences are found.
        Always returning a float keeps the column dtype identical in every split.
    """
    # Get Longformer embeddings for each non-empty sentence
    sentence_embeddings = [
        get_longformer_embedding(sentence.strip())
        for sentence in sent_tokenize(text)
        if sentence.strip()
    ]

    # Pairwise cosine similarity between consecutive sentence embeddings;
    # cosine_similarity returns a 1x1 matrix for a single pair, hence [0][0]
    coherence_scores = [
        cosine_similarity([current], [following])[0][0]
        for current, following in zip(sentence_embeddings, sentence_embeddings[1:])
    ]

    return float(np.mean(coherence_scores)) if coherence_scores else 0.0

# Longformer-based features for one essay
def extract_neural_features(example):
    """Add the Longformer essay embedding and coherence score to one essay record.

    Writes ``longformer_sentence_embedding`` (the result of
    ``get_longformer_embedding`` on the whole essay, despite the "sentence" in
    the name) and ``longformer_coherence_score`` (from
    ``calculate_coherence_score``). Returns the same ``example`` mapping.
    """
    essay = example['full_text']

    # Whole-essay embedding first, then the sentence-to-sentence coherence score
    example['longformer_sentence_embedding'] = get_longformer_embedding(essay)
    example['longformer_coherence_score'] = calculate_coherence_score(essay)

    return example

# Apply feature extraction to the dataset
# (every record of every split; `dataset` is re-bound to the result)
dataset = dataset.map(extract_neural_features)
# Spot-check: first training record with the new columns
print(dataset['train'][0])

config.json:   0%|          | 0.00/803 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using device: cuda


Map:   0%|          | 0/13845 [00:00<?, ? examples/s]

Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

{'essay_id': 'ea26dc4', 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral College consists of 530 electors, tey choose the president by a vote in congress or "qualified" citizens. A majoriy of 270 electoral votes are required to elect a president, therefore in my opinion a popula vote would have mor meaning, beacause it\'s more people the vote would be chosen fr the mayority making the rest of the people satisfied. The number of citizens all togete versus the number of the electors at the Electoral College is superior; the people are te ones that follow the law might as well let them choose he president thats going to run the country and establish laws.\n\nThe electoral college system prevents us from voting for the presidet directly, instead they make us vote for a slate of electors, who in turn elect the pesident. For example if you were to live in Texas and wanted to vote for a president

## Vocabulary Sophistication 

In [16]:
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from lexical_diversity import lex_div as ld

# Load spaCy model
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))

# Vocabulary Maturity using frequency as a proxy
from nltk.corpus import brown  # Brown corpus for word frequencies

# Count lower-cased tokens: calculate_frequency_score looks words up with
# word.lower(), so a case-sensitive table would ignore every capitalised
# occurrence (sentence-initial words, proper nouns) and understate frequencies.
word_frequencies = Counter(token.lower() for token in brown.words())

def calculate_frequency_score(words):
    """Return the mean reference-corpus frequency of the words in ``words``.

    Each word is lower-cased and looked up in ``word_frequencies``; words that
    are not in the table are skipped. A lower mean suggests rarer vocabulary.
    Returns 0 when none of the words is found.
    """
    known_frequencies = []
    for word in words:
        key = word.lower()
        if key in word_frequencies:
            known_frequencies.append(word_frequencies[key])

    if not known_frequencies:
        return 0
    return sum(known_frequencies) / len(known_frequencies)

def extract_vocabulary_features(example):
    """Add vocabulary-sophistication features to one essay record.

    Content words are the lower-cased alphabetic tokens of ``example['full_text']``
    that are not in ``stop_words``. Keys written:

    * ``type_token_ratio`` - distinct spaCy tokens of the re-joined content
      words / content words (0 when there are none).
    * ``lexical_diversity`` - ``doc._.mtld`` when the spaCy doc exposes it,
      otherwise ``ld.mtld`` over the content words.
    * ``vocabulary_maturity`` - ``calculate_frequency_score`` of the content words.

    Returns the same ``example`` mapping.
    """
    # Tokenize and keep only alphabetic non-stop-words
    content_words = []
    for token in word_tokenize(example['full_text'].lower()):
        if token.isalpha() and token not in stop_words:
            content_words.append(token)

    # Type-Token Ratio over spaCy's tokenization of the filtered words
    doc = nlp(" ".join(content_words))
    distinct_tokens = {token.text for token in doc}
    if content_words:
        example['type_token_ratio'] = len(distinct_tokens) / len(content_words)
    else:
        example['type_token_ratio'] = 0

    # Lexical diversity: prefer a spaCy `mtld` extension, fall back to lexical_diversity
    if hasattr(doc._, 'mtld'):
        example['lexical_diversity'] = doc._.mtld
    else:
        example['lexical_diversity'] = ld.mtld(content_words)

    # Vocabulary maturity: mean reference-corpus frequency of the content words
    example['vocabulary_maturity'] = calculate_frequency_score(content_words)

    return example

# Run the extractor on every record of every split; `dataset` is re-bound to the result.
# This writes 'type_token_ratio' again (the stylistic extractor also sets that key).
dataset = dataset.map(extract_vocabulary_features)
# Spot-check: first training record with the new columns
print(dataset['train'][0])


Map:   0%|          | 0/13845 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

{'essay_id': 'ea26dc4', 'full_text': 'I belive that they should change the elections to elections by popular vote for the president of the United States.\n\nThe electoral College consists of 530 electors, tey choose the president by a vote in congress or "qualified" citizens. A majoriy of 270 electoral votes are required to elect a president, therefore in my opinion a popula vote would have mor meaning, beacause it\'s more people the vote would be chosen fr the mayority making the rest of the people satisfied. The number of citizens all togete versus the number of the electors at the Electoral College is superior; the people are te ones that follow the law might as well let them choose he president thats going to run the country and establish laws.\n\nThe electoral college system prevents us from voting for the presidet directly, instead they make us vote for a slate of electors, who in turn elect the pesident. For example if you were to live in Texas and wanted to vote for a president

# Create the dataset and push it to the Hugging Face Hub

In [31]:
import huggingface_hub
import os

# Display the train, eval, and test splits
# (printing a split shows its column names and row count, not its rows)
print("Train Dataset Preview:")
print(dataset['train'])

print("\nEval Dataset Preview:")
print(dataset['eval'])

print("\nTest Dataset Preview:")
print(dataset['test'])

# Login to Hugging Face Hub using your token (ensure HUGGINGFACE_TOKEN is set in environment)
huggingface_hub.login(token=os.getenv('HUGGINGFACE_TOKEN'))

# Target repository: "<huggingface_username>/<competition>_V2".
# It is kept private through private=True in the push_to_hub calls below.
repo_id = f"{huggingface_username}/{competition}_V2"

# Convert the test split to a DataFrame (or dictionary) to redefine it with the correct schema
test_data = dataset['test'].to_pandas()

# Redefine the data types for the fields causing issues
# NOTE(review): presumably these three columns were inferred as integers in the
# test split because their extractors fall back to the int literal 0 - confirm.
test_data = test_data.astype({
    'negative_sentiment_score': 'float64',
    'neural_coherence_score': 'float64',
    'longformer_coherence_score': 'float64'
})

# Re-convert the DataFrame back to a Hugging Face Dataset with the corrected schema
dataset['test'] = Dataset.from_pandas(test_data)

# Create a new 'score' column with -1 in the 'test' split if 'score' does not already exist
# (-1 acts as a placeholder label for unscored test essays)
if 'score' not in dataset['test'].column_names:
    dataset['test'] = dataset['test'].add_column('score', [-1] * len(dataset['test']))

# Push each split to the Hub
# (one call per split, all into the same private repository)
dataset['train'].push_to_hub(repo_id, private=True, split="train")
dataset['eval'].push_to_hub(repo_id, private=True, split="eval")
dataset['test'].push_to_hub(repo_id, private=True, split="test")

print("All splits have been successfully pushed to Hugging Face Hub.")

Train Dataset Preview:
Dataset({
    features: ['essay_id', 'full_text', 'score', 'unique_mistakes', 'repeated_mistakes_count', 'max_repeated_mistake', 'word_count', 'flesch_reading_ease', 'flesch_kincaid_grade', 'sentence_count', 'average_sentence_length', 'pos_noun_count', 'pos_verb_count', 'pos_adj_count', 'pos_adv_count', 'grammar_error_count', 'syntactic_complexity', 'spelling_mistake_count', 'error_density', 'tfidf_keywords_vector', 'lda_topic_vector', 'keyword_coverage', 'pronoun_usage', 'unique_word_proportion', 'long_word_proportion', 'imagery_word_proportion', 'positive_sentiment_score', 'negative_sentiment_score', 'visual_word_proportion', 'unique_visual_word_proportion', 'average_imagery_score', 'discourse_marker_count', 'neural_coherence_score', 'longformer_sentence_embedding', 'longformer_coherence_score', 'type_token_ratio', 'lexical_diversity', 'vocabulary_maturity'],
    num_rows: 13845
})

Eval Dataset Preview:
Dataset({
    features: ['essay_id', 'full_text', 'score'

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

All splits have been successfully pushed to Hugging Face Hub.
