In [None]:
import pandas as pd
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, cohen_kappa_score, ConfusionMatrixDisplay
import nltk
from nltk.corpus import stopwords
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer
from textstat.textstat import textstatistics
from nltk.tokenize import word_tokenize
import torch
from transformers import BertTokenizer, BertModel
import re
import pickle
import optuna
from sklearn.metrics import cohen_kappa_score

In [None]:
# Download stopwords
# NOTE(review): no use of the stopword corpus is visible in the cells before the
# later re-import cell -- confirm this download is needed here.
nltk.download('stopwords')

# Only these two columns are read from the CSV.
columns_to_read = ['full_text', 'score']

# Load data
dataset = pd.read_csv('train.csv', usecols=columns_to_read)

# Training XGBoost Algorithm:

In [None]:
def train_xgb(X, y):
    """Train a multi-class XGBoost model on a hold-out split and report its metrics.

    The data is split 80/20 (``random_state=42``), a ``multi:softmax`` booster
    is trained for 100 rounds, and accuracy, weighted precision/recall and
    quadratic weighted kappa are printed together with a confusion-matrix plot.

    Args:
        X: Feature matrix accepted by both ``train_test_split`` and
            ``xgb.DMatrix``.
        y: Class labels. ``multi:softmax`` is configured with
            ``num_class = number of distinct labels``, so the labels are
            expected to be the integers ``0 .. num_class - 1``.

    Returns:
        None. All results are printed or plotted.
    """
    # Data split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert to DMatrix, the data structure used by XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # The label set is derived from ``y`` itself (np.unique also accepts plain
    # arrays and lists) instead of from the global ``dataset``.
    num_class = len(np.unique(y))
    class_labels = np.arange(num_class)

    # Set XGBoost parameters for classification
    params = {
        'max_depth': 6,
        'eta': 0.3,
        'objective': 'multi:softmax',  # For classification
        'num_class': num_class  # Number of classes
    }
    num_rounds = 100

    # Train the model
    bst = xgb.train(params, dtrain, num_rounds)

    # Predict; multi:softmax returns the class ids as floats
    y_pred = bst.predict(dtest).astype(int)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 keeps the value the default produces for a class that is
    # never predicted, without emitting a warning for it.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    qwk = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    print(f'Accuracy: {accuracy}')

    # Create confusion matrix. Passing ``labels`` fixes its shape to
    # num_class x num_class even when a class is missing from the test split,
    # so it always matches the tick labels handed to the display.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

    # Plot confusion matrix
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Quadratic Weighted Kappa (QWK): {qwk:.2f}")

# Preprocessing text:

In [None]:
def preprocess_text(text):
    """Tokenize ``text`` with ``gensim.utils.simple_preprocess`` and return the tokens."""
    return gensim.utils.simple_preprocess(text)

In [None]:
def clean_text(text):
    """Drop every character that is not an ASCII letter or whitespace, then lowercase.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string. Whitespace is left exactly as it was.
    """
    # Filtering happens before lowercasing, so only characters that were
    # already ASCII letters in the input survive.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower()

In [None]:
# Apply preprocessing
dataset['tokens'] = dataset['full_text'].apply(preprocess_text)

# Clean the text
dataset['clean_text'] = dataset['full_text'].apply(clean_text)

# Convert scores to zero-indexed categorical labels (assuming scores are integers starting from 1).
# The untouched scores are kept in 'score_raw' and the labels are always derived
# from that copy, so re-running this cell does not subtract 1 a second time.
if 'score_raw' not in dataset.columns:
    dataset['score_raw'] = dataset['score']
dataset['score'] = dataset['score_raw'] - 1

# Doc2Vec Model:

In [None]:
# Prepare data for doc2vec: every essay's token list is tagged with its position
documents = [
    TaggedDocument(words=essay_tokens, tags=[position])
    for position, essay_tokens in enumerate(dataset['tokens'])
]

# Train doc2vec model
doc2vec_model = Doc2Vec(
    documents,
    vector_size=200,
    window=5,
    min_count=2,
    workers=4,
    epochs=40,
)

# Get vectors for each essay by inferring them from the essay's tokens
dataset['doc2vec_vectors'] = dataset['tokens'].apply(doc2vec_model.infer_vector)


In [None]:
# Inspect the inferred vector stored for the row labelled 0
dataset['doc2vec_vectors'][0]

In [None]:
# Length of one inferred vector (the model above was built with vector_size=200)
len(dataset['doc2vec_vectors'][0])

In [None]:
# Build features and labels; the train/test split itself happens inside train_xgb
X = list(dataset['doc2vec_vectors'])
y = dataset['score']
train_xgb(X, y)

# Word2Vec Model:

In [None]:
# Train Word2Vec on the tokenized essays with sg=1 (skip-gram); every other
# setting is left at the library default.
word_2_vec = Word2Vec(dataset['tokens'], sg=1)

# Create Document Embeddings
def document_embedding(tokens, model):
    """Average the word vectors of a document's in-vocabulary tokens.

    Args:
        tokens: Iterable of token strings for one document.
        model: Object exposing ``wv`` (supports ``in`` and list indexing) and
            ``vector_size``.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector of
        length ``model.vector_size`` when none of the tokens is in ``model.wv``.
    """
    vocabulary = model.wv
    known_tokens = [tok for tok in tokens if tok in vocabulary]
    if known_tokens:
        return np.mean(vocabulary[known_tokens], axis=0)
    # Nothing to average: fall back to an all-zero embedding.
    return np.zeros(model.vector_size)

# Embed every essay; Series.apply forwards ``model`` to document_embedding as a keyword argument
dataset['word2vec_embedding'] = dataset['tokens'].apply(document_embedding, model=word_2_vec)


In [None]:
# Look at the raw vector learned for the token 'time'
word_2_vec.wv['time']

In [None]:
# Sanity check: tokens the model places closest to 'time'
word_2_vec.wv.most_similar('time')

In [None]:
# Sanity check: tokens the model places closest to 'technology'
word_2_vec.wv.most_similar('technology')

In [None]:
# Train and evaluate XGBoost on the averaged Word2Vec embeddings
X = list(dataset['word2vec_embedding'])
y = dataset['score']
train_xgb(X, y)

In [None]:
# Preview only the first rows; the frame now carries one embedding array per row
dataset.head()

# LDA Model:

In [None]:
# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(dataset['tokens'])
corpus = [dictionary.doc2bow(text) for text in dataset['tokens']]

# Train LDA model
num_topics = 500  # Adjust the number of topics as needed
# Seeded with the same random_state=42 the tuned LDA models later in the notebook
# use, so the topics (and the features built from them) repeat between runs.
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)

# Function to get topic distribution for a document
def get_topic_distribution(lda_model, corpus, num_topics):
    """Build a dense documents-by-topics probability matrix.

    Args:
        lda_model: Object whose ``get_document_topics(corpus,
            minimum_probability=0)`` yields, per document, ``(topic, prob)`` pairs.
        corpus: The documents to score; must support ``len``.
        num_topics: Number of columns in the result.

    Returns:
        Array of shape ``(len(corpus), num_topics)``. Topics a document does not
        list keep the initial value 0.
    """
    per_document = lda_model.get_document_topics(corpus, minimum_probability=0)
    matrix = np.zeros((len(corpus), num_topics))
    for row, pairs in enumerate(per_document):
        for topic_id, weight in pairs:
            matrix[row, topic_id] = weight
    return matrix

# Generate document-topic vectors
topic_vectors = get_topic_distribution(lda_model, corpus, num_topics)
dataset['lda_vectors'] = list(topic_vectors)

# Ensure all document vectors are valid (remove any NaN vectors).
# Each cell of 'lda_vectors' holds a whole array, which dropna() never treats as
# missing, so the NaN check is done on the topic matrix and applied as a row mask.
has_nan = np.isnan(topic_vectors).any(axis=1)
dataset = dataset[~has_nan].copy()


In [None]:
# Inspect the topic vector stored for the row labelled 0
dataset['lda_vectors'][0]

In [None]:
# Length of the topic vector for the row labelled 100 (one entry per LDA topic)
len(dataset['lda_vectors'][100])

In [None]:
# Stack the per-essay topic vectors into one array, then train and evaluate on it
X = np.array(list(dataset['lda_vectors']))
y = dataset['score']
train_xgb(X, y)

# BERT Model:

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Pretrained uncased BERT; the model is moved to the selected device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)


def get_bert_embeddings(text):
    """Return BERT's ``pooler_output`` for ``text`` as a NumPy array.

    ``text`` is tokenized with padding and truncation enabled, the model is run
    without gradient tracking, and the pooled tensor is moved to the CPU before
    being converted to NumPy.
    """
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        pooled = model(**inputs).pooler_output
    return pooled.cpu().numpy()


# One call (and so one forward pass) per essay
dataset['bert_vectors'] = dataset['full_text'].apply(get_bert_embeddings)

In [None]:
# Preview only the first rows; every row now also holds a BERT embedding array
dataset.head()

In [None]:
# Stack the per-essay arrays row-wise into a single 2-D matrix
bert_embeddings = np.vstack(dataset['bert_vectors'].tolist())

In [None]:
# Number of features in one stacked BERT embedding row
len(bert_embeddings[0])

In [None]:
# Wrap the stacked embeddings in a DataFrame, then train and evaluate on them
X = pd.DataFrame(bert_embeddings)
y = dataset['score']
train_xgb(X, y)


## TF-IDF:

In [None]:
# Generate TF-IDF embeddings for the 'clean_text' column
# (English stop words removed, vocabulary capped at 2000 features)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
tfidf_embeddings = tfidf_vectorizer.fit_transform(dataset['clean_text'])


In [None]:
# Use the sparse matrix directly (it is passed to train_xgb without densifying)
X = tfidf_embeddings
y = dataset['score']
train_xgb(X, y)

In [None]:
# Create a dictionary and corpus for LDA
# (repeats the construction from the earlier LDA cell; both names are read as
# globals by objective() below)
dictionary = corpora.Dictionary(dataset['tokens'])
corpus = [dictionary.doc2bow(text) for text in dataset['tokens']]

# Function to train LDA model and return QWK score
def objective(trial):
    """Optuna objective: fit LDA and XGBoost for one trial and return the hold-out QWK.

    Reads the module-level ``corpus``, ``dictionary`` and ``dataset``.

    Args:
        trial: Optuna trial supplying the LDA settings (``num_topics``,
            ``passes``, ``iterations``, ``alpha``, ``eta``, ``chunksize``) and
            the XGBoost settings (``max_depth``, ``learning_rate``,
            ``subsample``, ``colsample_bytree``, ``reg_lambda``, ``reg_alpha``).

    Returns:
        Quadratic weighted kappa of the classifier on the 20% test split.
    """
    num_topics = trial.suggest_int('num_topics', 50, 300)
    passes = trial.suggest_int('passes', 10, 30)
    iterations = trial.suggest_int('iterations', 50, 300)
    alpha = trial.suggest_categorical('alpha', ['symmetric', 'asymmetric', 'auto'])
    eta = trial.suggest_categorical('eta', ['symmetric', 'auto'])
    chunksize = trial.suggest_int('chunksize', 500, 5000)

    lda_model = LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        iterations=iterations,
        alpha=alpha,
        eta=eta,
        chunksize=chunksize,
        random_state=42
    )

    # Generate document-topic vectors.
    # Each probability is written into the column of its topic id. gensim can
    # leave negligible topics out of a document's list even with
    # minimum_probability=0, so simply collecting the probabilities in returned
    # order could give rows of different lengths or shifted columns.
    topic_vectors = np.zeros((len(corpus), num_topics))
    for row, doc in enumerate(corpus):
        for topic, prob in lda_model.get_document_topics(doc, minimum_probability=0):
            topic_vectors[row, topic] = prob

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(topic_vectors, dataset['score'], test_size=0.2, random_state=42)

    # Train a classifier on the LDA topic vectors using XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    param = {
        'objective': 'multi:softmax',
        'num_class': len(set(dataset['score'])),
        'eval_metric': 'mlogloss',
        # Use the GPU for training when one is available; otherwise use the CPU
        # histogram method instead of failing the whole study.
        'tree_method': 'gpu_hist' if torch.cuda.is_available() else 'hist',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True)
    }

    bst = xgb.train(param, dtrain, num_boost_round=100)

    # Predict on the test set
    y_pred = bst.predict(dtest)

    # Calculate QWK score (the value Optuna maximizes)
    qwk_score = cohen_kappa_score(y_test, y_pred, weights='quadratic')

    return qwk_score

# Create a study object and optimize the objective function.
# The TPE sampler is seeded so the sequence of suggested hyperparameters, and
# therefore the selected best_params, repeats between runs.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Train the final LDA model with the best hyperparameters.
# Only the LDA-related entries of best_params are used here.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=best_params['num_topics'],
    id2word=dictionary,
    passes=best_params['passes'],
    iterations=best_params['iterations'],
    alpha=best_params['alpha'],
    eta=best_params['eta'],
    chunksize=best_params['chunksize'],
    random_state=42
)

# Function to get topic distribution for a document
def get_topic_distribution(lda_model, corpus, num_topics):
    """Build a dense documents-by-topics probability matrix, one query per document.

    Args:
        lda_model: Object whose ``get_document_topics(doc,
            minimum_probability=0)`` returns ``(topic, prob)`` pairs for one document.
        corpus: The documents to score; must support iteration and ``len``.
        num_topics: Number of columns in the result.

    Returns:
        Array of shape ``(len(corpus), num_topics)``. Topics a document does not
        list keep the initial value 0.
    """
    per_document = []
    for doc in corpus:
        per_document.append(lda_model.get_document_topics(doc, minimum_probability=0))

    matrix = np.zeros((len(corpus), num_topics))
    for row, pairs in enumerate(per_document):
        for topic_id, weight in pairs:
            matrix[row, topic_id] = weight
    return matrix

# Generate document-topic vectors
num_topics = best_params['num_topics']
topic_vectors = get_topic_distribution(lda_model, corpus, num_topics)
dataset['lda_vectors'] = list(topic_vectors)

# Ensure all document vectors are valid (remove any NaN vectors).
# Each cell of 'lda_vectors' holds a whole array, which dropna() never treats as
# missing, so the NaN check is done on the topic matrix and applied as a row mask.
has_nan = np.isnan(topic_vectors).any(axis=1)
dataset = dataset[~has_nan].copy()


In [None]:
pip install xgboost

In [None]:
import pandas as pd
import gensim
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, cohen_kappa_score, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download stopwords and lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Load data
# NOTE(review): the first part of the notebook reads 'train.csv'; confirm that
# 'trains.csv' is a different file and not a typo.
dataset = pd.read_csv('trains.csv', encoding='latin1')

# Explore the dataset
print(dataset.head())
# info() writes its report itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
dataset.info()

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn raw text into lemmatized tokens with stop words removed.

    Characters other than ASCII letters and whitespace are stripped, the rest is
    tokenized with ``gensim.utils.simple_preprocess``, tokens found in the
    module-level ``stop_words`` are skipped, and the remaining tokens are
    lemmatized with the module-level ``lemmatizer``.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    kept = []
    for token in gensim.utils.simple_preprocess(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

dataset['tokens'] = dataset['full_text'].apply(preprocess_text)

# Convert scores to zero-indexed categorical labels (assuming scores are integers starting from 1)
dataset['score'] = dataset['score'] - 1

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(dataset['tokens'])
corpus = [dictionary.doc2bow(text) for text in dataset['tokens']]

# Train LDA model with 50 topics and 10 passes.
# Seeded with the same value as the train/test split below so the topic
# features are the same on every run.
num_topics = 50
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)




# Function to get topic distribution for a document
def get_topic_distribution(lda_model, corpus, num_topics):
    """Build a dense documents-by-topics probability matrix.

    Args:
        lda_model: Object whose ``get_document_topics(corpus,
            minimum_probability=0)`` yields, per document, ``(topic, prob)`` pairs.
        corpus: The documents to score; must support ``len``.
        num_topics: Number of columns in the result.

    Returns:
        Array of shape ``(len(corpus), num_topics)``. Topics a document does not
        list keep the initial value 0.
    """
    per_document = lda_model.get_document_topics(corpus, minimum_probability=0)
    matrix = np.zeros((len(corpus), num_topics))
    for row, pairs in enumerate(per_document):
        for topic_id, weight in pairs:
            matrix[row, topic_id] = weight
    return matrix

from scipy import sparse

# Generate document-topic vectors
topic_vectors = get_topic_distribution(lda_model, corpus, num_topics)
dataset['lda_vectors'] = list(topic_vectors)

# Ensure all document vectors are valid (remove any NaN vectors).
# Each cell of 'lda_vectors' holds a whole array, which dropna() never treats as
# missing, so the NaN check is done on the topic matrix and applied as a row mask.
has_nan = np.isnan(topic_vectors).any(axis=1)
dataset = dataset[~has_nan].copy()

# TF-IDF Vectorizer with bigrams
tfidf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2))
tfidf_vectors = tfidf.fit_transform(dataset['full_text'])

# Combine LDA and TF-IDF vectors.
# The unigram+bigram vocabulary has no max_features cap, so densifying it with
# .toarray() allocates rows x vocabulary floats. The combined matrix stays
# sparse instead, like the TF-IDF features passed to XGBoost earlier in the
# notebook. NOTE: XGBoost treats entries absent from a sparse matrix as missing
# values rather than as zeros.
combined_vectors = sparse.hstack(
    [sparse.csr_matrix(topic_vectors[~has_nan]), tfidf_vectors],
    format='csr',
)

X = combined_vectors
y = dataset['score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
# Set the hyperparameters manually with more tuning
# NOTE(review): this dict is not referenced anywhere in the visible notebook --
# the grid search below builds its own estimator and parameter grid. Confirm
# whether it is still needed.
params = {
    'max_depth': 10,
    'eta': 0.01,
    'objective': 'multi:softmax',
    'num_class': len(y.unique()),
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'lambda': 1.5,
    'alpha': 0.5,
    'n_estimators': 500  # Increase number of boosting rounds
}

# Train the model using GridSearchCV
from sklearn.model_selection import GridSearchCV

# Base estimator; the class count is taken from the distinct labels in y
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(y.unique()), use_label_encoder=False)
# Eight hyperparameters with three candidates each: 3**8 = 6561 combinations
parameters = {
    'max_depth': [6, 8, 10],
    'eta': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'min_child_weight': [1, 3, 5],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.5, 1],
    'n_estimators': [100, 200, 500]
}

# Exhaustive search scored by accuracy with 3-fold cross-validation
# (6561 combinations x 3 folds = 19683 fits)
clf = GridSearchCV(xgb_model, parameters, scoring='accuracy', n_jobs=-1, cv=3)
clf.fit(X_train, y_train)

print("Best parameters:", clf.best_params_)

# Predict
y_pred = clf.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
qwk = cohen_kappa_score(y_test, y_pred, weights='quadratic')

# Create confusion matrix.
# Passing ``labels`` fixes the matrix to one row/column per class even when a
# class is absent from the test split, so it always matches the tick labels
# handed to the display.
class_labels = np.arange(len(y.unique()))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

# Plot confusion matrix
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'Quadratic Weighted Kappa (QWK): {qwk:.2f}')