In [None]:
import pandas as pd
import numpy as np

In [None]:
# Colab helper for mounting Google Drive, plus the `datasets` loader that
# reads a dataset previously saved to disk.
from google.colab import drive
from datasets import load_from_disk

# Mount Google Drive at /content/drive so files under it can be read by path.
drive.mount('/content/drive')

In [None]:
# Sanity check (optional): list the Drive root to confirm the mount succeeded.
!ls /content/drive/MyDrive/

# Sanity check (optional): list the saved dataset directory before loading it.
!ls /content/drive/MyDrive/my_dataset/

# Load the saved dataset from Drive. Drive must already be mounted; the path is
# passed as a file:// URL pointing at the directory listed just above.
dataset = load_from_disk('file:///content/drive/MyDrive/my_dataset')

In [None]:
# Materialise the 'train' and 'test' splits of the loaded dataset as pandas
# DataFrames (converted in that order).
train_ds, test_ds = (pd.DataFrame(dataset[split]) for split in ('train', 'test'))

In [None]:
# Show the training DataFrame as this cell's output for a quick visual check.
train_ds

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words term counts.
# NOTE: CountVectorizer produces raw term counts, not TF-IDF weights.
# TfidfVectorizer is imported above but is not used in this cell.

# Step 1: join each document's tokens into one space-separated string, which is
# the input format the vectorizer tokenizes. Every entry of
# 'document_lemmatized_n' must be an iterable of strings for ' '.join to work.
train_docs_joined = train_ds['document_lemmatized_n'].map(' '.join)
test_docs_joined = test_ds['document_lemmatized_n'].map(' '.join)

# Step 2: learn the vocabulary from the training documents only and build the
# training matrix in the same pass (fit_transform avoids processing the
# training corpus twice, as a separate fit + transform would).
vectorizer = CountVectorizer()
vector_train = vectorizer.fit_transform(train_docs_joined)

# Optional: preview the vocabulary (term -> column index). Only a handful of
# entries are printed so a large vocabulary does not flood the cell output.
print('Vocabulary sample (first 20 terms):',
      dict(list(vectorizer.vocabulary_.items())[:20]))

# Step 3: encode the test documents with the vocabulary learned from training.
vector_test = vectorizer.transform(test_docs_joined)

In [None]:
# Number of distinct terms in the fitted vectorizer's vocabulary
# (vocabulary_ maps each term to its feature/column index).
vocab_size = len(vectorizer.vocabulary_)
print("Vocabulary size:", vocab_size)

In [None]:
# Count the documents in the training split: one entry of the 'document'
# column per document.
num_docs = train_ds['document'].shape[0]
print("Number of documents:", num_docs)