In [33]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
import pandas as pd
from collections import Counter

In [35]:
lemmatized_content = "/content/drive/MyDrive/NLPCoursework/lemmatized_data.txt"
stemmed_content = '/content/drive/MyDrive/NLPCoursework/stemmed_data.txt'
whitespace_content = '/content/drive/MyDrive/NLPCoursework/whitespace_data.txt'

In [36]:
lemmatized_data = []
stemmed_data = []
whitespace_data = []

In [37]:
with open(lemmatized_content, "r") as f:
    lemmatized_data = f.read().split()

with open(stemmed_content, "r") as f:
    stemmed_data = f.read().split()

with open(whitespace_content, "r") as f:
    whitespace_data = f.read().split()

In [38]:
# Compare vocabulary size

stemmed_vocab_size = len(set(stemmed_content))
lemmatized_vocab_size = len(set(lemmatized_content))
whitespace_vocab_size = len(set(whitespace_content))

In [39]:
print("Stemmed vocabulary size:", stemmed_vocab_size)
print("Lemmatized vocabulary size:", lemmatized_vocab_size)
print("Whitespace vocabulary size:", whitespace_vocab_size)

Stemmed vocabulary size: 26
Lemmatized vocabulary size: 28
Whitespace vocabulary size: 27


In [40]:
# Compare token frequencies

stemmed_freq = Counter(stemmed_content)
lemmatized_freq = Counter(lemmatized_content)
whitespace_freq = Counter(whitespace_content)

In [41]:
# Identify most frequent tokens

most_frequent_stemmed = stemmed_freq.most_common(10)
most_frequent_lemmatized = lemmatized_freq.most_common(10)
most_frequent_whitespace = whitespace_freq.most_common(10)

In [42]:
print("Most frequent stemmed tokens:", most_frequent_stemmed)
print("Most frequent lemmatized tokens:", most_frequent_lemmatized)
print("Most frequent whitespace tokens:", most_frequent_whitespace)

Most frequent stemmed tokens: [('t', 6), ('e', 6), ('/', 5), ('r', 4), ('o', 3), ('d', 3), ('n', 2), ('i', 2), ('v', 2), ('s', 2)]
Most frequent lemmatized tokens: [('t', 6), ('e', 6), ('/', 5), ('r', 4), ('o', 3), ('d', 3), ('i', 3), ('a', 3), ('n', 2), ('v', 2)]
Most frequent whitespace tokens: [('t', 6), ('e', 6), ('/', 5), ('r', 4), ('o', 3), ('i', 3), ('a', 3), ('c', 2), ('n', 2), ('d', 2)]


In [43]:
# Compare average token length

def average_token_length(tokens):
  total_length = sum(len(token) for token in tokens)
  return total_length / len(tokens)

In [44]:
stemmed_avg_length = average_token_length(stemmed_content)
lemmatized_avg_length = average_token_length(lemmatized_content)
whitespace_avg_length = average_token_length(whitespace_content)

In [45]:
print("Stemmed average token length:", stemmed_avg_length)
print("Lemmatized average token length:", lemmatized_avg_length)
print("Whitespace average token length:", whitespace_avg_length)

Stemmed average token length: 1.0
Lemmatized average token length: 1.0
Whitespace average token length: 1.0
