# **Creating Vocabulary**

This notebook builds a word-frequency vocabulary from the COCA (Corpus of Contemporary American English) sample texts.

In [1]:
import pandas as pd
import numpy as np

In [6]:
import os
import re
from collections import Counter

# Define the path to your text files
directory = '../data/corpus/coca-samples-text'

# Initialize a counter for the vocabulary
vocabulary = Counter()

# This regex matches only runs of ASCII letters (i.e., words).
# NOTE: anything that is not a letter acts as a separator, so markup and
# contractions are split into fragments (e.g. "<p>" yields 'p' and "don't"
# yields 'don' and 't'); those fragments are counted like any other token.
word_pattern = re.compile(r'\b[a-zA-Z]+\b')

# Read and process each file.
# os.listdir() returns names in arbitrary order. Sorting them fixes the order in
# which words are first inserted into the Counter, which in turn fixes the order
# of equal-count entries in most_common() and of any table built from the Counter.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            # Normalize the text
            text = text.lower()  # Convert to lowercase
            # Find all valid words
            words = word_pattern.findall(text)

            # Update the vocabulary counter with words
            vocabulary.update(words)

# Print the most common words
print(vocabulary.most_common(100))  # Print the 100 most common words
print(len(vocabulary))  # Print the number of unique words

[('the', 462883), ('to', 238874), ('and', 232584), ('of', 218668), ('a', 206816), ('in', 154602), ('i', 139623), ('that', 124059), ('you', 109927), ('p', 108290), ('s', 107201), ('it', 104072), ('is', 94201), ('for', 79002), ('on', 65490), ('was', 64461), ('with', 59800), ('he', 57779), ('this', 51981), ('t', 51527), ('as', 51304), ('n', 51142), ('we', 47814), ('are', 47246), ('have', 47011), ('be', 46709), ('not', 44061), ('but', 42634), ('they', 42499), ('at', 42245), ('do', 41723), ('what', 35786), ('from', 34702), ('his', 33609), ('by', 32861), ('or', 32280), ('all', 30252), ('she', 30008), ('my', 29416), ('an', 28691), ('about', 27869), ('so', 27507), ('there', 27373), ('one', 27128), ('her', 26401), ('had', 25676), ('if', 25430), ('me', 24875), ('your', 24687), ('who', 23555), ('can', 23406), ('out', 23357), ('their', 23236), ('no', 23179), ('has', 22791), ('up', 22668), ('were', 22508), ('like', 22124), ('when', 21978), ('just', 21765), ('would', 21669), ('more', 20965), ('will'

In [7]:
# Spot-check a handful of tokens against 'vocabulary' (a Counter or set):
# document markers, punctuation, and a few ordinary words.
tags = ['@5018041', '@5108241', '@5108341', '@5108141', '<p>', '!', 'p', '5108141', 'test', 'of', "again", "ok"]  # Example tags

# Report each token's presence; for tokens that are present, also report the count
for tag in tags:
    if tag not in vocabulary:
        print(f"Tag '{tag}' is NOT present in the vocabulary.")
        continue
    print(f"Tag '{tag}' is present in the vocabulary.")
    print(f"Frequency of '{tag}': {vocabulary[tag]}")


Tag '@5018041' is NOT present in the vocabulary.
Tag '@5108241' is NOT present in the vocabulary.
Tag '@5108341' is NOT present in the vocabulary.
Tag '@5108141' is NOT present in the vocabulary.
Tag '<p>' is NOT present in the vocabulary.
Tag '!' is NOT present in the vocabulary.
Tag 'p' is present in the vocabulary.
Frequency of 'p': 108290
Tag '5108141' is NOT present in the vocabulary.
Tag 'test' is present in the vocabulary.
Frequency of 'test': 1204
Tag 'of' is present in the vocabulary.
Frequency of 'of': 218668
Tag 'again' is present in the vocabulary.
Frequency of 'again': 4991
Tag 'ok' is present in the vocabulary.
Frequency of 'ok': 1476


In [8]:
# Sanity check: list every vocabulary entry that is not purely alphabetic.
# A token starting with '@' is never alphabetic, so this single test also covers
# '@' markers and prints each offending token once (two separate checks would
# print such a token twice).
for word in vocabulary:
    if not word.isalpha():
        print(word)

In [9]:
# Whitelists of short tokens that count as real words; every other
# one- or two-letter token is dropped by filter_vocabulary()
valid_single_letter_words = {'a', 'i'}
valid_two_letter_words = {'am', 'an', 'as', 'at', 'ax', 'be', 'by', 'do', 'go', 'he', 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'ok', 'on', 'or', 'ox', 'so', 'to', 'up', 'us', 'we'}


def filter_vocabulary(vocabulary):
    """Return a new Counter without the non-whitelisted one- and two-letter tokens.

    Args:
        vocabulary: Mapping of token -> count (e.g. a Counter). It is not modified.

    Returns:
        A Counter with the same counts and iteration order as ``vocabulary``,
        minus every token of length 1 that is not in ``valid_single_letter_words``
        and every token of length 2 that is not in ``valid_two_letter_words``.
        Tokens of any other length are always kept.
    """
    # Only lengths 1 and 2 are restricted; a length with no whitelist passes through
    whitelist_by_length = {1: valid_single_letter_words, 2: valid_two_letter_words}

    def is_kept(token):
        allowed = whitelist_by_length.get(len(token))
        return allowed is None or token in allowed

    return Counter({token: freq for token, freq in vocabulary.items() if is_kept(token)})

# Apply the short-token filter to the raw corpus counts
filtered_vocabulary = filter_vocabulary(vocabulary)

# Report the 100 most frequent remaining words, then how many distinct words are left
print(filtered_vocabulary.most_common(n=100))
print(len(filtered_vocabulary))

[('the', 462883), ('to', 238874), ('and', 232584), ('of', 218668), ('a', 206816), ('in', 154602), ('i', 139623), ('that', 124059), ('you', 109927), ('it', 104072), ('is', 94201), ('for', 79002), ('on', 65490), ('was', 64461), ('with', 59800), ('he', 57779), ('this', 51981), ('as', 51304), ('we', 47814), ('are', 47246), ('have', 47011), ('be', 46709), ('not', 44061), ('but', 42634), ('they', 42499), ('at', 42245), ('do', 41723), ('what', 35786), ('from', 34702), ('his', 33609), ('by', 32861), ('or', 32280), ('all', 30252), ('she', 30008), ('my', 29416), ('an', 28691), ('about', 27869), ('so', 27507), ('there', 27373), ('one', 27128), ('her', 26401), ('had', 25676), ('if', 25430), ('me', 24875), ('your', 24687), ('who', 23555), ('can', 23406), ('out', 23357), ('their', 23236), ('no', 23179), ('has', 22791), ('up', 22668), ('were', 22508), ('like', 22124), ('when', 21978), ('just', 21765), ('would', 21669), ('more', 20965), ('will', 20664), ('know', 18882), ('said', 18678), ('did', 17627)

In [10]:
# Frequency-based vocabulary filter
def filter_vocabulary_min_freq(vocabulary, min_count=2):
    """Return a new Counter holding only the entries that reach ``min_count``.

    Args:
        vocabulary: Mapping of token -> count (e.g. a Counter). It is not modified.
        min_count: Entries whose count is below this value are dropped.

    Returns:
        A Counter with the surviving entries, in the same iteration order and
        with the same counts as in ``vocabulary``.
    """
    return Counter({
        token: freq
        for token, freq in vocabulary.items()
        if not (freq < min_count)
    })

In [11]:
# Keep only the words whose corpus count is at least 20.
# NOTE(review): despite the "_len" suffix, this vocabulary is filtered by
# frequency, not by word length.
filtered_vocabulary_len = filter_vocabulary_min_freq(filtered_vocabulary, min_count=20)

In [12]:
# most_common() with no argument lists every (word, count) pair from most to
# least frequent; walking that list backwards puts the rarest words first
least_common_words = list(reversed(filtered_vocabulary_len.most_common()))

# Show the 100 rarest words, followed by the total number of entries
print(least_common_words[:100])
print(len(least_common_words))

[('tcr', 20), ('cspi', 20), ('guenther', 20), ('barristers', 20), ('korgano', 20), ('angkatell', 20), ('eun', 20), ('niran', 20), ('iphones', 20), ('soulmate', 20), ('stephanopoulo', 20), ('nair', 20), ('vinita', 20), ('outfront', 20), ('kaine', 20), ('karr', 20), ('todays', 20), ('gaylin', 20), ('lac', 20), ('churkin', 20), ('ntsb', 20), ('brigitte', 20), ('ewell', 20), ('estrogen', 20), ('lainey', 20), ('authoraffiliation', 20), ('teesha', 20), ('solicitors', 20), ('dermot', 20), ('kosnik', 20), ('chtarri', 20), ('shatlow', 20), ('shoo', 20), ('guruji', 20), ('rambling', 20), ('threaded', 20), ('dissipation', 20), ('chromosome', 20), ('originalist', 20), ('farnsworth', 20), ('ptl', 20), ('kompetenz', 20), ('mammography', 20), ('hpv', 20), ('illustrators', 20), ('oncol', 20), ('leicester', 20), ('comaroff', 20), ('winked', 20), ('popov', 20), ('catchment', 20), ('cfd', 20), ('bargmann', 20), ('groundfish', 20), ('causative', 20), ('parallelism', 20), ('veronese', 20), ('emulation', 20

In [13]:
# Vocabulary size before and after the minimum-frequency filter, one per line
print(len(filtered_vocabulary), len(filtered_vocabulary_len), sep='\n')

122022
19953


Next, we add every word used in the phrase tests, so that none of them is missing from the vocabulary. Words that are not already present are given the minimum count of 20.

In [14]:
# Load the phrase-test words (presumably one per line; verify against the file).
# The encoding is pinned to UTF-8, as for the corpus files, instead of relying on
# the platform default. Surrounding whitespace is stripped and blank lines are
# dropped so that '' or padded tokens never become vocabulary entries.
with open("../data/corpus/phraseWords.txt", "r", encoding="utf-8") as f:
    phraseWords = [
        stripped
        for stripped in (line.strip() for line in f.read().splitlines())
        if stripped
    ]

In [15]:
# Guarantee that every phrase-test word has a vocabulary entry: setdefault()
# leaves an existing count untouched and gives a missing word a count of 20
# (the same value as the min_count threshold used for the frequency filter)
for word in phraseWords:
    filtered_vocabulary_len.setdefault(word, 20)

In [16]:
# Vocabulary size after adding the phrase-test words
print(len(filtered_vocabulary_len))

20015


## **Creating a Dataframe**

In [17]:
# One row per vocabulary entry, in the Counter's insertion order
vocab_rows = list(filtered_vocabulary_len.items())
df_vocab = pd.DataFrame(vocab_rows, columns=['word', 'count'])

In [18]:
# Preview the first five rows of the vocabulary table
df_vocab.head()

Unnamed: 0,word,count
0,iran,515
1,nuclear,613
2,program,2389
3,talks,338
4,photo,1387


In [19]:
# Add a column log_count holding the natural logarithm of each count.
# np.log is applied to the whole column in one vectorized call rather than
# through a per-row Python lambda.
df_vocab['log_count'] = np.log(df_vocab['count'])

# Show the first 10 rows (in table order, not sorted by frequency)
print(df_vocab.head(10))

      word   count  log_count
0     iran     515   6.244167
1  nuclear     613   6.418365
2  program    2389   7.778630
3    talks     338   5.823046
4    photo    1387   7.234898
5  updated     144   4.969813
6      nov     302   5.710427
7       is   94201  11.453186
8      one   27128  10.208322
9       of  218668  12.295310


In [20]:
# Save it as vocab_final.csv; index=True also writes the row index as the
# first (unnamed) column of the file
df_vocab.to_csv('../data/vocab_final.csv', index=True)