In [1]:
import sqlite3
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import words

nltk.download("words")

[nltk_data] Downloading package words to /home/alireza/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# Word frequency

We assume word difficulty is directly related to word frequency. Word frequency is obtained from words.db (https://github.com/harshnative/words-dataset).

In [2]:
def read_clean_data():
    # Connect to the SQLite database
    conn = sqlite3.connect("corpus/data/words.db")

    # Initialize an empty list to hold DataFrames
    dfs = []

    for i in range(3, 35):
        query = f'SELECT * FROM "{i}"'
        df = pd.read_sql_query(query, conn)
        df.drop("ID_I", axis=1, inplace=True)
        dfs.append(df)

    # Concatenate all DataFrames in the list
    final_df = pd.concat(dfs, ignore_index=True)
    final_df.columns = ["word", "frequency"]

    # convert frequency to int
    final_df["frequency"] = final_df["frequency"].astype(int)

    # Sort the DataFrame by frequency in descending order
    final_df = final_df.sort_values("frequency", ascending=False).reset_index(drop=True)

    # Close the connection
    conn.close()

    return final_df

In [3]:
all_words = read_clean_data()

# print df size
print(f"df size is {all_words.shape}")

# Display the concatenated DataFrame
print(all_words)

df size is (608943, 2)
           word    frequency
0           the  23135936835
1           and  12997704754
2           for   5933354779
3          that   3400041846
4          this   3228476270
...         ...          ...
608938  unsteek            1
608939  unsteck            1
608940  unstate            1
608941  unstain            1
608942  unstaid            1

[608943 rows x 2 columns]


In [4]:
# Create a set of English words for fast lookup
english_words_set = set(words.words())

# Identifying English words
all_words["is_english"] = np.vectorize(lambda x: x.lower() in english_words_set)(
    all_words["word"]
)

Upon inspection, it's clear that the set includes some obscure words. In order to remove them, we join with a set of most common English words (https://github.com/dolph/dictionary).

In [5]:
# list of all popular words
popular = pd.read_csv("corpus/data/popular.txt", header=None, names=["word"])

popular["popular"] = True

print(popular)

           word  popular
0            aa     True
1      aardvark     True
2         aargh     True
3         aback     True
4        abacus     True
...         ...      ...
25317    zoning     True
25318    zonked     True
25319       zoo     True
25320      zoom     True
25321   zooming     True

[25322 rows x 2 columns]


In [9]:
df = all_words[(all_words["is_english"])][:50_000].reset_index(drop=True)

# drop is_english and frequency
df.drop(["is_english", "frequency"], axis=1, inplace=True)

# add index column as a new column called rank
df["rank"] = df.index

# merge popular words with df
df = pd.merge(df, popular, how="left", on="word")
# fill NaN with False
df["popular"] = df["popular"].fillna(False).astype(bool)
print(df)

                word   rank  popular
0                the      0     True
1                and      1     True
2                for      2     True
3               that      3     True
4               this      4     True
...              ...    ...      ...
49995  refractometry  49995    False
49996      reconvert  49996    False
49997   planetesimal  49997    False
49998  paravertebral  49998    False
49999          alula  49999    False

[50000 rows x 3 columns]


  df["popular"] = df["popular"].fillna(False).astype(bool)


# GRE/TOEFL/IELTS

To enrich the vocabulary, we add words from GRE, TOEFL, and IELTS exams obtained from:

https://github.com/surajk95/wordsta/tree/master/app/lists

https://github.com/lzrk/nglsh/

https://github.com/ladrift/toefl


In [10]:
# GRE words
js_files = ['warm-up.js', 'intermediate.js', 'hard.js', 'high-frequency-gre.js']

all_js_dfs = []
for js_file in js_files:
    js_df = pd.read_json(f"corpus/data/{js_file}")
    all_js_dfs.append(js_df)

words_1 = pd.concat(all_js_dfs, ignore_index=True)
words_1 = words_1["word"].unique()


len(words_1)

781

In [11]:
def extract_words(file_path, delimiter):
    words = set()  # Use a set to store unique words
    with open(file_path, "r") as file:
        for line in file:
            if delimiter in line:  # Only process lines containing the delimiter
                word = line.split(delimiter)[0].strip()
                words.add(word)  # Add the word to the set (ensures uniqueness)
    return list(words)  # Convert the set back to a list

In [12]:
file_path = "corpus/data/IELTS-4000.txt"
delimiter = ":"
words_2 = extract_words(file_path, delimiter)
len(words_2)


4324

In [13]:
file_path = "corpus/data/wangyumei-toefl-words.txt"
delimiter = "#"
words_3 = extract_words(file_path, delimiter)
len(words_3)


5136

In [14]:
# combine all words using set

words_1 = set(words_1)
words_2 = set(words_2)
words_3 = set(words_3)

combined_words = list(words_1 | words_2 | words_3)

# convert combined_words to dataframe
exam_words = pd.DataFrame({"word": combined_words})
exam_words['exam'] = True

print(exam_words)

               word  exam
0            racket  True
1           ferment  True
2       pack animal  True
3      respectively  True
4          defecate  True
...             ...   ...
7280         myriad  True
7281        whittle  True
7282           dime  True
7283           edge  True
7284  collaboration  True

[7285 rows x 2 columns]


In [15]:
# merge exam_words with df
df = pd.merge(df, exam_words, how="left", on="word")
# fill NaN with False
df["exam"] = df["exam"].fillna(False)
print(df)

                word   rank  popular   exam
0                the      0     True  False
1                and      1     True  False
2                for      2     True  False
3               that      3     True  False
4               this      4     True  False
...              ...    ...      ...    ...
49995  refractometry  49995    False  False
49996      reconvert  49996    False  False
49997   planetesimal  49997    False  False
49998  paravertebral  49998    False  False
49999          alula  49999    False  False

[50000 rows x 4 columns]


  df["exam"] = df["exam"].fillna(False)


We filter for only popular or exam words, keeping their ranks. We will later use this ranks to represent word difficulty.

In [16]:
df = df[df["exam"] | df["popular"]].drop(["exam", "popular"], axis=1).reset_index(drop=True)

print(df)

           word   rank
0           the      0
1           and      1
2           for      2
3          that      3
4          this      4
...         ...    ...
18279  bigamist  49510
18280   seclude  49613
18281   wangler  49701
18282  shipload  49785
18283  intubate  49820

[18284 rows x 2 columns]


In [17]:
# write column word to csv without header
df.to_csv("corpus/english_words.csv", index=False, header=None)
