In [44]:
import numpy as np
import pandas as pd
import MeCab

# Read the Duolingo vocabulary export into a NumPy array of strings and echo it
duo = np.loadtxt("SRC/Duolingo_JP_Wordlist_2024-12-09.txt", encoding="utf-8", dtype="str")
print(duo)

# MeCab tagger created with "-Owakati"; its parse() output is split on whitespace below
wakati = MeCab.Tagger("-Owakati")

# Segment every entry and flatten all resulting tokens into one list
parsed_words = [
    token
    for entry in duo
    for token in wakati.parse(entry).split()
]

# One-column frame of tokens, reduced to the first occurrence of each token
df = pd.DataFrame(parsed_words, columns=["Word"])
df_unique = (
    df
    .drop_duplicates(subset=["Word"], keep="first")
    .reset_index(drop=True)
)

# Persist the de-duplicated tokens (header row, no index column, UTF-8 with BOM)
output_file = "Duolingo_unique_words.csv"
df_unique.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"Duolingo_Unique words saved to {output_file}")

['札幌' '大晦日' '福袋' ... 'すし' 'ください' 'おちゃ']
Duolingo_Unique words saved to Duolingo_unique_words.csv


In [69]:
# Look up every Duolingo word in the personalized frequency list and record the
# 1-based line on which it first appears together with its "Total Count".
pfreq = pd.read_csv('Personalized_Frequency_List.csv', encoding="utf-8-sig")

# This notebook writes Duolingo_unique_words.csv via to_csv(index=False,
# encoding="utf-8-sig"), i.e. with a header row and a UTF-8 BOM. Read it back with
# the matching codec and consume row 0 as the header; with header=None the literal
# header text "Word" was ingested as if it were a vocabulary entry.
duolist = pd.read_csv('Duolingo_unique_words.csv', encoding="utf-8-sig", header=0, names=["Word"])

# Index the personalized list once: word -> (1-based line number, total count).
# setdefault keeps the first occurrence, which is what a top-down search would find,
# and avoids rescanning the whole frame for every Duolingo word.
first_match = {}
for line_number, (word, count) in enumerate(zip(pfreq["Word"], pfreq["Total Count"]), start=1):
    if pd.notna(word):  # a missing word must never match anything
        first_match.setdefault(word, (line_number, count))

# One [word, line_number, count] record per Duolingo word; unmatched words get None
duolingo_frequency = []
for duolingo_word in duolist["Word"]:
    line_number, count = first_match.get(duolingo_word, (None, None))
    duolingo_frequency.append([duolingo_word, line_number, count])

# Create a new DataFrame from the list
duolingo_frequency_df = pd.DataFrame(duolingo_frequency, columns=["Word", "Line_Number", "Count"])

# Nullable 'Int64' keeps matched values as integers while unmatched words become <NA>
duolingo_frequency_df["Line_Number"] = duolingo_frequency_df["Line_Number"].astype('Int64')
duolingo_frequency_df["Count"] = duolingo_frequency_df["Count"].astype('Int64')

# Ascending by line number; words without a match (<NA>) are placed at the end
df_sorted = duolingo_frequency_df.sort_values(
    by=["Line_Number"],
    ascending=[True],
    na_position="last",
).reset_index(drop=True)

df_sorted.to_csv("DuolingoFrequency.csv", index=False, encoding="utf-8-sig")
print(df_sorted.head(50))


   Word  Line_Number  Count
0     の            1  19286
1     に            2  16022
2     て            3  14089
3     を            4  12896
4     は            5  12197
5     た            6  11829
6     が            7  10464
7     と            8   9878
8     で            9   8338
9     し           10   5963
10    も           11   5349
11    な           12   4287
12    だ           13   4017
13   ない           14   3743
14    か           15   3401
15   から           17   2642
16    い           19   2237
17    ん           20   2155
18   よう           22   2013
19   それ           26   1493
20    さ           27   1485
21   その           29   1429
22    よ           31   1277
23    的           32   1176
24   ます           33   1141
25   もの           36   1093
26    や           37   1059
27    何           38   1001
28    人           39    970
29   この           40    954
30   たち           41    937
31    ね           42    926
32    お           43    898
33    ば           44    879
34    僕           47