This notebook explores the language reading ability of individual users: for each user, it determines whether they can read 1, 2, 3, 4, or 5 languages. The criterion is simple: if a username appears in articles written in n different languages, that user is considered able to read n languages (for example, a username that appears in articles in five languages counts as a reader of five languages).

# 1. First load all target files

In [1]:
import pandas as pd
import glob
import os

# Build a dictionary that maps a language key (derived from the CSV file
# name) to a DataFrame holding only the username, language and work_id columns.
lits = {}

csv_paths = sorted(glob.glob('/Users/josiechen/desktop/prepared data/*.csv'))
for csv_path in csv_paths:
    # Paths containing 'anglo-norman' are excluded; everything else is loaded.
    if 'anglo-norman' not in csv_path:
        table = pd.read_csv(csv_path)

        # Strip the file-name prefix and extension, then lowercase the rest
        # to obtain the dictionary key.
        file_name = os.path.basename(csv_path)
        key = file_name.replace('prepared data_', '').replace('.csv', '').lower()

        lits[key] = table[['username', 'language', 'work_id']]

# Show the first rows of every loaded table as a quick sanity check.
for lang in lits:
    print(f"Language: {lang}")
    print(lits[lang].head())


Language: bahasa_indonesia
          username          language   work_id
0   dianthus_peony  Bahasa Indonesia  36657928
1        Reyan3779  Bahasa Indonesia  36657928
2       CuddlyWorm  Bahasa Indonesia  36657928
3  jinkook_married  Bahasa Indonesia  36657928
4      agustDRkive  Bahasa Indonesia  36657928
Language: português_brasileiro
    username              language   work_id
0   nic_ckie  Português brasileiro  35195854
1   KimIsa15  Português brasileiro  32475397
2  velezhard  Português brasileiro  32475397
3   bunny445  Português brasileiro  32475397
4    Lunnary  Português brasileiro  32475397
Language: chinese
            username language   work_id
0           xiaowwai  chinese  40244955
1         kwonorange  chinese  40244955
2              EM_WX  chinese  40244955
3  mintchocolatemint  chinese  40244955
4      aaaaaaa_aaaaa  chinese  40244955
Language: english
        username language   work_id
0      camote_24  english  39739578
1       AdaoraKi  english  39739578
2  end

# 2. Merge all data together

In [2]:
import pandas as pd
import glob
import os


# Collect the username/language/work_id columns from every CSV (paths
# containing 'anglo-norman' are skipped), stack them into a single table,
# order the rows by username and write the result to disk.
wanted_columns = ['username', 'language', 'work_id']
source_paths = [
    path
    for path in sorted(glob.glob('/Users/josiechen/desktop/prepared data/*.csv'))
    if 'anglo-norman' not in path
]

all_data = [pd.read_csv(path)[wanted_columns] for path in source_paths]

# ignore_index=True gives the merged table a fresh 0..n-1 row index.
combined_df = pd.concat(all_data, ignore_index=True)
sorted_df = combined_df.sort_values(by='username')

output_path = '/Users/josiechen/desktop/merged_data.csv'
sorted_df.to_csv(output_path, index=False)

print("Merged and sorted data saved to /Users/josiechen/desktop/merged_data.csv")


Merged and sorted data saved to /Users/josiechen/desktop/merged_data.csv


# 3. Count how many users can read articles in 1, 2, 3, 4, or 5 languages

In [3]:
import pandas as pd
import glob
import os


# Reload the username/language/work_id columns from every CSV (paths
# containing 'anglo-norman' are skipped) and stack them into one table.
wanted_columns = ['username', 'language', 'work_id']
all_data = [
    pd.read_csv(path)[wanted_columns]
    for path in sorted(glob.glob('/Users/josiechen/desktop/prepared data/*.csv'))
    if 'anglo-norman' not in path
]
combined_df = pd.concat(all_data, ignore_index=True)

# Number of distinct language values seen for each username.
languages_by_user = combined_df.groupby('username')['language']
user_lang_counts = languages_by_user.nunique()

# How many users fall into each "number of languages" bucket, ordered by
# the number of languages.
bucket_sizes = user_lang_counts.value_counts()
lang_reading_stats = bucket_sizes.sort_index()

for num_languages, num_users in lang_reading_stats.items():
    print(f"Number of users who can read {num_languages} language(s): {num_users}")


Number of users who can read 1 language(s): 34566
Number of users who can read 2 language(s): 1437
Number of users who can read 3 language(s): 155
Number of users who can read 4 language(s): 18
Number of users who can read 5 language(s): 3


It turns out that 34,566 readers can read 1 language, 1,437 readers can read 2 languages, 155 readers can read 3 languages, 18 readers can read 4 languages, and only 3 readers can read all 5 languages.

In [5]:
import pandas as pd
import glob
import os


# Reload the username/language/work_id columns from every CSV (paths
# containing 'anglo-norman' are skipped) and stack them into one table.
wanted_columns = ['username', 'language', 'work_id']
all_data = [
    pd.read_csv(path)[wanted_columns]
    for path in sorted(glob.glob('/Users/josiechen/desktop/prepared data/*.csv'))
    if 'anglo-norman' not in path
]
combined_df = pd.concat(all_data, ignore_index=True)

# Number of distinct language values seen for each username.
user_lang_counts = combined_df.groupby('username')['language'].nunique()

# Keep only the usernames whose distinct-language count is exactly 5.
has_five_languages = user_lang_counts == 5
users_with_5_languages = user_lang_counts[has_five_languages].index

print("Users who can read 5 languages:")
for user in users_with_5_languages:
    print(user)

# Save the selected usernames as a one-column CSV (no row index).
five_language_users = users_with_5_languages.to_series()
five_language_users.to_csv('/Users/josiechen/desktop/users_with_5_languages.csv', index=False)


Users who can read 5 languages:
No_Blon
k0om4to
zaurelie371986


To verify the results, list the usernames of the readers who can read all 5 languages so that they can be double-checked against the data to ensure its accuracy.