## Corpus Summary

In [134]:
from collections import Counter


def corpus_total_and_unique_tokens(corpus_filename, min_count=10):
    """Count the tokens of a whitespace-tokenised corpus file.

    Only tokens that occur at least ``min_count`` times are taken into account.

    Args:
        corpus_filename: Path of a UTF-8 text file whose tokens are separated
            by whitespace.
        min_count: Minimum number of occurrences a token needs to be counted.

    Returns:
        A ``(unique, total)`` tuple -- note the order, which is the reverse of
        what the function name suggests: the number of distinct tokens that
        reach ``min_count`` and the sum of their occurrences.
    """
    freq = Counter()

    # Stream the file line by line to avoid loading the whole corpus into memory
    with open(corpus_filename, 'r', encoding='utf-8') as f:
        for line in f:
            freq.update(line.split())

    kept_counts = [count for count in freq.values() if count >= min_count]
    return len(kept_counts), sum(kept_counts)

# Report the vocabulary size and token count of every century corpus (100AH-1500AH)
for century in range(100, 1600, 100):
    filename = "data/compiled/corpus_%dAH_nonstop" % century
    unique, total = corpus_total_and_unique_tokens(filename)
    print(century, unique, total)

100 11134 422928
200 65185 8316454
300 167853 48082376
400 249809 77664268
500 255145 87616491
600 239393 72142021
700 254776 80880102
800 328867 136060133
900 242250 90509565
1000 198006 63720377
1100 172178 40326850
1200 166241 40838334
1300 227917 65642557
1400 196942 46592419
1500 131134 31051510


## Extinct Words

### Create CSV Files with Token Frequency

In [3]:
import csv
from collections import Counter
        
def corpus_word_frequencies_to_file(corpus_filename, out_file, min_count=10):
    """Write the token frequencies of a corpus file to a CSV summary.

    The CSV has a ``word,frequency`` header followed by one row per token,
    ordered from the most to the least frequent. Tokens that occur fewer than
    ``min_count`` times are left out. A one-line status message with the number
    of tokens written is printed when the file is complete.

    Args:
        corpus_filename: Path of a UTF-8 text file whose tokens are separated
            by whitespace.
        out_file: Path of the CSV file to create (overwritten if it exists).
        min_count: Minimum number of occurrences a token needs to be written.
    """
    freq = Counter()

    # Stream the file line by line to avoid loading the whole corpus into memory
    with open(corpus_filename, 'r', encoding='utf-8') as f:
        for line in f:
            freq.update(line.split())

    written = 0
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on platforms that translate "\n".
    with open(out_file, 'w', encoding='utf-8', newline='') as corpus_summary:
        fieldnames = ['word', 'frequency']
        writer = csv.DictWriter(corpus_summary, fieldnames=fieldnames)
        writer.writeheader()
        # most_common() is sorted by descending count, so the first token
        # below the threshold means every remaining token is below it too.
        for word, count in freq.most_common():
            if count < min_count:
                break
            writer.writerow({"word": word, "frequency": count})
            written += 1

    # Report what the file really holds, not the size of the unfiltered vocabulary
    print(out_file + " contains: " + str(written) + " unique tokens")
        
# Build the word-frequency summary CSV of every century corpus (100AH-1500AH)
for century in range(100, 1600, 100):
    filename = "data/compiled/corpus_%dAH_nonstop" % century
    out_file = "data/summary/corpus_%dAH_summary.csv" % century
    print("Starting on file name: " + filename)
    corpus_word_frequencies_to_file(filename, out_file)

### Load the Tokens and Frequencies into DataFrames and Sets

In [5]:
import pandas as pd

directory = "data/summary/"


def _load_century_summary(century):
    """Read the word/frequency summary CSV of one century into a dataframe.

    Args:
        century: Century number used in the file name, e.g. ``100`` for
            ``corpus_100AH_summary.csv``.

    Returns:
        The dataframe parsed from the summary CSV found in ``directory``.
    """
    return pd.read_csv(
        directory + "corpus_" + str(century) + "AH_summary.csv",
        # Words are arbitrary tokens: keep them as strings and stop pandas
        # from turning tokens such as "NA" or "null" into missing values.
        dtype={"word": str},
        keep_default_na=False,
    )


# One dataframe per century, in chronological order (100AH .. 1500AH)
all_dfs = [_load_century_summary(century) for century in range(100, 1600, 100)]
(df_100AH, df_200AH, df_300AH, df_400AH, df_500AH,
 df_600AH, df_700AH, df_800AH, df_900AH, df_1000AH,
 df_1100AH, df_1200AH, df_1300AH, df_1400AH, df_1500AH) = all_dfs

# The vocabulary of each century as a set, in the same chronological order
all_sets = [set(century_df.word) for century_df in all_dfs]
(set_100AH, set_200AH, set_300AH, set_400AH, set_500AH,
 set_600AH, set_700AH, set_800AH, set_900AH, set_1000AH,
 set_1100AH, set_1200AH, set_1300AH, set_1400AH, set_1500AH) = all_sets

### Find the extinct words

In [6]:
def find_extinct_words(starting_list, century_start, sets=None):
    """Remove from a candidate set every word that reappears in a later century.

    Args:
        starting_list: Set of candidate words (despite the name it must be a
            ``set``). It is modified in place.
        century_start: Index into ``sets`` of the first century to check; that
            century and all following ones are checked.
        sets: Sequence of per-century vocabularies in chronological order.
            Defaults to the notebook-level ``all_sets``.

    Returns:
        The same ``starting_list`` object, now holding only the words that do
        not occur in any of the checked centuries.
    """
    if sets is None:
        sets = all_sets
    # A single set operation replaces the word-by-word membership scan
    starting_list.difference_update(*sets[century_start:])
    return starting_list

# For each century from 100AH to 1300AH, start from the words that are missing
# from the very next century, then drop those that show up again in any century
# after that one (all_sets[idx + 2] onwards).
(extinct_100AH, extinct_200AH, extinct_300AH, extinct_400AH, extinct_500AH,
 extinct_600AH, extinct_700AH, extinct_800AH, extinct_900AH, extinct_1000AH,
 extinct_1100AH, extinct_1200AH, extinct_1300AH) = (
    find_extinct_words(current - following, idx + 2)
    for idx, (current, following) in enumerate(zip(all_sets[:13], all_sets[1:14]))
)

def _select_extinct_rows(century_df, extinct_words):
    """Return the rows of ``century_df`` whose ``word`` is in ``extinct_words``.

    The result is an independent copy rather than a slice of ``century_df``,
    so adding columns to it does not trigger pandas' SettingWithCopyWarning
    and can never write back into the source dataframe.
    """
    return century_df[century_df.word.isin(extinct_words)].copy()


df_extinct_100AH = _select_extinct_rows(df_100AH, extinct_100AH)
df_extinct_200AH = _select_extinct_rows(df_200AH, extinct_200AH)
df_extinct_300AH = _select_extinct_rows(df_300AH, extinct_300AH)
df_extinct_400AH = _select_extinct_rows(df_400AH, extinct_400AH)
df_extinct_500AH = _select_extinct_rows(df_500AH, extinct_500AH)
df_extinct_600AH = _select_extinct_rows(df_600AH, extinct_600AH)
df_extinct_700AH = _select_extinct_rows(df_700AH, extinct_700AH)
df_extinct_800AH = _select_extinct_rows(df_800AH, extinct_800AH)
df_extinct_900AH = _select_extinct_rows(df_900AH, extinct_900AH)
df_extinct_1000AH = _select_extinct_rows(df_1000AH, extinct_1000AH)
df_extinct_1100AH = _select_extinct_rows(df_1100AH, extinct_1100AH)
df_extinct_1200AH = _select_extinct_rows(df_1200AH, extinct_1200AH)
df_extinct_1300AH = _select_extinct_rows(df_1300AH, extinct_1300AH)

### Combine all results and export to CSV

In [7]:
# Turn off pandas' chained-assignment warning for the column assignments below
pd.options.mode.chained_assignment = None

# Pair every per-century dataframe with the label of the century it came from
_labelled_frames = [
    ('100AH', df_extinct_100AH),
    ('200AH', df_extinct_200AH),
    ('300AH', df_extinct_300AH),
    ('400AH', df_extinct_400AH),
    ('500AH', df_extinct_500AH),
    ('600AH', df_extinct_600AH),
    ('700AH', df_extinct_700AH),
    ('800AH', df_extinct_800AH),
    ('900AH', df_extinct_900AH),
    ('1000AH', df_extinct_1000AH),
    ('1100AH', df_extinct_1100AH),
    ('1200AH', df_extinct_1200AH),
    ('1300AH', df_extinct_1300AH),
]

# Record the century as a new column on each dataframe
for _label, _frame in _labelled_frames:
    _frame['century'] = _label

# Stack all centuries and order the rows from most to least frequent
result = pd.concat([_frame for _, _frame in _labelled_frames])
result = result.sort_values(by=['frequency'], ascending=False)

# Export to CSV
result.to_csv("data/extinct_words.csv", encoding="utf-8")
result.head()

Unnamed: 0,word,frequency,century
1381,قرز,9204,900AH
1103,علاى,8337,1000AH
1796,طح,7185,900AH
1530,إلاى,6154,1000AH
2341,وبالأصل,4283,1300AH


In [8]:
# Keep only the first row of each per-century dataframe (one word per century)
_per_century_frames = (
    df_extinct_100AH, df_extinct_200AH, df_extinct_300AH, df_extinct_400AH,
    df_extinct_500AH, df_extinct_600AH, df_extinct_700AH, df_extinct_800AH,
    df_extinct_900AH, df_extinct_1000AH, df_extinct_1100AH, df_extinct_1200AH,
    df_extinct_1300AH,
)
result_summary = pd.concat([frame.head(1) for frame in _per_century_frames])

# result_summary.to_csv("data/extinct_words_top3.csv", encoding="utf-8")
result_summary

Unnamed: 0,word,frequency,century
2497,نفساه,35,100AH
4318,أتحفظه,272,200AH
17956,هربيس,228,300AH
14945,واللوحة,514,400AH
7018,دبيثى,1543,500AH
3140,وخع,3008,600AH
18749,بدليلهما,482,700AH
13160,ورخه,1225,800AH
1381,قرز,9204,900AH
1103,علاى,8337,1000AH


## Popular Words Over Time

In [1]:
import csv
import itertools

def get_most_frequent(file_name, N=2):
    """Yield the first ``N`` data rows of a word/frequency CSV file.

    Args:
        file_name: Path of a UTF-8 CSV file with a header row.
        N: Maximum number of rows to yield.

    Yields:
        One mapping per row, keyed by the column names of the header row.
        Fewer than ``N`` rows are yielded when the file is shorter.
    """
    # newline="" is how the csv module expects its input files to be opened
    with open(file_name, "r", encoding="utf-8", newline="") as f:
        yield from itertools.islice(csv.DictReader(f), N)
            
directory = "data/summary/"
output_file = "data/summary/popular_words.csv"

# Collect the most frequent words of every other century (100AH, 300AH, ..., 1500AH)
# into a single CSV. The with-block closes the output file even when reading one
# of the summary files fails part-way through.
with open(output_file, "w", encoding='utf-8', newline="") as out:
    writer = csv.DictWriter(out, fieldnames=["Word", "Frequency", "Century (AH)"])
    writer.writeheader()

    for century in range(100, 1600, 200):
        corpus_name = "corpus_" + str(century) + "AH_summary.csv"
        for res in get_most_frequent(directory + corpus_name):
            writer.writerow({"Word": res['word'], "Frequency": res['frequency'], "Century (AH)": century})

In [3]:
import pandas as pd

# Read the exported popular-words table back in and display it
df = pd.read_csv(
    "data/summary/popular_words.csv"
)
df

Unnamed: 0,Word,Frequency,Century (AH)
0,الله,10975,100
1,السلام,3203,100
2,الله,2259572,300
3,حدثنا,894564,300
4,الله,2642254,500
5,أبو,1021880,500
6,الله,1846765,700
7,أبو,524548,700
8,الله,2225896,900
9,أبي,821922,900
