# Task 01 – Text Pre-Processing

In [2]:
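# This cell does not have to be run; it is shown for reference only. It reuses the earlier download function to fetch the two .txt files (the original code saves each file under the file name found in its URL, not under the title of the play).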

# copied from 04

import logging
import os
import shutil
import urllib
import urllib.error
import urllib.request

# defined in Task 05
def init_log(file_name, file_mode, level, format, date_format):
    """Set up logging to a file through ``logging.basicConfig``.

    Args:
        file_name: path of the log file.
        file_mode: mode used to open the log file (e.g. "a").
        level: logging level handed to ``basicConfig`` (e.g. ``logging.DEBUG``).
        format: format string of the log records.
        date_format: date/time format, passed on as ``datefmt``.
    """
    # collect the keyword arguments under the names basicConfig expects
    settings = {
        "filename": file_name,
        "filemode": file_mode,
        "level": level,
        "format": format,
        "datefmt": date_format,
    }
    logging.basicConfig(**settings)

def download_file(url, path):
    """Download the ``.txt`` file behind ``url`` into the folder ``path``.

    The file is stored under the last component of the URL
    (https://www.google.com/s/123.txt ---> 123.txt).

    Args:
        url: link to the file; it has to end with ".txt".
        path: target folder, with or without a trailing separator.

    Returns:
        None. Problems are reported instead of raised: an unreachable link or
        a missing target folder prints a message, and a URL that does not end
        with ".txt" is written to the log.
    """
    # check whether the url points to a .txt file by verifying its suffix
    if not url.endswith(".txt"):
        logging.error("No text file found at given URL, download aborted!")  # redefine the error message
        return

    # get the name of txt file: everything after the last "/"
    filename = url.rsplit("/", 1)[-1]

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    try:
        # open the link; the with-block closes the connection even on errors
        req = urllib.request.Request(url=url, headers=headers)
        with urllib.request.urlopen(req) as response:
            data = response.read()

        # write the data into a file inside the target folder
        with open(os.path.join(path, filename), 'wb') as out_file:
            out_file.write(data)

    # exception: invalid link
    except urllib.error.URLError:
        print(f"Error: cannot access the link {url}")
    # exception: target path invalid
    except FileNotFoundError:
        print("Error: the given path does not exist")
        

init_log("task05.log","a",logging.DEBUG,'%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',"%Y-%m-%d %H:%M:%S")
path="./"  # the current folder


# 0. download the plain text versions of Shakespeare’s play Macbeth and Bacon’s New Atlantis.


url1="https://ia802707.us.archive.org/1/items/macbeth02264gut/0ws3410.txt"
url2="https://ia801309.us.archive.org/24/items/newatlantis02434gut/nwatl10.txt"

# download_file() stores each file under the name found in its URL, whereas the
# later cells open "Macbeth.txt" and "NewAtlantis.txt". Copy every successful
# download to the expected name, but never overwrite a file that already exists.
for source_url, play_file in ((url1, "Macbeth.txt"), (url2, "NewAtlantis.txt")):
    download_file(source_url, path)
    downloaded = os.path.join(path, source_url.rsplit("/", 1)[-1])
    expected = os.path.join(path, play_file)
    if os.path.isfile(downloaded) and not os.path.exists(expected):
        shutil.copyfile(downloaded, expected)

In [9]:
# 1. get_speaker_text()- returns only the text spoken by the characters in the plays and removes all other text

TITLES=["THE NEW ATLANTIS","The Tragedie of Macbeth"]

# the main body of New Atlantis ends at this marker
ATLANTIS_END_MARKER="[The rest was not perfected.]"
# a Macbeth line containing one of these words is treated as a scene instruction
STAGE_DIRECTION_WORDS=("Scena","Enter","Exeunt")
# heuristic: the text before the first "." only counts as a character's name
# when it is at most this many words long (e.g. "king." or "Old man.")
MAX_SPEAKER_LABEL_WORDS=2


def _strip_speaker_label(line):
    """Remove a leading character name such as ``king.`` from ``line``.

    Only a short prefix is removed; a line whose first "." merely ends a
    sentence is returned unchanged, so that dialogue is not thrown away.
    """
    label, dot, rest = line.partition(".")
    if dot and len(label.split()) <= MAX_SPEAKER_LABEL_WORDS:
        return rest
    return line


def get_speaker_text(text):
    """Return only the main text of a play and remove all other text.

    New Atlantis: everything between the title and the marker
    "[The rest was not perfected.]" is returned unchanged.

    Macbeth: everything after the title is taken line by line. Lines that
    contain a scene instruction word are dropped, leading character names
    are removed, and the remaining lines are concatenated, each one preceded
    by a single space.

    Args:
        text: the full content of one of the two text files.

    Returns:
        The extracted text as one string.

    Raises:
        ValueError: if none of the two titles is found, or if the end marker
            of New Atlantis is missing.
    """
    atlantis_title, macbeth_title = TITLES

    # Atlantis
    if atlantis_title in text:
        # extract only the main body
        body = text.partition(atlantis_title)[2]
        body, marker, _unused = body.partition(ATLANTIS_END_MARKER)
        if not marker:
            raise ValueError(f"end marker {ATLANTIS_END_MARKER!r} not found in text")
        return body

    # Macbeth
    if macbeth_title not in text:
        raise ValueError(f"none of the known titles {TITLES} found in text")
    body = text.partition(macbeth_title)[2]

    spoken = []
    for line in body.split("\n"):  # separate lines
        line = line.strip()  # remove extra space and \n
        # ignore scene instructions
        if any(word in line for word in STAGE_DIRECTION_WORDS):
            continue
        # remove characters' names (a "." after the character:  king. his words)
        spoken.append(_strip_speaker_label(line))

    # every kept line is preceded by one space
    return "".join(" " + line for line in spoken)
    
        
    

In [10]:
# 2. normalize_text()


def normalize_text(text,correct=False):
    """Convert ``text`` to lower case and replace punctuation by spaces.

    Args:
        text: the text to normalize.
        correct: if True, the lower-cased text is additionally passed through
            ``utils_ocr.correct_ocr_errors`` before the punctuation is removed.

    Returns:
        The normalized text; every character of ``string.punctuation`` has
        been replaced by one space.
    """
    import string

    text=text.lower()  # converts all text to lower case

    # if necessary: ocr error correction
    if correct:
        # imported only here, so that plain normalization works without utils_ocr
        import utils_ocr
        text=utils_ocr.correct_ocr_errors(text)

    # removes all punctuation from the text in a single pass
    blank_out=str.maketrans(string.punctuation," "*len(string.punctuation))
    return text.translate(blank_out)

In [11]:
# 3 remove_stopwords()
def remove_stopwords(text, stopword_file="eng_stop_words.txt"):
    """Remove all stop words from ``text``.

    The text is split at whitespace and every word that is listed in the
    stop word file is dropped. This also catches stop words at the very
    start or end of the text and the same stop word repeated back to back.

    Args:
        text: the text to clean.
        stopword_file: file with one stop word per line
            (assumes single-word entries - TODO confirm against the file).

    Returns:
        The remaining words merged into one line, separated by single spaces.
    """
    # read stopwords from the given file; a set gives fast membership tests
    with open(stopword_file,"r") as f:
        stopwords={line.strip() for line in f}  # remove extra space or "\n" for each word
    stopwords.discard("")  # ignore blank lines of the file

    # keep every word that is not a stopword
    return " ".join(word for word in text.split() if word not in stopwords)

In [12]:
# 4. tokenize_text() – splits the cleaned text into words
def tokenize_text(text):
    """Split the cleaned text into a list of words (separated by whitespace)."""
    return text.split()

In [13]:
# test 
ftest="Macbeth.txt"
with open(ftest,"r") as f:
    testtext=f.read()

# run the pre-processing steps 1-3 on the raw text, then step 4 on the result
testtext=remove_stopwords(normalize_text(get_speaker_text(testtext)))
testwords=tokenize_text(testtext)

# Task 02 – Classes

In [14]:
# 1

# define a class to process the txt file
class TextWords:
    """Pre-processed content of one text file.

    After construction ``text`` holds the text that went through speaker
    extraction, normalization and stop word removal, and ``words`` holds the
    list of words of that text.
    """

    def __init__(self,filename,ocr=False):
        """Read ``filename`` and run the four pre-processing steps.

        Args:
            filename: path of the text file to read.
            ocr: passed to normalize_text() as its ``correct`` argument.
        """
        # 0. get the text from file
        with open(filename,"r") as source:
            raw_text=source.read()

        # pre-process
        spoken=get_speaker_text(raw_text)        #1
        normalized=normalize_text(spoken,ocr)    #2
        self.text=remove_stopwords(normalized)   #3
        self.words=tokenize_text(self.text)      #4


In [15]:
# one text file per play
filename1,filename2="NewAtlantis.txt","Macbeth.txt"

newatlantis=TextWords(filename1,ocr=False)
macbeth=TextWords(filename2,ocr=True)  # Macbeth needs to correct ocr errors

In [16]:
# 2 
from collections import Counter

# count every word once per document instead of calling list.count() for each term
counts1=Counter(newatlantis.words)
counts2=Counter(macbeth.words)

words1=set(counts1)
words2=set(counts2)
common=words1&words2    # the intersection of two sets contains the common elements shared by two sets

# the frequency list of common words:
# [term , frequency_doc1 , frequency_doc2 , sum_of_frequencies]
fre=[[term,counts1[term],counts2[term],counts1[term]+counts2[term]] for term in common]

# sort the list by sum of the frequencies in descending order;
# equal sums are ordered by term so that the result is the same on every run
fre.sort(key=lambda row:(-row[3],row[0]))

In [17]:
# 3
import csv

csvname="fre.csv"

# write one csv row per entry of the frequency list
with open(csvname,"w",newline="") as f:
    wr=csv.writer(f)
    for row in fre:
        wr.writerow(row)
