# CISC 6210 NLP HW1 by Zhiyu Xu

## Introduction.
The whole homework, in my design, has been splited into 3 parts:
- Remotely download the poem link and save them locally.
- Read the url file and get the poem text, then according to the task, finish the CleanOutputLoveOutput table.
- Based on the clean table, finish the rest 4 ProcessedLoveOutput table.

## Part 1. Get the poem link.
The source page has all those poem files. I don't know if the storm server has any better ways for users to download the files from the server. My solution is to use the python web crawler package requests and bs4, to get the postfix of those poems' url, which happens to be their names.

After I get those name, I save them into a local text file. In the following step, I read lines from that file and combine them with the prefix: 'https://storm.cis.fordham.edu/~yli/data/LoveOutput/'. After that, I can get all the contents I want to continue the task.

In [3]:
import requests
from bs4 import BeautifulSoup
import re

# Based on the url and BeautifulSoup, get the soup object
def get_soup(url):
    html = requests.get(url)
    txt = html.text
    soup = BeautifulSoup(txt, 'lxml')
    return soup

# Use the regular expression, find the poem file names we need from the html.
def get_link(url):
    soup = get_soup(url)
    text = soup.findAll('table')[0].findAll('td')
    rows = []
    for x in text:
        rows.append(str(x))
    valid = []
    for row in rows:
        pattern = '<a href=.+</a>'
        result = re.search(pattern, row)
        if result:
            link = result.group(0)
            link = link.strip('"')
            valid.append(link)
    return valid

# The prefix of all poem urls, which follow by the poem name.
url = 'https://storm.cis.fordham.edu/~yli/data/LoveOutput/'

valid = get_link(url)
with open('/Users/lordxuzhiyu/Desktop/valid_ip.txt', 'a') as f:
    for item in valid:
        f.write("%s\n" % item)

The code above would create a local text file with all the poem names. I put that file in my submission folder.
You need to change the local file path to test on your environment.

## Part 2. CleanOutputLoveOutput table.
In this part, we need to combine the url with the local file first.

We would use the iteration, for each line of the url file, we test the usability of the url, if it works, we follow the instruction to get all the information we need, and then save them into a row of record in a pandas DataFrame; if it doesn't work, we treat it as an exception and then jump to the next line of the file.

As for the pandas operation, I would put proper annotations in the codes.

In [4]:
import pandas as pd
import requests
import re

# Combine the prefix url0 with the local poem name to get the full url of each poem.
def combine_url(url1, url2):
    return '%s%s' % (url1, url2)

def get_text(url, head):
    html = requests.get(url, headers = head)
    txt = html.text
    return txt

head = {
    'User-Agent':'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
}

# url0 is the prefix.
url0 = 'https://storm.cis.fordham.edu/~yli/data/LoveOutput/'

table0 = pd.DataFrame(columns = ['Author', 'Title', 'Tags', 'Body', 'Link'])
print(table0)

Empty DataFrame
Columns: [Author, Title, Tags, Body, Link]
Index: []


In [7]:
fails = 0

# Use the iteration to access the url file.
for line in open('/Users/lordxuzhiyu/Desktop/valid_ip.txt'):
    url = combine_url(url0, line)
    url = url.replace(' ', '').replace('\n', '').replace('\t', '').replace('\r', '').strip()
    try:
        # First step to test the usability of the url.
        p = requests.get(url, headers = head, timeout = 5)
        
        html = requests.get(url,headers=head)
        txt = html.text
        text = txt.splitlines()
        
        # Title
        title = re.findall(r'(\w+) By', text[0]) 
        # Author
        author = re.findall(r'By (.+)', text[0])
        # Tags
        tag = text[2].split(' ;')
        tag = ', '.join(tag)
        # Body
        body = text[4].replace('<br><br>', '[P]')
        body = body.replace('<br>', '[L]')
        # Link
        link = re.findall(r'http.+', text[6])  
        
        record = {'Author': author[0], 'Title': title, 'Tags': tag, 'Body': body, 'Link': link[0]}
        temp = pd.DataFrame(record, index = [0])
        table0 = table0.append(temp, ignore_index = True)
    
    except Exception as error:
        fails += 1
        #print(error)
        continue

# The total failure urls.
print('Fails: ', fails)

# sum of poems
print('sum of poems', len(table0.index))

# sum of different authors
sum_author = len(table0['Author'].unique().tolist())
print('sum of different authors', sum_author)
 
# author sorted
print(table0['Author'].value_counts())

# fully description
print(table0.describe())

# Save the dataframe to local csv file, then transform it to the xlsx file.
table0.to_csv('/Users/localpath/CleanOutputLoveOutput.csv', 
              index = None, header = True)

Fails:  422
sum of poems 5439
sum of different authors 906
William Shakespeare                        123
John Donne                                  87
Robert Browning                             57
Edmund Spenser                              57
Sir  Thomas Wyatt                           54
Anonymous                                   42
Thomas Campion                              42
Algernon Charles Swinburne                  42
John Keats                                  39
George Meredith                             39
William Butler Yeats                        36
Aphra Behn                                  33
Alfred, Lord Tennyson                       33
Andrew Marvell                              33
Amy Lowell                                  33
Robert Herrick                              30
Sara Teasdale                               30
Brenda Shaughnessy                          30
Louise Gl�ck                                30
Robert Burns                                30
D

## Part 3. ProcessedLoveOutput table.
Use the nltk package to finish the requirements.
This part is actually easy and clear.

For the punctuation rule part, I use the regular expression. e.g. "I'm", the colon would be seen as a punction mark.

In [8]:
import pandas as pd
import re
from nltk import word_tokenize
from nltk import sent_tokenize

# read the local file
clean = pd.read_csv('/Users/lordxuzhiyu/Desktop/CleanOutputLoveOutput.csv')

# create a dataframe to store the record
df = pd.DataFrame(columns = ['PoemID', 'Author', 'LengthOne', 'LengthTwo',
                             'NumLine', 'NumPara', 'NumSent', 'NumComma'])

for index, row in clean.iterrows():
    #print(index)
    poemID = index
    author = row['Author']
    
    tokens = tokens = word_tokenize(row['Body'])
    len1 = len(set(tokens))
    
    # regular expression pattern to match the non-punctuation tokens.
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    tokens2 = [w for w in tokens if nonPunct.match(w)]
    len2 = len(set(tokens2))
    
    p = row['Body'].count('[P]')
    l = row['Body'].count('[L]')
    numL = l + p
    numP = p
    
    sen = sent_tokenize(row['Body'])
    comma = row['Body'].count(',')
    
    raw = {'PoemID': poemID, 'Author': author, 'LengthOne': len1,
              'LengthTwo': len2, 'NumLine': numL, 'NumPara': numP, 
              'NumSent':len(sen), 'NumComma': comma}
    record = pd.DataFrame(raw, index = [0])
    
    df = df.append(record, ignore_index = True)

print(df)
df.to_csv('/Users/lordxuzhiyu/Desktop/ProcessedLoveOutput0.csv', 
              index = None, header = True)  

     PoemID                      Author LengthOne LengthTwo NumLine NumPara  \
0         0           Suzanne Gardinier        76        74      14       7   
1         1          Jeff Daniel Marion        72        67      16       1   
2         2               Roddy Lumsden       205       198      32      16   
3         3               Dennis Cooper       130       120      39       1   
4         4               Kathryn Maris       188       175      24       8   
5         5                Thomas Moore       203       191      40       8   
6         6          Christina Rossetti        74        69      17       2   
7         7             Jane Hirshfield       108       103      23       1   
8         8               Lewis Carroll        88        81      21       7   
9         9          Robert VanderMolen       139       132      38      22   
10       10                 Cate Marvin       202       194      35       5   
11       11                 Leigh Stein       171   

In [None]:
## The rest three tables.

import pandas as pd
import re
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 

clean = pd.read_csv('/Users/localpath/CleanOutputLoveOutput.csv')

df1 = pd.DataFrame(columns = ['PoemID', 'Author', 'Body', 'Length', 'UniCount'])
df2 = pd.DataFrame(columns = ['PoemID', 'Author', 'Body', 'Length', 'UniCount'])
df3 = pd.DataFrame(columns = ['PoemID', 'Author', 'Body', 'Length', 'UniCount'])

for index, row in clean.iterrows():
    print(index)
    poemID = index
    author = row['Author']
    
    tokens = word_tokenize(row['Body'])
    body1 = ' '.join(tokens)
    len1 = len(tokens)
    uni1 = len(set(tokens))
    
    clean_tokens = list()
    sr = stopwords.words('english')
    for token in tokens:
        if not token in sr:
            clean_tokens.append(token)
    body2 = ' '.join(clean_tokens)
    len2 = len(clean_tokens)
    uni2 = len(set(clean_tokens))
    
    tab3 = []
    ps = PorterStemmer() 
    stems = clean_tokens
    for w in stems:
        w = ps.stem(w)
        tab3.append(w)
    body3 = ' '.join(tab3)
    len3 = len(tab3)
    uni3 = len(set(tab3))   
    
    raw1 = {'PoemID': poemID, 'Author': author, 'Body': body1,
            'Length': len1, 'UniCount': uni1}
    record1 = pd.DataFrame(raw1, index = [0])
    df1 = df1.append(record1, ignore_index = True)
    
    raw2 = {'PoemID': poemID, 'Author': author, 'Body': body2,
            'Length': len2, 'UniCount': uni2}
    record2 = pd.DataFrame(raw2, index = [0])
    df2 = df2.append(record2, ignore_index = True)
    
    raw3 = {'PoemID': poemID, 'Author': author, 'Body': body3,
            'Length': len3, 'UniCount': uni3}
    record3 = pd.DataFrame(raw3, index = [0])
    df3 = df1.append(record3, ignore_index = True)

df1.to_csv('/Users/lordxuzhiyu/Desktop/ProcessedLoveOutput1.csv', 
              index = None, header = True)  
df2.to_csv('/Users/lordxuzhiyu/Desktop/ProcessedLoveOutput2.csv', 
              index = None, header = True)  
df3.to_csv('/Users/lordxuzhiyu/Desktop/ProcessedLoveOutput3.csv', 
              index = None, header = True)    