Script to read in the artile txt files and process text data

In [1]:
import os
import pandas as pd
import codecs 
import re
import utf8tochar
from datetime import datetime
import multiprocessing as mp

Our folder structure is something like this:

```
Salmon_Market_News
|
|---analysis:
        \read_txt_sorting_ts.ipynb
|---scraping:
    |---articles:
        |---article_text
            \article1.txt
            \article2.txt
            .
            .
            .
```

Use os module to specify the current path, go up one, then down the scraping path to the `article_text` folder. 

In [2]:
# path of this notebook
path = os.path.realpath(__name__)
# dirname gives the folder the notebook is in, we replace analysis with scraping
drt = os.path.dirname(path).replace('analysis', 'scraping')
# now we can go down in the sraping branch to find the article texts
textpath = os.path.join(drt, 'articles/article_text')

In [3]:
# list all the files in the folder
files = os.listdir(textpath)

Titles in certain blog posts were mistakenly split, with parts ending up in the article body due to the presence of '--' characters. Here we fix this.

In [4]:
# load the CSV file containing titles needing correction
corrected_titles = pd.read_csv('corrected_titles.csv', encoding = 'utf-8', header = None)
# extract the day from the date string
day_correct = [date.split(' ')[0].split('-')[2] for date in corrected_titles[0][:-2]]
# extract the month from the date string
month_correct = [date.split(' ')[0].split('-')[1] for date in corrected_titles[0][:-2]]
# extract the year from the date string
year_correct = [date.split(' ')[0].split('-')[0] for date in corrected_titles[0][:-2]]
# extract the first part of the title before '--'
first_part = [t.split('--')[0] for t in corrected_titles[1][:-2]]
# join all parts after the first '--'
second_part = [''.join(t.split('--')[1:]) for t in corrected_titles[1][:-2]]
# extract all titles
correct_title = [t for t in corrected_titles[1][:-2]]
# combine all the extracted information into a list of tuples
to_correct = list(zip(day_correct, month_correct, year_correct, first_part, second_part, correct_title))

Each text file is parsed to extract and format key elements such as the publication date, title, and article body. Titles from the blogs that were incorrectly formatted are corrected here. We also remove publication dates from all blog articles. The text of each article is compiled from the title, lead, and body. Text cleaning involves removing multiple spaces, non-breaking spaces, backslashes, tabs, and carriage returns. Additionally, we correct unicode errors.

In [5]:
%%time

day = []
hour = []
minute = []
month = []
year = []
titles = []
texts = []

# list of time zones
tz = ['PST', 'EST', 'GMT', 'CST', 'ICT', 'CET', 'CT']
ampm = ['am', 'pm', 'a\.m\.', 'p\.m\.']
char_remove = ["\t", "\r", "\\"]

for f in files: 
    with codecs.open(os.path.join(textpath, f), "r", encoding = 'utf-8' ) as f:
        f_data = f.read().split('\n')
        # extract the date, strip leading and trailing whitespaces
        date = ':'.join(f_data[0].split(':')[1:]).strip()
        # day of the publication
        day.append(date.split()[0].split('-')[2])
        # month of the publication
        month.append(date.split()[0].split('-')[1])
        # year of the publication
        year.append(date.split()[0].split('-')[0])
        # hour of publication
        hour.append(date.split()[1].split(':')[0])
        # minute of publication
        minute.append(date.split()[1].split(':')[1])
        # extract the title
        title = ':'.join(f_data[1].split(':')[1:]).strip()
        # remove tab and carriage return
        title = title.replace("\t", '').replace("\r", '')
        for correct in to_correct:
            # check if the current title and date match the erroneous title and its respective date parts 
            # from the 'to_correct' list
            if (title == correct[3].strip()) and (date.split()[0].split('-')[2] == correct[0]) and \
            (date.split()[0].split('-')[1] == correct[1]) and (date.split()[0].split('-')[0] == correct[2]):
                # if they match, replace the current title with the fully corrected title
                title = correct[5]
        # if the title is not an empty string and there is no period, colon, semicolon, exclamation, or question mark
        # at the end of the sentence, add the period.
        if title != '':
            if title[-1] not in ['.', '!', ':', ';', '?']:
                title = title + '.'
        titles.append(title)
        # extract the lead
        lead = ':'.join(f_data[3].split(':')[1:]).strip()
        # extract the body
        body = ':'.join(f_data[4].split(':')[1:]).strip()
        for correct in to_correct:
            # check if the article body starts with the misformatted part of the title (correct[4])
            # and if the date of the article matches the date components (year, month, day) from the tuple
            if (body.startswith(correct[4].strip())) and (date.split()[0].split('-')[2] == correct[0]) and \
            (date.split()[0].split('-')[1] == correct[1]) and (date.split()[0].split('-')[0] == correct[2]):
                # if the conditions are met, replace the misformatted part in the body with an empty string,
                # effectively removing it
                body = body.replace(correct[4].strip(), '')
        # remove the date from the separated blog article
        if title == '':
            if len(re.findall(r".*(?:" + '|'.join(tz) + r")", body)) > 0:
                date_remove = re.findall(r".*(?:" + '|'.join(tz) + r")", body)[0]
            else:
                date_remove = re.findall(r".*(?:" + '|'.join(ampm) + r")", body)[0]
            body = body.split(date_remove, 1)[1].strip()            
        # combine title, lead, and body into a text
        if re.sub(r'\s{2,}', ' ', lead) not in re.sub(r'\s{2,}', ' ', body):
            text = title + ' ' + lead + ' ' + body
        else:
            text = title + ' ' + body
        # strip whitespace on both sides
        text = text.strip()
        # simple text cleaning
        # remove tab, carriage return, and backslashes
        for ch in char_remove:
            text = text.replace(ch, "")
        # correct some unicode errors
        for k,v in utf8tochar.utf8tochar_no_backslash.items():
            text = text.replace(k,v)
        # correct some unicode errors
        for k,v in utf8tochar.encod_mistake_tochar.items():
            text = text.replace(k,v) 
        # remove multiple spaces: {2,} means at least 2 repeats
        text = re.sub(r'\s{2,}', ' ', text)
        # replace non-breaking spaces with a space
        text = re.sub(r'\xa0', ' ', text)
        # strip whitespace on both sides
        text = text.strip()
        texts.append(text)
        
# convert string to integer
day = list(map(int, day))
month = list(map(int, month))
year = list(map(int, year))      
hour = list(map(int, hour))
minute = list(map(int, minute))

CPU times: total: 8.66 s
Wall time: 14.8 s


In [6]:
# now we will create a DataFrame 
import pandas as pd
data = pd.DataFrame({
    'day': day,
    'month': month,
    'year': year,
    'minute': minute,
    'hour': hour,
    'titles': titles,
    'texts' :texts
})

In [7]:
# remove articles published before 2016
data = data[data.year>=2016]
# remove articles with no text
data = data[data.texts != '']
# sort the articles by year, month, day, hour, minute
data = data.sort_values(['year', 'month', 'day', 'hour', 'minute'], ascending=[True, True, True, True, True]) # sort the data in chronological order
data.reset_index(inplace=True, drop=True) # reset the index of the DataFrame
print(len(data))

6183


Articles that contain the string '...(read more )' are introductions only, they do not contain the full text.

In [8]:
data.drop(data[(data['texts'].str.contains('''...(read more )''', na = False, regex = False))].index, inplace=True)
data.reset_index(inplace=True, drop=True)

In some cases, a period and the word following it are mistakenly merged together. To ensure that all texts have the expected format, we correct this mistake.

In [9]:
# Set the number of cores to use
NUM_CORE = mp.cpu_count()-4
startTime = datetime.now()

import split_period_word

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    texts_corrected = pool.map(split_period_word.split_period_word, [text for text in data['texts']]) 
    pool.close()
    pool.join()

print(datetime.now()-startTime)
data['texts'] = texts_corrected

0:00:01.913444


Similar to stopwords, there are strings that do not help us determine sentiment or topic of the text. Therefore, we remove them.

In [10]:
startTime = datetime.now()

import clean_text

data = clean_text.delete_articles_with_strings(clean_text.delete_article_strings, data)

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    cleaned_articles = pool.map(clean_text.clean_text, [text for text in data['texts']]) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)
data['texts'] = cleaned_articles

0:00:10.398403


Identify name candidates using spacy. We use these candidates to find the names of IntraFish journalists that we later delete.

To install spaCy and a small English pipeline via conda:

`conda install -c conda-forge spacy`

`python -m spacy download en_core_web_sm`

In [11]:
startTime = datetime.now()

import identify_names
if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    names = pool.map(identify_names.identify_names, [text for text in data['texts']]) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)

0:02:37.529003


In [12]:
names = [item for sublist in names for item in sublist]
# List of unique names
names = list(set(names))

In [13]:
name_freq = dict()
for name in names:
    if '(' in name:
        name = name.split('(')[0]
    elif ')' in name:
        name = name.split(')')[0]
    name_freq[name] = len(data[data.texts.str.contains(name)])

In [14]:
sorted_dict = sorted(name_freq.items(), key=lambda item: item[1], reverse=True)

In [15]:
print(sorted_dict[900:905])

[('Hvistendahl', 9), ('Blumar Seafoods', 9), ('Fellow', 9), ('Nilsen', 9), ('Bradley', 9)]


Correct some misspellings produced by the funciton `split_period_word`.

In [16]:
clean_texts = []
for text in data.texts:
    # Inc. not
    text = re.sub('\. not(?!e)', '.not', text)     
    # DNBNXT. no
    text = re.sub('\. no(?!w|te)', '.no', text)
    # JD. com
    text = re.sub('\. com', '.com', text) 
    # Portalspozywczy. pl
    text = re.sub('\. pl(?!ay|astic)', '.pl', text)
    # insider. co.uk
    text = re.sub('\. co(?!nventional|nsumers)', '.co', text)
    # Mathem. se
    text = re.sub('\. se(?!afood)', '.se', text)   
    # Kuow. org
    text = re.sub('\. org(?!afood)', '.org', text)
    # abc. net
    text = re.sub('\. net(?!afood)', '.net', text)
    # biobiochile. cl
    text = re.sub('\. cl(?!ams)', '.cl', text)  
    # C. opilio
    text = re.sub('\. opilio', '.opilio', text)
    # oursafety. info
    text = re.sub('\. info', '.info', text)
    # sudouest. fr
    text = re.sub('\. fr(?!om)', '.fr', text)
    # indepedent. ie
    text = re.sub('\. ie', '.ie', text)
    # dbrs. dk
    text = re.sub('\. dk', '.dk', text)
    # . per kilo
    text = re.sub('\.  per kilo', '.', text)
    clean_texts.append(text)
data['texts'] = clean_texts

Remove short texts

In [17]:
startTime = datetime.now() 

import count_words_mp # import the function calculating the number of words in a text

if __name__ == "__main__":
    pool = mp.Pool(NUM_CORE)
    count_results = pool.map(count_words_mp.count_words_mp, [text for text in data['texts']]) 
    pool.close()
    pool.join()
    
print(datetime.now()-startTime)
# Save the result as a new column "word_count"
data['word_count'] = count_results

0:00:01.678262


Articles with 20 or less words are deleted.

In [18]:
data = data[(data.word_count>20)]
data.reset_index(inplace=True, drop=True) # reset the index of the DataFrame
print(len(data))

6082


In [19]:
# Save cleaned data:
drt_1 = os.path.dirname(path).replace('analysis', 'finance data')
data.to_csv(os.path.join(drt_1,'articles_sorted.csv'), encoding = 'utf-8-sig', sep = ';')