# Overview 

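This notebook preprocesses the raw text of scientific publications: it reads the title and abstract of each paper from `../data/raw/papers_raw.csv` and writes their unigrams (words), bigrams, and trigrams to separate CSV files in `../data/processed/`.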

See the Zenodo repository documentation:

See the paper:

In [33]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(1, '../scripts/')

import preprocessing
import pandas as pd
from tqdm.notebook import tqdm

Reading stopwords...
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\u0152835\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import csv

# Purpose of this cell: stream the raw papers file once, and for every paper
# write one row to each of three output files (words, bigrams, trigrams).
# Each output row has the form  PaperID,<title tokens>,<abstract tokens>
# where the tokens of a field are separated by single spaces.

raw_path = '../data/raw/papers_raw.csv'

print('Get the number of papers to process...')
# Count rows with the same CSV dialect as the processing loop below, so a
# quoted field that contains a line break is counted as one paper and the
# progress-bar total matches the number of iterations.
# newline='' lets the csv module handle line endings itself (as the csv
# documentation recommends for files passed to csv.reader).
with open(raw_path, 'r', encoding='utf-8', newline='') as file:
    line_count = sum(1 for _ in csv.reader(file, delimiter='\t', quotechar='"'))

# Subtract 1 for the header row; clamp at 0 so an empty file does not give a
# negative progress-bar total.
total_papers = max(line_count - 1, 0)


print('Preparing for writing...')
# All files are managed by one `with` statement so they are flushed and closed
# even if preprocessing raises part-way through. The output files are written
# explicitly as UTF-8 (matching the input) instead of the platform default
# encoding, which is not UTF-8 on every system.
with open(raw_path, 'r', encoding='utf-8', newline='') as reader, \
        open('../data/processed/papers_words.csv', 'w', encoding='utf-8') as words_write, \
        open('../data/processed/papers_bigrams.csv', 'w', encoding='utf-8') as bigrams_write, \
        open('../data/processed/papers_trigrams.csv', 'w', encoding='utf-8') as trigrams_write:

    # Header rows of the three output files.
    words_write.write('PaperID,Words_Title,Words_Abstract\n')
    bigrams_write.write('PaperID,Bigrams_Title,Bigrams_Abstract\n')
    trigrams_write.write('PaperID,Trigrams_Title,Trigrams_Abstract\n')

    print('Processing...')
    csv_reader = csv.reader(reader, delimiter='\t', quotechar='"')

    # Skip header (the default avoids StopIteration on an empty file)
    next(csv_reader, None)

    for line in tqdm(csv_reader, total=total_papers):

        paper_id = line[0]

        # Every output row starts with the PaperID; one field is appended per
        # text (title first, then abstract).
        words_fields = [paper_id]
        bigrams_fields = [paper_id]
        trigrams_fields = [paper_id]

        for text in (line[2], line[3]):  # loop over title and abstract

            # The words column uses the processed=True unigrams, while the
            # bigrams and trigrams are built from the processed=False ones.
            unigrams_non_processed = preprocessing.get_unigrams(text, processed=False)
            unigrams = preprocessing.get_unigrams(text, processed=True)

            bigrams = preprocessing.get_bigrams(unigrams_non_processed)
            trigrams = preprocessing.get_trigrams(unigrams_non_processed)

            words_fields.append(' '.join(unigrams))
            bigrams_fields.append(' '.join(bigrams))
            trigrams_fields.append(' '.join(trigrams))

        # NOTE(review): fields are joined with ',' without quoting, so this
        # assumes the tokens never contain a comma or a line break -- confirm
        # against preprocessing.get_unigrams / get_bigrams / get_trigrams.
        words_write.write(','.join(words_fields) + '\n')
        bigrams_write.write(','.join(bigrams_fields) + '\n')
        trigrams_write.write(','.join(trigrams_fields) + '\n')
            
