In [3]:
import re, codecs
from os import walk
import csv
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk import pos_tag

In [4]:
def pt_to_wn(pos):
    """
    Map a Penn Treebank part-of-speech tag to the WordNet POS constant
    used for lemmatization.

    The tag is compared case-insensitively:

    * ``JJ*``          -> ADJ
    * ``MD``, ``VB*``  -> VERB  (modal auxiliaries are treated as verbs)
    * ``RB*``, ``WRB`` -> ADV   (WRB is the wh-adverb: how, whence, ...)
    * anything else    -> NOUN  (not strictly correct, but good enough
      for lemmatization)

    A list of Penn Treebank tags is available at:
    https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    """
    from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV

    treebank_tag = pos.lower()

    # The conditions below are mutually exclusive, so grouping them by
    # resulting WordNet tag does not change which branch a tag falls into.
    if treebank_tag == 'md' or treebank_tag.startswith('vb'):
        return VERB
    if treebank_tag == 'wrb' or treebank_tag.startswith('rb'):
        return ADV
    if treebank_tag.startswith('jj'):
        return ADJ
    return NOUN

In [5]:
def lemmatize_string (some_string):
    """
    Lemmatize every token of a text and return the lemmas as one string.

    The text is tokenized with ``word_tokenize`` and POS-tagged with
    ``pos_tag``; each token is then lemmatized by ``wordnet_lemmatizer``
    using the WordNet POS that ``pt_to_wn`` derives from its tag.

    Returns the lemmas in their original order, each followed by a single
    space (so any non-empty result ends with a trailing space).
    """
    tagged_tokens = pos_tag(word_tokenize(some_string))
    lemmas = [
        wordnet_lemmatizer.lemmatize(token, pt_to_wn(treebank_tag))
        for token, treebank_tag in tagged_tokens
    ]
    # Every lemma carries its own trailing space, including the last one.
    return ''.join(lemma + " " for lemma in lemmas)

In [23]:
def add_lemmatized_abstracts(filepath, newpath):
    """
    Copy a CSV file, inserting a lemmatized copy of the abstract column.

    Each row of ``filepath`` is read with the ``csv`` module. The text in
    column index 4 (the abstract) is passed through ``lemmatize_string``
    and the result is inserted as a new column directly after it
    (index 5); all original columns are kept in their original order.
    Every row is processed the same way -- a header row, if the file has
    one, is not treated specially.

    The output is written as UTF-8 with every field quoted and LF line
    endings. Fields are serialized by ``csv.writer``, so embedded double
    quotes are escaped and no extra empty column is appended when the
    abstract is the last column of a row.

    A running row count is printed after every 100 rows as a progress
    indicator.

    Parameters
    ----------
    filepath : str
        Path of the input CSV file (read as UTF-8, comma-delimited).
    newpath : str
        Path of the output CSV file; it is created or overwritten.

    Raises
    ------
    IndexError
        If a row has fewer than five columns.
    """
    # newline='' lets the csv module handle line endings itself (needed for
    # quoted fields that contain newlines). Using one ``with`` for both
    # handles guarantees they are closed even if a row fails mid-way.
    with open(filepath, encoding='utf-8', newline='') as csv_file, \
            open(newpath, 'w', encoding='utf-8', newline='') as new_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        csv_writer = csv.writer(
            new_file,
            delimiter=',',
            quoting=csv.QUOTE_ALL,
            lineterminator='\n',
        )
        for linecounter, row in enumerate(csv_reader, start=1):
            if linecounter % 100 == 0:
                print (linecounter)
            lemm_abstract = lemmatize_string(row[4])
            csv_writer.writerow(row[:5] + [lemm_abstract] + row[5:])

In [24]:
# Source CSV and the destination for the copy that gains a
# lemmatized-abstract column.
filepath = '../data_examples/arxiv_data_small_301.csv'
newpath = '../data_examples/arxiv_data_small_301_lemm.csv'
add_lemmatized_abstracts(filepath=filepath, newpath=newpath)

100
200
300
