In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import re
import nltk

# Fetch every NLTK resource this notebook relies on: the stopword list, the
# punkt tokenizer models, and the WordNet corpus that WordNetLemmatizer loads.
# Without "wordnet" the lemmatization step raises LookupError on a fresh machine.
nltk.download("stopwords")
nltk.download('punkt')
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/giangvdq/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/giangvdq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
#PHASE 1: PRE-PROCESSING WITH PANDAS

# Search the data directory tree for the file named exactly "papers.csv".
# An exact name comparison avoids picking up look-alikes such as
# "papers.csv.zip", and the search stops at the first hit.
data_path = "/home/giangvdq/data/NIPS Papers"
papers_path = None
for dirname, _, filenames in os.walk(data_path):
    if 'papers.csv' in filenames:
        papers_path = os.path.join(dirname, 'papers.csv')
        break

# Fail here with an explicit message rather than with a NameError on the
# print below (or a confusing error in a later cell).
if papers_path is None:
    raise FileNotFoundError("papers.csv not found under {!r}".format(data_path))

print(papers_path)

/home/giangvdq/data/NIPS Papers/papers.csv


In [5]:
# Read the located CSV into the DataFrame that every later step works on.
data = pd.read_csv(filepath_or_buffer=papers_path)

In [6]:
# STEP 1: REMOVE SPECIAL CHARACTERS USING REGEX RE LIBRARY
def cleaned_text(text):
    """Reduce raw text to lowercase a-z words separated by single spaces.

    The text is lowercased, then every run of characters outside a-z
    (digits, punctuation, newlines, other whitespace, non-ASCII letters)
    is replaced by one space, and surrounding whitespace is removed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example NaN from an empty cell)
        is treated as missing.

    Returns
    -------
    str
        The normalised text, or "" when `text` is not a string.
    """
    # re.sub would raise TypeError on a float NaN; treat missing text as empty.
    if not isinstance(text, str):
        return ""
    # One pass does the whole job: after lowercasing, anything that is not
    # a-z is a separator, and "+" collapses each run into a single space.
    return re.sub(r"[^a-z]+", " ", text.lower()).strip()

# Store a normalised copy of each paper's raw text in its own column.
data["cleaned_paper_text"] = data["paper_text"].map(cleaned_text)

In [7]:
# STEP 2: REMOVE SHORT WORDS
# Only words longer than 3 characters are kept, i.e. every word of
# 3 characters or fewer is dropped.
def _keep_long_words(text):
    """Return `text` without its whitespace-separated words of length <= 3."""
    return ' '.join(word for word in text.split() if len(word) > 3)

data["cleaned_paper_text"] = data["cleaned_paper_text"].apply(_keep_long_words)

In [8]:
# STEP 3: REMOVE STOPWORDS AND JOIN TO STRING
stop=stopwords.words('english')

# Extra corpus-specific words to drop on top of the NLTK English list.
# NOTE(review): this filter runs before lemmatization (STEP 5), so it only
# removes these exact spellings, not inflected forms such as "models"; and
# 'use' has 3 characters, so STEP 2 has already removed it.
manual_stopwords = ["also", 'model', 'use', 'function', 'learn', 'show', 'result']

stop.extend(manual_stopwords)

# `stop` stays a list, but lookups go through a set: testing membership in a
# list scans it for every single word, while a set lookup is constant time.
_stop_lookup = set(stop)

data["stop_removed_paper_text"] = data["cleaned_paper_text"].apply(
    lambda x: ' '.join(word for word in x.split() if word not in _stop_lookup)
)

In [9]:
# STEP 4: TOKENIZE TEXT COLUMN
# Each stopword-free string is turned into a list of tokens.
data["tokenized"] = data["stop_removed_paper_text"].apply(nltk.word_tokenize)

In [10]:
# STEP 5: LEMMATIZE THE VERBS AND OUTPUT STRING
def word_lemmatizer_verb(text):
    """Lemmatize every token of `text` as a verb (pos='v').

    `text` is an iterable of token strings; the result is a new list with
    one lemmatized entry per input token, in the same order.
    """
    # Create the lemmatizer once per call instead of once per token.
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token, pos='v') for token in text]

def word_lemmatizer_noun(text):
    """Lemmatize every token of `text` as a noun (pos='n').

    `text` is an iterable of token strings; the result is a new list with
    one lemmatized entry per input token, in the same order.
    """
    # Create the lemmatizer once per call instead of once per token.
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token, pos='n') for token in text]


# Lemmatize each token list as verbs first, then pass the result through the
# noun lemmatizer; only the final lists are stored.
data["lemmatized"] = (
    data["tokenized"]
    .apply(word_lemmatizer_verb)
    .apply(word_lemmatizer_noun)
)

# Collapse every token list back into one space-separated string.
data["lemmatize_joined"] = data["lemmatized"].apply(' '.join)

In [11]:
# Write only the paper id and its fully processed text. The output path is
# built from data_path so the data directory is defined in a single place.
processed_path = os.path.join(data_path, 'papers_processed.csv')
data.to_csv(path_or_buf = processed_path,
            columns = ['id', 'lemmatize_joined'],
            index = False)