In [None]:
import pandas as pd
import numpy as np
import subprocess
import pickle
import os
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer

# Download raw data from Hugging Face

In [None]:
# Optional one-off download step, kept commented out.
# Uncommenting it needs the `datasets` package and writes the two CSV files
# (raw_data/train.csv and raw_data/test.csv) that the processing cells below read.
# NOTE(review): confirm the raw_data/ directory exists before running this.
# from datasets import load_dataset
# dataset = load_dataset("yahoo_answers_topics")
# dataset['train'].to_csv("raw_data/train.csv")
# dataset['test'].to_csv("raw_data/test.csv")

# Lemmatize sentence

In [3]:
def nltk_pos_tagger(nltk_tag):
    """Map a POS tag (as produced by ``nltk.pos_tag``) to a WordNet POS constant.

    Only the start of ``nltk_tag`` is inspected: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Tags with any other prefix return ``None``.
    """
    # (tag prefix, attribute name on ``wordnet``), checked in this order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # ``wordnet`` is only touched when a prefix actually matches.
            return getattr(wordnet, attr)
    return None

def lemmatize_sentence(sentence, lemmatizer=None):
    """Lemmatize every word of ``sentence`` and return the result as one string.

    The sentence is split into runs of word characters (letters, digits and
    underscore); punctuation and whitespace are dropped. Each token is POS-tagged
    with ``nltk.pos_tag`` and lemmatized with the WordNet POS derived from that
    tag. Tokens whose tag has no WordNet equivalent are kept unchanged.

    Parameters
    ----------
    sentence : str
        Text to lemmatize.
    lemmatizer : object, optional
        Any object with a ``lemmatize(word, pos)`` method. When omitted, a new
        ``WordNetLemmatizer`` is created, so the function does not depend on a
        module-level ``lemmatizer`` variable having been defined by another cell.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # r'\w+' keeps digits and underscores too; use r'[a-zA-Z]+' for letters only.
    tokenizer = RegexpTokenizer(r'\w+')
    tagged_tokens = nltk.pos_tag(tokenizer.tokenize(sentence))

    lemmas = []
    for word, pos_tag in tagged_tokens:
        wordnet_pos = nltk_pos_tagger(pos_tag)
        if wordnet_pos is None:
            # No WordNet category for this tag: keep the surface form.
            lemmas.append(word)
        else:
            lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return " ".join(lemmas)

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        Output path without the ``.pkl`` extension. It may include directories;
        missing parent directories are created so that a long computation is not
        lost to a ``FileNotFoundError`` at save time.
    """
    path = name + '.pkl'
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # An empty dirname means "current directory", which needs no creation.
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in the pickle file ``name + '.pkl'``.

    ``name`` is the path without the ``.pkl`` extension. Unpickling can execute
    arbitrary code, so only use this on files you created yourself.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [5]:
# Smoke test: create the WordNet lemmatizer and run one sample sentence through
# lemmatize_sentence; the cell displays the lemmatized string.
lemmatizer = WordNetLemmatizer()
lemmatize_sentence(
    "If the light had not been turned off, the house would have catched on fire."
)

'If the light have not be turn off the house would have catch on fire'

In [10]:
def _count_data_rows(file_path):
    """Return the number of lines in ``file_path`` after the header line (never negative)."""
    with open(file_path, 'rb') as f:
        total_lines = sum(1 for _ in f)
    return max(total_lines - 1, 0)


def preprocessing_lemmatize(file_path, output_prefix, row_per_time):
    """Lemmatize a CSV file in batches and pickle each batch.

    The file is streamed once in chunks of ``row_per_time`` rows. For every chunk,
    the title, content and best-answer texts of each row are joined with single
    spaces (missing values become '.') and passed to ``lemmatize_sentence``.
    Two pickles are written per chunk via ``save_obj``:

    * ``{output_prefix}_corpus_{offset}.pkl`` - list of lemmatized strings
    * ``{output_prefix}_labels_{offset}.pkl`` - array of the ``topic`` values

    where ``offset`` is the 0-based index of the chunk's first data row.

    Parameters
    ----------
    file_path : str
        CSV file with a header line; the columns at positions 2-5 are used as
        topic, question_title, question_content and best_answer.
    output_prefix : str
        Path prefix for the output pickles.
    row_per_time : int
        Number of rows per batch; must be >= 1.

    Raises
    ------
    ValueError
        Raised by pandas if ``row_per_time`` is smaller than 1.

    Notes
    -----
    * Lines are counted in Python instead of shelling out to ``wc -l``, which is
      unavailable on Windows and unsafe for paths containing spaces or shell
      metacharacters.
    * ``lemmatize_sentence`` is called with the sentence only and is not handed a
      lemmatizer from here, so none is created in this function.
    * The progress label shows the real end of each batch, so the final batch is
      no longer labelled past the end of the file.
    """
    text_columns = ['question_title', 'question_content', 'best_answer']

    # Total number of data rows (only used for the progress message).
    num_lines = _count_data_rows(file_path)
    print("Processing {} ({} lines): ".format(file_path, num_lines))

    # One streaming pass: the header is parsed once and each chunk continues where
    # the previous one stopped, instead of re-reading the file from the top.
    reader = pd.read_csv(file_path, usecols=[2, 3, 4, 5], chunksize=row_per_time)
    skiprows = 0  # index of the first data row of the current batch
    try:
        for raw_data in reader:
            if len(raw_data) == 0:
                # A header-only file can yield a single empty chunk.
                continue
            print("{} - {}:".format(skiprows, skiprows + len(raw_data)), end=" ")
            raw_data.columns = ['topic'] + text_columns

            # get labels
            labels = raw_data.topic.values

            # get corpus
            text = raw_data[text_columns].fillna('.').astype('string')
            corpus = []
            rows = zip(*(text[column] for column in text_columns))
            for i, parts in enumerate(rows, start=1):
                if i % 500 == 0:
                    print(i, end=' ')
                corpus.append(lemmatize_sentence(" ".join(parts)))

            # save files
            save_obj(corpus, "{}_corpus_{}".format(output_prefix, skiprows))
            save_obj(labels, "{}_labels_{}".format(output_prefix, skiprows))

            # advance to the next batch
            skiprows += len(raw_data)
            print()
    finally:
        # Release the file handle even if lemmatization is interrupted.
        reader.close()

    print("Finished!")

In [11]:
# Lemmatize the test split: read raw_data/test.csv and write pickles under
# the lemmatized_data/test prefix, 5000 rows per batch.
row_per_time = 5000
file_path = os.path.join("raw_data", "test.csv")
output_prefix = os.path.join("lemmatized_data", "test")
preprocessing_lemmatize(
    file_path=file_path,
    output_prefix=output_prefix,
    row_per_time=row_per_time,
)

Processing data\test.csv (60000 lines): 
0 - 5000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
5000 - 10000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
10000 - 15000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
15000 - 20000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
20000 - 25000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
25000 - 30000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
30000 - 35000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
35000 - 40000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
40000 - 45000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
45000 - 50000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
50000 - 55000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
55000 - 60000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
Finished!


In [12]:
# Lemmatize the train split: read raw_data/train.csv and write pickles under
# the lemmatized_data/train prefix, 5000 rows per batch.
row_per_time = 5000
file_path = os.path.join("raw_data", "train.csv")
output_prefix = os.path.join("lemmatized_data", "train")
preprocessing_lemmatize(
    file_path=file_path,
    output_prefix=output_prefix,
    row_per_time=row_per_time,
)

Processing data\train.csv (1400000 lines): 
0 - 5000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
5000 - 10000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
10000 - 15000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
15000 - 20000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
20000 - 25000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
25000 - 30000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
30000 - 35000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
35000 - 40000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
40000 - 45000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
45000 - 50000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
50000 - 55000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
55000 - 60000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
60000 - 65000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
65000 - 70000: 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 
70000 - 75000: 500 1000 1500 2000 2500 3000 3500 400

KeyboardInterrupt: 

Unnamed: 0,topic,question_title,question_content,best_answer
0,8,What makes friendship click?,How does the spark keep going?,good communication is what does it. Can you m...
1,1,Why does Zebras have stripes?,What is the purpose or those stripes? Who do t...,this provides camouflage - predator vision is ...
2,3,What did the itsy bitsy sipder climb up?,.,waterspout
3,3,What is the difference between a Bachelors and...,.,One difference between a Bachelors and a Maste...
4,2,Why do women get PMS?,.,Premenstrual syndrome (PMS) is a group of symp...
...,...,...,...,...
95,3,Hay um can some1 pleaseeee tell me a bone that...,plzzz i really need this fro a project!!!!!!!!,"The Xiphoid, attached to the sternum is bone w..."
96,6,Do you belive that the world can go through an...,.,"I think yes! due to the technology we have, al..."
97,7,Looking for Charlotte's Web DVD in Spanish? do...,.,"No, both the full-screen (ASIN: B00005N89B) an..."
98,4,"when i get some e mails, instead of a picture ...",.,it's b/c it's in HTML. at the bottom of the e-...
