In [2]:
import os
import re
import string

import nltk
from nltk.tokenize import word_tokenize
# nltk.download('punkt')
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
from nltk.stem import PorterStemmer

from bs4 import BeautifulSoup

In [3]:
# Sample input for manually exercising get_pre_processed: a short HTML snippet
# that mixes markup, digits, punctuation and a few inflected words.
test_text = """sees ran run <p><a href="http://forums.wpcentral.com/radar/188319.htm" rel="nofollow">This forum 123 thread</a> mentions that all 1st gen devices could do that, and Dave Blake confirms that all Mango devices do support that.</p>

<p><a href="http://www.microsoft.com/southafrica/windowsphone/handsets.html" rel="nofollow">This page from Microsoft</a> has three 1st gen devices that mention A2DP support. I guess throwing a device name together with <code>A2DP</code> and <code>specifications</code> could help confirm if your device does, but as far as I can see all devices do support this.</p>"""

def get_pre_processed(text):
    """Turn a raw (possibly HTML) string into a list of normalized word stems.

    The steps run in this order: strip HTML markup, lowercase, delete
    standalone integers, delete ASCII punctuation, tokenize, drop English
    stopwords, lemmatize with WordNet (default part of speech), and finally
    apply the Porter stemmer.

    Args:
        text: Raw document text; may contain HTML markup.

    Returns:
        list[str]: The processed tokens, in document order.

    Note:
        Needs the NLTK ``punkt``, ``stopwords`` and ``wordnet`` data packages
        (see the commented-out ``nltk.download`` calls next to the imports).
    """
    # Remove HTML tags. Naming the parser explicitly avoids bs4's
    # GuessedAtParserWarning and keeps the result independent of which
    # optional parsers happen to be installed on the machine.
    plain = BeautifulSoup(text, "html.parser").get_text()

    # Lowercase the text
    plain = plain.lower()

    # Remove numbers: an optionally signed digit run that ends at a word
    # boundary. Digits followed by a letter ("1st", "a2dp") are left alone.
    plain = re.sub(r'[-+]?\d+\b', '', plain)

    # Remove punctuation. Characters are deleted, not replaced by a space,
    # so "don't" becomes "dont".
    plain = plain.translate(str.maketrans('', '', string.punctuation))

    # Tokenize
    tokens = word_tokenize(plain)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Stem tokens
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]


# print("\n===Before Pre Processing:===\n",test_text)
# test_text = get_pre_processed(test_text)
# # print("\n===After Pre Processing:===\n",test_text)
# print("\n===After Pre Processing:===\n"," ".join(test_text))

In [4]:
def store_train_validation_test(DATA_DIR,topics,n_train,n_validation,n_test):
    """Pre-process each topic's posts and write train/validation/test files.

    For every topic, ``<DATA_DIR>/<topic>.xml`` is parsed and its ``row``
    elements are visited in document order. Rows whose ``Body`` attribute is
    empty or missing are skipped and do not count towards any quota. Of the
    remaining rows, the first ``n_train`` go to ``train.in``, the next
    ``n_validation`` to ``validation.in`` and the next ``n_test`` to
    ``test.in``; anything beyond that is ignored. The counter restarts for
    each topic.

    Each output line is the space-joined result of ``get_pre_processed`` for
    one row, with the topic name appended as the last token (the label).

    The three output files are created (or overwritten) in the current
    working directory and are encoded as UTF-16.

    Args:
        DATA_DIR: Directory that holds one ``<topic>.xml`` file per topic.
        topics: Iterable of topic names (file stems inside ``DATA_DIR``).
        n_train: Number of documents per topic for the training file.
        n_validation: Number of documents per topic for the validation file.
        n_test: Number of documents per topic for the test file.

    Raises:
        OSError: If an input XML file cannot be opened or an output file
            cannot be created.
    """
    # Upper bounds (inclusive, 1-based) of the validation and test ranges.
    validation_end = n_train + n_validation
    test_end = validation_end + n_test

    # `with` guarantees every handle is closed even if parsing or
    # pre-processing raises part-way through.
    with open("train.in",'w',encoding='utf16') as train, \
            open("validation.in",'w',encoding='utf16') as validation, \
            open("test.in",'w',encoding='utf16') as test:
        for topic in topics:
            xml_file = os.path.join(DATA_DIR,f"{topic}.xml")
            with open(xml_file,'rb') as file:
                soup = BeautifulSoup(file.read(),'xml')

            num = 1  # 1-based index of the next non-empty row for this topic
            for row in soup.find_all("row"):
                # All quotas are filled: stop before paying for another
                # round of pre-processing.
                if num > test_end:
                    break

                # .get() tolerates rows that have no Body attribute at all.
                body = row.attrs.get('Body', '')
                if not body:
                    continue

                tokens = get_pre_processed(body)
                tokens.append(topic)
                doc = " ".join(tokens)

                if num <= n_train:
                    target = train
                elif num <= validation_end:
                    target = validation
                else:
                    target = test
                print(doc,file=target)
                num += 1

In [5]:
# Input locations, both resolved against the current working directory:
# the folder with the per-topic XML files and the text file listing the topics.
DATA_DIR = os.path.join(os.getcwd(), "Data", "Training")
topics_txt = os.path.join(os.getcwd(), "Data", "topics.txt")

print(DATA_DIR)
print(topics_txt)

C:\Users\ilove\Documents\ml_offline_2\Data\Training
C:\Users\ilove\Documents\ml_offline_2\Data\topics.txt


In [6]:
# Read the topic names, one per line. Blank lines are skipped so that a stray
# empty line cannot turn into an empty topic name (and a bogus ".xml" lookup).
with open(topics_txt, 'r') as f:
    topics = [line.strip() for line in f if line.strip()]
topics

['Coffee', 'Arduino', 'Windows_Phone']

In [8]:
# Build the split files: for each topic, 500 training, 100 validation and
# 500 test documents.
store_train_validation_test(DATA_DIR, topics, n_train=500, n_validation=100, n_test=500)