# Imports

In [6]:
import os
import random
import re
from string import punctuation

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Folder holding the raw dataset files (relative path, resolved against the
# current working directory).
FILES_PATH = '../../Dataset/CSE508_Winter2023_Dataset/CSE508_Winter2023_Dataset'
# Group 1 captures the <TITLE> body and group 2 the <TEXT> body. The pattern is
# applied with re.DOTALL at the call site so that '.' also matches line breaks.
pattern = r'<TITLE>(.*?)</TITLE>.*?<TEXT>(.*?)</TEXT>'

# Part 1: Text Extraction

In [None]:
# Extract the <TITLE> and <TEXT> sections of every dataset file and store them
# as "<title> <text>" in a separate 'changed_files' folder, so the original
# dataset is never overwritten. (To overwrite in place instead, write
# new_contents back to filepath.)
output_dir = os.path.join('../../Dataset/CSE508_Winter2023_Dataset', 'changed_files')
# open(..., 'w') does not create missing folders, so make sure it exists first
os.makedirs(output_dir, exist_ok=True)

# Numeric ids of the 5 sample files printed before and after the operation
sample_ids = {10, 50, 100, 500, 1000}

for filename in os.listdir(FILES_PATH):

    filepath = os.path.join(FILES_PATH, filename)
    if not os.path.isfile(filepath):
        # Sub-folders cannot be read as text files
        continue

    with open(filepath, 'r') as f:
        contents = f.read()

    # Extracting the desired contents; DOTALL lets '.' span line breaks
    match = re.search(pattern, contents, flags=re.DOTALL)
    if match is None:
        # Report and skip the file instead of aborting the run with an IndexError
        print('Skipping (no <TITLE>/<TEXT> section found):', filename)
        continue

    # Concatenate the stripped title and text with a single space
    title, text = match.groups()
    new_contents = f'{title.strip()} {text.strip()}'

    with open(os.path.join(output_dir, filename), 'w') as f:
        f.write(new_contents)

    # Print the contents of 5 sample files before and after performing the operation.
    # NOTE(review): characters 10-13 of the name are presumably the numeric file
    # id (e.g. 'cranfield0010'); names that do not fit are simply not printed.
    file_id = filename[10:14]
    if file_id.isdecimal() and int(file_id) in sample_ids:
        print('File:', filename)
        print('Before:', contents)
        print('After:', new_contents)
        print('----------------------------------------')

In [None]:
'''alternate code for extracting title and text'''

# # Iterating over all files in the folder
# for filename in os.listdir(FILES_PATH):

#     # if filename.startswith('cranfield'):
#     filepath = os.path.join(FILES_PATH, filename)
#     with open(filepath, 'r') as f:
#         contents = f.read()
    
#     # Extracting the contents in the title tag
#     title_start = contents.find('<TITLE>') + len('<TITLE>')
#     title_end = contents.find('</TITLE>', title_start)
#     title = contents[title_start:title_end].strip()
    
#     # Extracting the contents in the text tag
#     text_start = contents.find('<TEXT>') + len('<TEXT>')
#     text_end = contents.find('</TEXT>', text_start)
#     text = contents[text_start:text_end].strip()
    
#     # Concatenating the contents
#     new_contents = title + ' ' + text
    
#     # Save the new contents in the same file
#     # with open(filepath, 'w') as f:
#     #     f.write(new_contents)
    
#     # Printing the contents of 5 sample files before and after performing the operation
#     if int(filename[10:14]) in [10, 50, 100, 500, 1000]:
#         print('File:', filename)
#         print('Before:', contents)
#         print('After:', new_contents)
#         print('----------------------------------------')

# Part 2: Text Preprocessing

In [8]:
# Fetch the NLTK resources used by the preprocessing step below:
# the stop-word lists and the 'punkt' tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [None]:
# Preprocess every extracted file in place: lowercase, tokenize, drop stop
# words, drop punctuation tokens, drop blank tokens, then re-join with spaces.
folder_path = '../../Dataset/CSE508_Winter2023_Dataset/changed_files/'

# Numeric ids of the 5 sample files whose intermediate results are printed.
# A single shared set keeps the 'Before' print and all later prints in sync.
sample_ids = {10, 50, 100, 500, 1000}

# Loop invariants: build the lookup sets once instead of once per file
stop_words = set(stopwords.words('english'))
punctuation_chars = set(punctuation)

for filename in os.listdir(folder_path):

    filepath = os.path.join(folder_path, filename)
    with open(filepath, 'r') as f:
        contents = f.read()

    # NOTE(review): characters 10-13 of the name are presumably the numeric file
    # id (e.g. 'cranfield0010'); names that do not fit are processed silently.
    file_id = filename[10:14]
    is_sample = file_id.isdecimal() and int(file_id) in sample_ids

    if is_sample:
        print('File:', filename)
        print('Before:', contents)

    # Lowercasing the text
    contents = contents.lower()
    if is_sample:
        print('After lowercase:', contents)

    # Performing tokenization
    tokens = word_tokenize(contents)
    if is_sample:
        print('After tokenization:', tokens)

    # Removing stopwords
    tokens = [token for token in tokens if token not in stop_words]
    if is_sample:
        print('After removing stopwords:', tokens)

    # Removing punctuations: drop tokens made up solely of punctuation
    # characters. A plain `token in punctuation` is a substring test against
    # the punctuation string and lets tokens such as "''", "``" or "--" through.
    tokens = [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]
    if is_sample:
        print('After removing punctuations:', tokens)

    # Removing blank space tokens
    tokens = [token for token in tokens if token.strip()]
    if is_sample:
        print('After removing blank space tokens:', tokens)

    new_contents = ' '.join(tokens)

    # Save the new contents in the same file
    with open(filepath, 'w') as f:
        f.write(new_contents)

    if is_sample:
        print('----------------------------------------')

# ----------------------------END OF FILE--------------------------------

In [None]:
# Untouched dataset folder, located two levels above the working directory
ORIGINAL_PATH = os.path.join(
    os.getcwd(),
    os.pardir,
    os.pardir,
    'Dataset',
    'CSE508_Winter2023_Dataset',
    'CSE508_Winter2023_Dataset',
)
# Working copy of the dataset, kept next to the notebook
ALTERED_PATH = os.path.join(
    os.getcwd(),
    'DatasetAlter',
)

def create_alter(seed=1):
    """Write "<title> <text>" for every file in ORIGINAL_PATH to ALTERED_PATH.

    Each original file is parsed with BeautifulSoup; the stripped contents of
    its title and text tags are joined with a single space and written to a
    file of the same name under ALTERED_PATH (created if missing). For five
    pseudo-randomly chosen file numbers the before/after contents are printed.

    Args:
        seed: Seed for the random generator, so that the same five sample
            files are picked on every run.
    """
    # NOTE(review): BeautifulSoup (bs4) and the 'lxml' parser are used below,
    # but no import for them is visible in this file - confirm they are
    # imported before this function is called.
    random.seed(seed)
    random_samples = set(random.sample(range(1, 1401), 5))

    # exist_ok avoids the separate existence check
    os.makedirs(ALTERED_PATH, exist_ok=True)

    for filename in os.listdir(ORIGINAL_PATH):

        with open(os.path.join(ORIGINAL_PATH, filename), 'r') as f:
            original = f.read()

        soup = BeautifulSoup(original, 'lxml')
        title_tag = soup.title
        text_tag = soup.find('text')
        if title_tag is None or text_tag is None:
            # Report and skip instead of failing with an AttributeError
            print("Skipping (missing title or text tag): ", filename)
            continue

        # get_text() also works when the title holds nested markup, where
        # `.string` would be None
        content = title_tag.get_text().strip() + " " + text_tag.get_text().strip()

        # NOTE(review): the last four characters of the name are presumably the
        # file number; names that do not fit are written but never printed.
        suffix = filename[-4:]
        if suffix.isdecimal() and int(suffix) in random_samples:
            print("----------------------------------")
            print("Filename: ", filename)
            print("----------------------------------")
            print("Before: ")
            print(original)
            print("----------------------------------")
            print("After: ")
            print(content)
            print("----------------------------------")

        with open(os.path.join(ALTERED_PATH, filename), 'w') as fa:
            fa.write(content)

    print("Finished Processing")

In [None]:
def clear_alter():
    """Delete every file directly inside ALTERED_PATH.

    Does nothing when ALTERED_PATH does not exist yet, so it is safe to call
    before the folder has been created. Sub-directories are left in place,
    since os.remove cannot delete them.
    """
    if not os.path.isdir(ALTERED_PATH):
        return

    for filename in os.listdir(ALTERED_PATH):
        target = os.path.join(ALTERED_PATH, filename)
        # Symlinks are removed like files; only real directories are skipped
        if os.path.isdir(target) and not os.path.islink(target):
            continue
        os.remove(target)

In [None]:
# Rebuild the altered dataset from scratch: delete the files of a previous
# run first, then regenerate them from the original dataset.
clear_alter()
create_alter()

In [None]:
def _print_stage(stage, tokens):
    """Print the token list produced by one preprocessing stage."""
    print("----------------------------------")
    print(stage)
    print("After: ")
    print(tokens)
    print("----------------------------------")


def preprocess(seed=1):
    """Preprocess every file in ALTERED_PATH in place.

    The steps are: lowercase, tokenize, remove English stop words, remove
    punctuation tokens, remove blank tokens, and re-join the remaining tokens
    with single spaces. For five pseudo-randomly chosen file numbers the
    result of every step is printed.

    Args:
        seed: Seed for the random generator, so that the same five sample
            files are picked on every run.
    """
    random.seed(seed)
    random_samples = set(random.sample(range(1, 1401), 5))

    # Loop invariants: build the lookup sets once rather than reloading the
    # stop-word list for every single token
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(punctuation)

    for filename in os.listdir(ALTERED_PATH):
        filepath = os.path.join(ALTERED_PATH, filename)
        with open(filepath, 'r') as f:
            original = f.read()

        # NOTE(review): the last four characters of the name are presumably the
        # file number; names that do not fit are processed but never printed.
        suffix = filename[-4:]
        show = suffix.isdecimal() and int(suffix) in random_samples

        content = original.lower()
        if show:
            print("----------------------------------")
            print("LOWERCASE")
            print("----------------------------------")
            print("Filename: ", filename)
            print("----------------------------------")
            print("Before: ")
            print(original)
            print("----------------------------------")
            print("After: ")
            print(content)
            print("----------------------------------")

        content = word_tokenize(content)
        if show:
            _print_stage("TOKENIZE", content)

        content = [w for w in content if w not in stop_words]
        if show:
            _print_stage("STOPWORDS", content)

        # Drop tokens made up solely of punctuation characters. A plain
        # `w in punctuation` is a substring test against the punctuation
        # string and lets tokens such as "''", "``" or "--" through.
        content = [
            w for w in content
            if not all(char in punctuation_chars for char in w)
        ]
        if show:
            _print_stage("PUNCTUATION", content)

        content = [w for w in content if w.strip()]
        if show:
            _print_stage("BLANKSPACE", content)

        content = " ".join(content)

        with open(filepath, 'w') as fa:
            fa.write(content)

    print("Finished Processing")


In [None]:
# Run the preprocessing over the files in ALTERED_PATH with the default seed
preprocess()