# Imports

In [1]:
import os
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [2]:
FILES_PATH = os.path.join(os.getcwd(), os.pardir, os.pardir, 'Dataset')
pattern = r'<TITLE>(.*?)</TITLE>.*?<TEXT>(.*?)</TEXT>'

# Part 1: Text Extraction

In [3]:
for filename in os.listdir(FILES_PATH):

    filepath = os.path.join(FILES_PATH, filename)
    with open(filepath, 'r') as f:
        contents = f.read()

    # Extracting the desired contents and concatenate them with a space
    # new_content = ' '.join(re.findall(pattern, content, flags=re.DOTALL)[0])
    title, text = re.findall(pattern, contents, flags=re.DOTALL)[0]
    new_contents = f'{title.strip()} {text.strip()}'

    # Saving the new contents in the same file
    with open(filepath, 'w') as f:
        f.write(new_contents)

    # saving in alternate folder to avoid overwriting, uncomment the above lines to overwrite (just for testing)
    # with open(os.path.join('../../Dataset/CSE508_Winter2023_Dataset', 'changed_files', filename), 'w') as f:
    #     f.write(new_contents)
    
    # Print the contents of 5 sample files before and after performing the operation
    if int(filename[10:14]) in [10, 50, 100, 500, 1000]:
        print('File:', filename)
        print('Before:', contents)
        print('After:', new_contents)
        print('----------------------------------------')

File: cranfield0010
Before: <DOC>
<DOCNO>
10
</DOCNO>
<TITLE>
the theory of the impact tube at low pressure .
</TITLE>
<AUTHOR>
chambre,p.l. and schaaf,s.a.
</AUTHOR>
<BIBLIO>
j. ae. scs. 15, 1948, 735.
</BIBLIO>
<TEXT>
  a theoretical analysis has been made for an impact tube of the
relation between free-stream mach number and the impact and
free-stream pressures and densities for extremely low pressures .
it is shown that the results differ appreciably from the corresponding
continuum relations .
</TEXT>
</DOC>

After: the theory of the impact tube at low pressure . a theoretical analysis has been made for an impact tube of the
relation between free-stream mach number and the impact and
free-stream pressures and densities for extremely low pressures .
it is shown that the results differ appreciably from the corresponding
continuum relations .
----------------------------------------
File: cranfield0050
Before: <DOC>
<DOCNO>
50
</DOCNO>
<TITLE>
investigation of laminar boundary layer 

In [None]:
'''alternate code for extracting title and text'''

# # Iterating over all files in the folder
# for filename in os.listdir(FILES_PATH):

#     # if filename.startswith('cranfield'):
#     filepath = os.path.join(FILES_PATH, filename)
#     with open(filepath, 'r') as f:
#         contents = f.read()
    
#     # Extracting the contents in the title tag
#     title_start = contents.find('<TITLE>') + len('<TITLE>')
#     title_end = contents.find('</TITLE>', title_start)
#     title = contents[title_start:title_end].strip()
    
#     # Extracting the contents in the text tag
#     text_start = contents.find('<TEXT>') + len('<TEXT>')
#     text_end = contents.find('</TEXT>', text_start)
#     text = contents[text_start:text_end].strip()
    
#     # Concatenating the contents
#     new_contents = title + ' ' + text
    
#     # Save the new contents in the same file
#     # with open(filepath, 'w') as f:
#     #     f.write(new_contents)
    
#     # Printing the contents of 5 sample files before and after performing the operation
#     if int(filename[10:14]) in [10, 50, 100, 500, 1000]:
#         print('File:', filename)
#         print('Before:', contents)
#         print('After:', new_contents)
#         print('----------------------------------------')

# Part 2: Text Preprocessing

In [4]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\praty\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\praty\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# folder_path = '../../Dataset/CSE508_Winter2023_Dataset/changed_files/'

for filename in os.listdir(FILES_PATH):

    filepath = os.path.join(FILES_PATH, filename)
    with open(filepath, 'r') as f:
        contents = f.read()

    if int(filename[10:14]) in [1, 10, 100, 500, 1000]:
        print('File:', filename)
        print('Before:', contents)

    # Lowercasing the text
    contents = contents.lower()
    if int(filename[10:14]) in [10, 50, 100, 500, 1000]:
        print('After lowercase:', contents)

    # Performing tokenization
    tokens = word_tokenize(contents)
    if int(filename[10:14]) in [10, 50, 100, 500, 1000]:
        print('After tokenization:', tokens)

    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    if int(filename[10:14]) in [10, 50, 100, 500, 1000]:
        print('After removing stopwords:', tokens)

    # Removing punctuations
    tokens = [token for token in tokens if token not in punctuation]
    if int(filename[10:14]) in [10, 50, 100, 500, 1000]:
        print('After removing punctuations:', tokens)

    # Removing blank space tokens
    tokens = [token for token in tokens if token.strip()]
    if int(filename[10:14]) in [10, 50, 100, 500, 1000]:
        print('After removing blank space tokens:', tokens)
    
    new_contents = ' '.join(tokens)

    # Save the new contents in the same file
    # with open(filepath, 'w') as f:
    #     f.write(new_contents)

    if int(filename[10:14]) in [10, 50, 100, 500, 1000]:
        print('----------------------------------------')

File: cranfield0001
Before: experimental investigation of the aerodynamics of a
wing in a slipstream . an experimental study of a wing in a propeller slipstream was
made in order to determine the spanwise distribution of the lift
increase due to slipstream at different angles of attack of the wing
and at different free stream to slipstream velocity ratios .  the
results were intended in part as an evaluation basis for different
theoretical treatments of this problem .
  the comparative span loading curves, together with supporting
evidence, showed that a substantial part of the lift increment
produced by the slipstream was due to a /destalling/ or boundary-layer-control
effect .  the integrated remaining lift increment,
after subtracting this destalling lift, was found to agree
well with a potential flow theory .
  an empirical evaluation of the destalling effects was made for
the specific configuration of the experiment .
File: cranfield0010
Before: the theory of the impact tube at lo