In [None]:
import os
import re
import xml.etree.ElementTree as ET
import shutil

In [None]:
def clean_xml_text(xml_content):
   """
   Strip XML markup from a string and return its plain text on one line.

   Processing order:
     1. XML comments (``<!-- ... -->``) are removed as a whole, so a ``>``
        inside a comment cannot leave half of the comment behind.
     2. Every remaining tag is replaced by a single space rather than by
        nothing; otherwise the text of adjacent elements is glued together
        (``<a>Hello</a><b>World</b>`` would become ``HelloWorld``).
     3. Entity and character references (``&amp;``, ``&lt;``, ``&#169;`` ...)
        are decoded. This happens after tag removal so that an escaped
        ``&lt;b&gt;`` in the text is kept as literal ``<b>``.
     4. Runs of whitespace are collapsed to one space and the result is
        stripped at both ends.

   Args:
      xml_content: The XML document or fragment as a ``str``.

   Returns:
      The text content as a ``str`` without tags, comments or line breaks.
   """
   # Local import keeps the function usable without touching the import cell.
   import html

   # Comments first: the generic tag pattern below stops at the first '>'.
   without_comments = re.sub(r"<!--.*?-->", " ", xml_content, flags=re.DOTALL)

   # A space (not "") keeps words from neighbouring elements apart.
   without_tags = re.sub(r"<[^>]+>", " ", without_comments)

   # Decode references before collapsing whitespace, since a decoded
   # reference may itself be whitespace (e.g. a non-breaking space).
   decoded_text = html.unescape(without_tags)

   return re.sub(r"\s+", " ", decoded_text).strip()

def process_xml_files(input_folder, output_folder):
   """
   Convert every XML file in ``input_folder`` into a text file in ``output_folder``.

   Each regular file whose name ends in ``.xml`` (case-insensitive) is read,
   passed through ``clean_xml_text`` and written as ``<stem>.txt`` to
   ``output_folder``, which is created when missing. One "Processed" line is
   printed per converted file.

   Args:
      input_folder: Directory that contains the ``.xml`` files.
      output_folder: Directory that receives the ``.txt`` files.

   Returns:
      None.
   """
   # exist_ok avoids the check-then-create race of a separate exists() test.
   os.makedirs(output_folder, exist_ok=True)

   # Sorted so the processing order (and the log) is the same on every run.
   for filename in sorted(os.listdir(input_folder)):
      input_path = os.path.join(input_folder, filename)

      # Skip other file types and any directory that merely ends in ".xml".
      if not filename.lower().endswith(".xml") or not os.path.isfile(input_path):
         continue

      # splitext swaps only the final extension; str.replace would also
      # rewrite ".xml" occurring earlier in the name.
      stem = os.path.splitext(filename)[0]
      output_path = os.path.join(output_folder, stem + ".txt")

      # "utf-8-sig" reads plain UTF-8 too, but drops a leading byte-order
      # mark so it does not leak into the text.
      with open(input_path, "r", encoding="utf-8-sig") as file:
         raw_content = file.read()

      cleaned_content = clean_xml_text(raw_content)

      with open(output_path, "w", encoding="utf-8") as output_file:
         output_file.write(cleaned_content)

      print(f"Processed: {filename} -> {output_path}")


# Step 1: turn each raw XML document of the dataset into a plain-text file.
# Both locations are absolute paths on the author's machine.
input_folder = r"D:\DS-B3\NLP\xml_data\dataset"
output_folder = r"D:\DS-B3\NLP\xml_data\data_txt"

process_xml_files(input_folder=input_folder, output_folder=output_folder)

Processed: 10015.xml -> D:\DS-B3\NLP\xml_data\data_txt\10015.txt
Processed: 10025.xml -> D:\DS-B3\NLP\xml_data\data_txt\10025.txt
Processed: 10027.xml -> D:\DS-B3\NLP\xml_data\data_txt\10027.txt
Processed: 10029.xml -> D:\DS-B3\NLP\xml_data\data_txt\10029.txt
Processed: 10030.xml -> D:\DS-B3\NLP\xml_data\data_txt\10030.txt
Processed: 10033.xml -> D:\DS-B3\NLP\xml_data\data_txt\10033.txt
Processed: 10042.xml -> D:\DS-B3\NLP\xml_data\data_txt\10042.txt
Processed: 10046.xml -> D:\DS-B3\NLP\xml_data\data_txt\10046.txt
Processed: 10047.xml -> D:\DS-B3\NLP\xml_data\data_txt\10047.txt
Processed: 10049.xml -> D:\DS-B3\NLP\xml_data\data_txt\10049.txt
Processed: 10106.xml -> D:\DS-B3\NLP\xml_data\data_txt\10106.txt
Processed: 10135.xml -> D:\DS-B3\NLP\xml_data\data_txt\10135.txt
Processed: 10185.xml -> D:\DS-B3\NLP\xml_data\data_txt\10185.txt
Processed: 10186.xml -> D:\DS-B3\NLP\xml_data\data_txt\10186.txt
Processed: 10214.xml -> D:\DS-B3\NLP\xml_data\data_txt\10214.txt
Processed: 10226.xml -> D

In [None]:
def move_small_text_files(input_folder, output_folder, word_limit=2000):
   """
   Relocate short text files from one folder to another.

   Every entry of ``input_folder`` whose name ends in ".txt" is read as
   UTF-8 and split on whitespace. When the number of resulting tokens is
   at most ``word_limit``, the file is moved into ``output_folder`` (created
   first if it does not exist) and a "Moved" line is printed. Longer files
   stay where they are.
   """
   destination_missing = not os.path.exists(output_folder)
   if destination_missing:
      os.makedirs(output_folder)

   # Snapshot the candidate names before any file is moved.
   txt_names = [entry for entry in os.listdir(input_folder) if entry.endswith(".txt")]

   for txt_name in txt_names:
      source_path = os.path.join(input_folder, txt_name)

      # The handle is closed before the move below.
      with open(source_path, "r", encoding="utf-8") as handle:
         n_words = len(handle.read().split())

      # Files above the limit are left untouched.
      if not (n_words <= word_limit):
         continue

      target_path = os.path.join(output_folder, txt_name)
      shutil.move(source_path, target_path)
      print(f"Moved: {txt_name} ({n_words} words)")

In [None]:
# Step 2: separate the short documents from the converted text files,
# using the function's default word limit.
input_folder = r"D:\DS-B3\NLP\xml_data\data_txt"
output_folder = r"D:\DS-B3\NLP\xml_data\under2k"

move_small_text_files(input_folder=input_folder, output_folder=output_folder)

Moved: 10015.txt (1615 words)
Moved: 10047.txt (1904 words)
Moved: 10106.txt (1962 words)
Moved: 10135.txt (14 words)
Moved: 10185.txt (37 words)
Moved: 10186.txt (1588 words)
Moved: 10317.txt (1284 words)
Moved: 10418.txt (708 words)
Moved: 10461.txt (1632 words)
Moved: 10564.txt (1468 words)
Moved: 10573.txt (1765 words)
Moved: 10576.txt (1915 words)
Moved: 10599.txt (64 words)
Moved: 10602.txt (33 words)
Moved: 10605.txt (277 words)
Moved: 10608.txt (1360 words)
Moved: 10609.txt (1179 words)
Moved: 10610.txt (1037 words)
Moved: 10611.txt (967 words)
Moved: 10613.txt (69 words)
Moved: 10614.txt (238 words)
Moved: 10617.txt (1569 words)
Moved: 10620.txt (1276 words)
Moved: 10646.txt (570 words)
Moved: 10653.txt (290 words)
Moved: 10659.txt (1468 words)
Moved: 10663.txt (1484 words)
Moved: 10669.txt (370 words)
Moved: 10671.txt (1012 words)
Moved: 10672.txt (809 words)
Moved: 10675.txt (34 words)
Moved: 10677.txt (24 words)
Moved: 10678.txt (821 words)
Moved: 10692.txt (77 words)
Moved