In [None]:
pip install warcio

Extract text from a single WARC file and store it as a .txt file; this code does not remove HTML tags.

In [None]:
from warcio.archiveiterator import ArchiveIterator
import os
import html

def extract_text(record):
    """Return the entity-unescaped HTML body of a WARC response record.

    Only records whose ``rec_type`` is ``'response'``, that carry HTTP
    headers, and whose ``Content-Type`` header contains ``text/html`` are
    handled; ``None`` is returned for everything else. The payload is decoded
    as UTF-8 (undecodable bytes are replaced) and HTML tags are left in place.
    """
    # Guard clauses: bail out early on anything that is not an HTML response.
    if record.rec_type != 'response':
        return None
    headers = record.http_headers
    if not headers:
        return None
    if 'text/html' not in headers.get_header('Content-Type', '').lower():
        return None

    raw_bytes = record.content_stream().read()
    return html.unescape(raw_bytes.decode('utf-8', 'replace'))

def extract_text_from_warc(warc_filename, output_folder):
    """Append the HTML of every ``text/html`` response in a WARC to a text file.

    The text returned by ``extract_text`` for each record is appended, without
    any separator, to ``output_warc.txt`` inside ``output_folder``.

    Args:
        warc_filename: Path of the WARC file to read.
        output_folder: Folder that receives ``output_warc.txt``; it is created
            when it does not exist yet.

    Returns:
        None. Any exception raised while processing is caught and reported
        with ``print`` instead of being propagated.
    """
    output_file = os.path.join(output_folder, "output_warc.txt")
    try:
        # An empty folder name means "current directory"; makedirs('') would fail.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)
        # Open the output once instead of re-opening it for every record.
        # Append mode is kept so repeated calls keep adding to the same file.
        with open(warc_filename, 'rb') as stream, \
                open(output_file, 'a', encoding='utf-8') as outfile:
            for record in ArchiveIterator(stream):
                text = extract_text(record)
                if text:
                    outfile.write(text)
    except Exception as e:
        # Best-effort: report the failure and return rather than crash the notebook.
        print(f"Error processing {warc_filename}: {e}")

In [None]:
# Path to where your WARC is.
warc_path = 'path/to/WARC'
# Path to the folder where you wish to save your .txt.
folder_output = 'path/to/output_folder'

extract_text_from_warc(warc_filename=warc_path, output_folder=folder_output)

Extract text from a single WARC file and store it as a .txt file; this code removes all HTML tags and normalizes the text (lowercases it, replaces punctuation with spaces, and drops words shorter than three characters).

In [None]:
from warcio.archiveiterator import ArchiveIterator
import os
import html
import re
import string

def extract_text_no_html(record):
    """Return the visible text of an HTML response record, or ``None``.

    Only records whose ``rec_type`` is ``'response'``, that carry HTTP
    headers, and whose ``Content-Type`` header contains ``text/html`` are
    handled. The payload is decoded as UTF-8 (undecodable bytes are
    replaced), ``<script>``/``<style>`` elements and all remaining tags are
    removed, HTML entities are unescaped and runs of whitespace are collapsed
    to a single space.

    Args:
        record: A record object exposing ``rec_type``, ``http_headers`` and
            ``content_stream()``.

    Returns:
        The stripped text (possibly an empty string), or ``None`` when the
        record is not an HTML response.
    """
    if record.rec_type != 'response' or not record.http_headers:
        return None
    content_type = record.http_headers.get_header('Content-Type', '').lower()
    if 'text/html' not in content_type:
        return None

    html_content = record.content_stream().read().decode('utf-8', 'replace')
    # Tag names are case-insensitive in HTML, so <SCRIPT>/<STYLE> must match too.
    text = re.sub(r'<script\b[\s\S]*?</script\s*>', ' ', html_content, flags=re.IGNORECASE)
    text = re.sub(r'<style\b[\s\S]*?</style\s*>', ' ', text, flags=re.IGNORECASE)
    # Replace tags with a space (not nothing) so that the text of adjacent
    # elements such as "<p>a</p><p>b</p>" does not get glued into "ab".
    text = re.sub(r'<[^>]+>', ' ', text)
    # Unescape only after the tags are gone, so "&lt;b&gt;" stays literal text.
    text = html.unescape(text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def normalize(text):
    """Lowercase ``text``, strip punctuation and drop very short words.

    Every character in ``string.punctuation`` is replaced by a space, the
    result is lowercased and split on whitespace, and only words longer than
    two characters are kept.

    Args:
        text: The string to normalize. ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        The remaining words joined by single spaces.
    """
    # Records without extractable text arrive here as None; treat as empty.
    if not text:
        return ''

    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    words = text.translate(translator).lower().split()
    return ' '.join(word for word in words if len(word) > 2)

def extract_text_normalized_from_warc(warc_filename, output_folder):
    """Append the normalized text of each HTML response in a WARC to a file.

    Each record is passed through ``extract_text_no_html`` and ``normalize``;
    every non-empty result is appended as one line to ``output_warc.txt``
    inside ``output_folder``.

    Args:
        warc_filename: Path of the WARC file to read.
        output_folder: Folder that receives ``output_warc.txt``; it is created
            when it does not exist yet.

    Returns:
        None. Any exception raised while processing is caught and reported
        with ``print`` instead of being propagated.
    """
    output_file = os.path.join(output_folder, "output_warc.txt")
    try:
        # An empty folder name means "current directory"; makedirs('') would fail.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)
        # Open the output once instead of re-opening it for every record.
        with open(warc_filename, 'rb') as stream, \
                open(output_file, 'a', encoding='utf-8') as outfile:
            for record in ArchiveIterator(stream):
                text = extract_text_no_html(record)
                # Non-HTML records yield None; skip them *before* normalizing,
                # otherwise the first such record aborts the whole file.
                if not text:
                    continue
                normalized_text = normalize(text)
                # Skip pages whose text normalizes to nothing (no blank lines).
                if normalized_text:
                    outfile.write(normalized_text + '\n')
    except Exception as e:
        # Best-effort: report the failure and return rather than crash the notebook.
        print(f"Error processing {warc_filename}: {e}")

In [None]:
# Path to where your WARC is.
warc_path = 'path/to/WARC'
# Path to the folder where you wish to save your .txt.
folder_output = 'path/to/output_folder'

extract_text_normalized_from_warc(warc_filename=warc_path, output_folder=folder_output)

Extract text from a folder of WARCs and store the text of each WARC in its own numbered .txt file (output_1.txt, output_2.txt, ...); this code removes all HTML tags and normalizes the text. It also gets rid of punctuation.

In [None]:
from warcio.archiveiterator import ArchiveIterator
import os
import html
import re
import string

def extract_text_no_html(record):
    """Return the visible text of an HTML response record, or ``None``.

    Only records whose ``rec_type`` is ``'response'``, that carry HTTP
    headers, and whose ``Content-Type`` header contains ``text/html`` are
    handled. The payload is decoded as UTF-8 (undecodable bytes are
    replaced), ``<script>``/``<style>`` elements and all remaining tags are
    removed, HTML entities are unescaped and runs of whitespace are collapsed
    to a single space.

    Args:
        record: A record object exposing ``rec_type``, ``http_headers`` and
            ``content_stream()``.

    Returns:
        The stripped text (possibly an empty string), or ``None`` when the
        record is not an HTML response.
    """
    if record.rec_type != 'response' or not record.http_headers:
        return None
    content_type = record.http_headers.get_header('Content-Type', '').lower()
    if 'text/html' not in content_type:
        return None

    html_content = record.content_stream().read().decode('utf-8', 'replace')
    # Tag names are case-insensitive in HTML, so <SCRIPT>/<STYLE> must match too.
    text = re.sub(r'<script\b[\s\S]*?</script\s*>', ' ', html_content, flags=re.IGNORECASE)
    text = re.sub(r'<style\b[\s\S]*?</style\s*>', ' ', text, flags=re.IGNORECASE)
    # Replace tags with a space (not nothing) so that the text of adjacent
    # elements such as "<p>a</p><p>b</p>" does not get glued into "ab".
    text = re.sub(r'<[^>]+>', ' ', text)
    # Unescape only after the tags are gone, so "&lt;b&gt;" stays literal text.
    text = html.unescape(text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def normalize(text):
    """Lowercase ``text``, strip punctuation and drop very short words.

    Every character in ``string.punctuation`` is replaced by a space, the
    result is lowercased and split on whitespace, and only words longer than
    two characters are kept.

    Args:
        text: The string to normalize. ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        The remaining words joined by single spaces.
    """
    # Records without extractable text arrive here as None; treat as empty.
    if not text:
        return ''

    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    words = text.translate(translator).lower().split()
    return ' '.join(word for word in words if len(word) > 2)

def extract_text_normalized_from_folder(input_folder, output_folder):
    """Write the normalized HTML text of every WARC in a folder to text files.

    Each file in ``input_folder`` whose name ends in ``.warc`` or ``.gz`` gets
    its own output file ``output_<n>.txt`` in ``output_folder``, numbered from
    1 in sorted filename order. Each HTML response record contributes one line
    produced by ``extract_text_no_html`` followed by ``normalize``.

    Args:
        input_folder: Folder containing the WARC files.
        output_folder: Folder that receives the ``output_<n>.txt`` files; it
            is created when it does not exist yet.

    Returns:
        None. An exception raised while processing one file is caught and
        reported with ``print``; the remaining files are still processed.
    """
    output_counter = 1
    # Sort the listing: os.listdir order is arbitrary, which would make the
    # mapping between WARC files and output numbers differ from run to run.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(('.warc', '.gz')):
            continue
        output_file = os.path.join(output_folder, f"output_{output_counter}.txt")
        output_counter += 1
        file_path = os.path.join(input_folder, filename)
        try:
            # An empty folder name means "current directory"; makedirs('') would fail.
            if output_folder:
                os.makedirs(output_folder, exist_ok=True)
            with open(file_path, 'rb') as stream, \
                    open(output_file, 'w', encoding='utf-8') as outfile:
                for record in ArchiveIterator(stream):
                    text = extract_text_no_html(record)
                    # Non-HTML records yield None; skip them *before*
                    # normalizing, otherwise the first one aborts the file.
                    if not text:
                        continue
                    normalized_text = normalize(text)
                    # Write the normalized text (not the raw extracted text)
                    # and skip pages that normalize to nothing.
                    if normalized_text:
                        outfile.write(normalized_text + '\n')
        except Exception as e:
            # Best-effort: report the failure and continue with the next file.
            print(f"Error processing {file_path}: {e}")

In [None]:
# Path to the folder where your WARCs are.
folder_input = 'path/to/WARCs_folder'
# Path to the folder where you wish to save your .txt.
folder_output = 'path/to/output_folder'

extract_text_normalized_from_folder(input_folder=folder_input, output_folder=folder_output)