In [None]:
import lxml.html
from lxml.etree import tostring

filename = 'texts.html'

# Read the export line by line; the handle is released as soon as the
# `with` block exits.
with open(filename, 'r') as file:
    lines = file.readlines()

# Stitch the lines back into a single string and hand it to lxml's HTML parser.
string = ''.join(lines)
html = lxml.html.fromstring(string)
print(html)

# All Helper Functions

In [None]:
def getTimeStamp(element):
    """Return the text of the first 'timestamp'-classed node under element.

    The lookup uses ``element.find_class('timestamp')``. When nothing
    matches, an empty string is returned.
    """
    matches = element.find_class('timestamp')
    return matches[0].text_content() if matches else ''

def getSender(element):
    """Return the text of the first 'sender'-classed node under element.

    The lookup uses ``element.find_class('sender')``. When nothing
    matches, an empty string is returned.
    """
    matches = element.find_class('sender')
    return matches[0].text_content() if matches else ''

def getMessage(element):
    """Return the text of the first 'bubble'-classed node under element.

    The lookup uses ``element.find_class('bubble')``. When nothing
    matches, an empty string is returned.
    """
    matches = element.find_class('bubble')
    return matches[0].text_content() if matches else ''

def getReaction(element):
    """Return the text of the first 'reaction'-classed node under element.

    The lookup uses ``element.find_class('reaction')``. When nothing
    matches, an empty string is returned.
    """
    matches = element.find_class('reaction')
    return matches[0].text_content() if matches else ''

def getEditedElement(element):
    """Return the text of the first 'edited'-classed node under element.

    The lookup uses ``element.find_class('edited')``. When nothing
    matches, an empty string is returned.
    """
    matches = element.find_class('edited')
    return matches[0].text_content() if matches else ''

def hasAttachments(element):
    """Report whether element contains any 'attachment'-classed node.

    Returns True when ``element.find_class('attachment')`` yields at least
    one match, and False otherwise.
    """
    return bool(element.find_class('attachment'))

def _firstSourceSrc(media_element):
    """Return the 'src' of the first nested <source> under media_element, or None."""
    source_element = media_element.find('.//source')
    if source_element is None:
        return None
    return source_element.get('src')


def getAttachmentLinks(element):
    """Collect one link per 'attachment'-classed node found under element.

    For each attachment the first matching rule wins:
      1. a direct <img> child -> its 'src'
      2. any nested <video>   -> 'src' of its first nested <source>,
                                 falling back to the <video>'s own 'src'
      3. a direct <a> child   -> its 'href'
      4. any nested <audio>   -> its own 'src', falling back to the 'src'
                                 of its first nested <source>
    An attachment matching none of these is printed (serialised with
    ``tostring``) and contributes nothing to the result.

    Returns:
        A list of links (an entry is None when the chosen node lacks the
        attribute), or '' when element has no attachments at all.
    """
    attachments = element.find_class('attachment')
    if not attachments:
        # Kept as '' (not []) so callers see the same value as before.
        return ''

    links = []
    # A distinct loop name avoids shadowing the `element` parameter.
    for attachment in attachments:
        img = attachment.find('img')
        video_element = attachment.find('.//video')
        audio_element = attachment.find('.//audio')
        anchor = attachment.find('a')

        if img is not None:
            links.append(img.get('src'))
        elif video_element is not None:
            # A <video> may carry its URL directly instead of in a <source>
            # child; without this fallback the lookup raised AttributeError.
            link = _firstSourceSrc(video_element)
            if link is None:
                link = video_element.get('src')
            links.append(link)
        elif anchor is not None:
            links.append(anchor.get('href'))
        elif audio_element is not None:
            # Mirror of the video case: prefer the <audio>'s own 'src', then
            # look for a nested <source>.
            link = audio_element.get('src')
            if link is None:
                link = _firstSourceSrc(audio_element)
            links.append(link)
        else:
            # Unrecognised attachment markup: dump it for inspection.
            print(tostring(attachment))

    return links

def getReplyAnchor(element):
    """Return the 'href' of the reply link inside element.

    The first 'reply_anchor'-classed node under element is located with
    ``find_class``, and the 'href' of its direct <a> child is returned.

    Returns:
        The 'href' value (None if the <a> has no such attribute), or ''
        when there is no 'reply_anchor' node or it has no direct <a> child.
    """
    matches = element.find_class('reply_anchor')
    if not matches:
        return ''

    anchor = matches[0].find('a')
    if anchor is None:
        # A reply_anchor node without an <a> child used to raise
        # AttributeError; treat it like a missing reply anchor.
        return ''

    return anchor.get('href')

def getAppSent(element):
    """Return the text of the first 'app'-classed node under element.

    The lookup uses ``element.find_class('app')``. When nothing matches,
    an empty string is returned.
    """
    matches = element.find_class('app')
    return matches[0].text_content() if matches else ''

In [None]:
class Message:
    """Plain record holding the fields extracted for one message."""

    # Attribute names in the order toDict() emits them as keys.
    _FIELDS = (
        'timestamp',
        'sender',
        'message',
        'reaction',
        'edits',
        'attachmentLinks',
        'replyAnchor',
        'appSent',
    )

    def __init__(self, timestamp, sender, message, reaction, edits, attachmentLinks, replyAnchor, appSent):
        """Store each argument unchanged on the attribute of the same name."""
        self.timestamp, self.sender, self.message = timestamp, sender, message
        self.reaction, self.edits = reaction, edits
        self.attachmentLinks, self.replyAnchor = attachmentLinks, replyAnchor
        self.appSent = appSent

    def toDict(self):
        """Return a new dict mapping every field name to its current value."""
        return {field: getattr(self, field) for field in self._FIELDS}

# Parse All Messages

In [None]:
# Every node carrying the 'message' class becomes one Message record.
messages = html.find_class('message')

message_list = [
    Message(
        timestamp=getTimeStamp(node),
        sender=getSender(node),
        message=getMessage(node),
        reaction=getReaction(node),
        edits=getEditedElement(node),
        attachmentLinks=getAttachmentLinks(node),
        replyAnchor=getReplyAnchor(node),
        appSent=getAppSent(node),
    )
    for node in messages
]

# Load Into a DataFrame and Save to CSV

In [None]:
import pandas as pd
# One row per Message; the toDict() keys become the column names.
records = [entry.toDict() for entry in message_list]
df = pd.DataFrame(records)

In [None]:
# Write the frame to messages.csv, leaving out the index column.
df.to_csv(path_or_buf='messages.csv', index=False)