<a href="https://colab.research.google.com/github/HitheshJain2002/BERT_Calender_Integrate/blob/main/Email%20Priortization%20using%20BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from html.parser import HTMLParser
from email.header import Header, decode_header
import mailbox
import base64
import quopri
import re
import sys
import html2text



""" ____Format utils____ """

class MLStripper(HTMLParser):
    """MLStripper
    Collects the text content of an HTML document and discards the markup
    https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    """
    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no second call is needed
        super().__init__()
        # Text fragments in the order the parser reported them
        self.fed = []

    def handle_data(self, d):
        """Parser callback: remember one run of text found outside the tags"""
        self.fed.append(d)

    def get_data(self):
        """Return everything collected so far as one string"""
        return ''.join(self.fed)



def strip_tags(html):
    """
    Use MLStripper class to strip HTML from string

    :html: markup to clean
    Returns the text content of :html: with all tags removed
    """
    s = MLStripper()
    s.feed(html)
    # close() flushes text the parser is still buffering (for example a
    # trailing "AT&T", which looks like an unfinished character reference);
    # without it that text is silently lost.
    s.close()
    return s.get_data()


def strip_payload(payload):
    """
    Flatten :payload: onto one line: each carriage return and each line feed
    is swapped for a single space (so a CR+LF pair becomes two spaces)
    """
    without_carriage_returns = payload.replace('\r', ' ')
    return without_carriage_returns.replace('\n', ' ')


def encoded_words_to_text(encoded_words):
    """
    Decode one RFC 2047 encoded word ('=?charset?B|Q?text?=') into text

    Not used, left for reference only
    https://dmorgan.info/posts/encoded-word-syntax/

    :encoded_words: string that starts with an encoded word
    Returns the decoded text of the first encoded word
    Raises ValueError when :encoded_words: does not start with an encoded word
    """
    # '[BQ]' instead of '[B|Q]' (which also accepted a literal '|'), and
    # '[^?]' classes so every field stops at its own '?' delimiter instead of
    # greedily running into a following encoded word.
    encoded_word_regex = r'=\?([^?]+)\?([BQ])\?([^?]*)\?='
    match = re.match(encoded_word_regex, encoded_words, re.IGNORECASE)
    if match is None:
        raise ValueError('Not an encoded word: {!r}'.format(encoded_words))
    charset, encoding, encoded_text = match.groups()
    if encoding.upper() == 'B':
        byte_string = base64.b64decode(encoded_text)
    else:
        # header=True applies the header flavour of quoted-printable, in which
        # '_' stands for a space
        byte_string = quopri.decodestring(encoded_text, header=True)
    return byte_string.decode(charset)



""" ____Custom Message class____ """

class CustomMessage():
    """
    The CustomMessage class represents an email message with three fields:
    - :body:
    - :subject:
    - :content_type: (document, plain text, HTML, image...)
    """
    def __init__(self, body, subject, content_type):
        """
        Constructor
        Decodes the subject (RFC 2047 encoded words in any charset) and
        decodes the body based on the content type

        :body: raw payload, normally the bytes from get_payload(decode=True)
        :subject: str, email.header.Header or None
        :content_type: MIME type string such as 'text/plain'
        """
        self.content_type = content_type

        decoded_subject = self._decode_subject(subject)
        # An empty subject needs no line-break clean-up
        self.subject = strip_payload(decoded_subject) if decoded_subject else ''

        self.body = self._decode_body(body, content_type)

    @staticmethod
    def _decode_chunk(raw, charset):
        """
        Turn one (value, charset) pair produced by decode_header into str
        """
        if isinstance(raw, str):
            # decode_header returns str when the header has no encoded words
            return raw
        # Try the declared charset first, then UTF-8; unknown or wrong
        # charsets fall through to latin-1, which accepts any byte sequence
        for candidate in (charset, 'utf-8'):
            if candidate is None:
                continue
            try:
                return raw.decode(candidate)
            except (LookupError, UnicodeDecodeError):
                continue
        return raw.decode('latin-1')

    @classmethod
    def _decode_subject(cls, subject):
        """
        Return :subject: as plain text ('' when there is no subject)
        """
        from email.errors import HeaderParseError

        if subject is None:
            # Empty subject
            return ''
        if isinstance(subject, Header):
            # Join every chunk of the Header, not only the first one
            subject = ''.join(cls._decode_chunk(raw, charset)
                              for raw, charset in decode_header(subject))
        try:
            # A subject can mix several parts encoded in different charsets;
            # a subject without encoded words comes back unchanged
            chunks = decode_header(subject)
        except HeaderParseError:
            # Malformed encoded word: keep the raw text instead of failing
            return subject
        return ''.join(cls._decode_chunk(raw, charset) for raw, charset in chunks)

    @staticmethod
    def _decode_body(body, content_type):
        """
        Return the body as cleaned text for text parts, unchanged otherwise
        """
        if 'text' not in content_type:
            # If not text, return the body as it is
            return body

        if body is None:
            # A text part without payload is treated as empty text
            body = b''
        if isinstance(body, bytes):
            try:
                decoded_body = body.decode('utf-8')
            except UnicodeDecodeError:
                decoded_body = body.decode('latin-1')
        else:
            # Already text, nothing to decode
            decoded_body = body

        if 'html' in content_type:
            # If it is an HTML message, remove HTML tags
            h = html2text.HTML2Text()

            h.ignore_links = True
            h.ignore_tables = True
            h.ignore_images = True
            h.ignore_anchors = True
            h.ignore_emphasis = True

            decoded_body = h.handle(decoded_body)
        return strip_payload(decoded_body)

    def __str__(self):
        body_length = 2000
        printed_body = self.body[:body_length]
        if 'text' in self.content_type:
            # Shorten long message bodies
            if len(self.body) > body_length:
                printed_body += "..."
        return " ---- Custom Message ---- \n  -- Content Type: {}\n  -- Subject: {}\n  -- Body --\n{}\n\n".format(self.content_type, self.subject, printed_body)

    def get_body(self):
        return self.body

    def get_subject(self):
        return self.subject

    def get_content_type(self):
        return self.content_type

    def create_vector_line(self, label):
        """
        Creates a CSV line with the message's body and given :label:
        Removes any commas from body and label
        """
        return '{body},{label}'.format(body=self.body.replace(',', ''),
                                       label=str(label).replace(',', ''))

    @staticmethod
    def extract_types_from_messages(messages):
        """
        Takes a list of CustomMessage and extracts all the existing values for content_type
        ['application/ics', 'application/octet-stream', 'application/pdf', 'image/gif', 'image/jpeg',
        'image/png', 'text/calendar', 'text/html', 'text/plain', 'text/x-amp-html']
        """
        return sorted({m.get_content_type() for m in messages})



""" ____Extraction utils____ """

def extract_message_payload(mes, parent_subject=None):
    """
    Extracts recursively the payload of the messages contained in :mes:
    When a message is embedded in another, it uses the parameter :parent_subject:
    to set the subject properly (it uses the parent's subject)

    :mes: an email message object
    :parent_subject: subject inherited from the enclosing message, if any
    Returns a list of CustomMessage, one per non-multipart part
    """
    # The enclosing message's subject wins; otherwise use the message's own.
    # Without this fallback a top-level single-part message lost its subject.
    if parent_subject is None:
        subject = mes.get('Subject')
    else:
        subject = parent_subject

    if not mes.is_multipart():
        return [CustomMessage(mes.get_payload(decode=True), subject, mes.get_content_type())]

    extracted_messages = []
    for part in mes.get_payload():
        extracted_messages.extend(extract_message_payload(part, subject))
    return extracted_messages


def text_messages_to_string(mes):
    """
    Concatenates the string form of every text message found in :mes:
    Messages whose content type does not start with 'text' (images,
    documents...) are left out.
    :mes: should be a list of CustomMessage objects.
    """
    text_parts = (str(m) for m in mes if m.get_content_type().startswith('text'))
    return ''.join(text_parts)


def create_classification_line(mes, label):
    """
    Creates CSV line(s) with four columns: the email's body extracted from
    :mes:, the sender, the date and the classification (:label:)
    Ignores images, documents and calendar messages.
    :mes: should be a list of CustomMessage objects.

    Commas are removed from every field so each line has exactly four columns.
    Messages without a `from_address` or `date` attribute get an empty column.
    """
    def _field(value):
        # A comma inside a value would add an extra column to the row
        return str(value).replace(',', '')

    rows = []
    for m in mes:
        content_type = m.get_content_type()
        if not content_type.startswith('text') or content_type == 'text/calendar':
            continue
        # The class CustomMessage in this file defines neither attribute, so
        # read them defensively instead of raising AttributeError
        sender = getattr(m, 'from_address', '')
        date = getattr(m, 'date', '')
        rows.append('{},{},{},{}\n'.format(
            _field(m.get_body()), _field(sender), _field(date), _field(label)))
    return ''.join(rows)


def to_file(text, file):
    """
    Writes :text: to :file:, replacing any previous content

    :text: string to write
    :file: path of the file to create or overwrite
    """
    # The context manager closes the file even if write() raises; the former
    # `f.close` lacked the call parentheses and so never closed it.
    with open(file, 'w') as f:
        f.write(text)


def extract_mbox_file(file):
    """
    Extracts all the messages included in an mbox :file:
    by calling extract_message_payload

    :file: path of the mbox archive
    Returns one flat list with the messages extracted from every email
    """
    mbox = mailbox.mbox(file)
    messages = []
    try:
        for message in mbox:
            messages.extend(extract_message_payload(message))
    finally:
        # Release the file handle held by the mailbox object
        mbox.close()
    return messages



if __name__ == '__main__':
# Script entry point: extracts every message of one mbox file into `messages`.
# The commented-out lines below read the mbox path from the command line.
#     argv = sys.argv
#     if len(argv) != 2:
#         print('Invalid arguments')
#     else:
#         file = argv[1]
        # The mbox path is currently hard-coded instead
        file='testt.mbox'
        messages = extract_mbox_file(file)


# # Call to create a CSV file with the extracted data (body + label)
# to_file(create_classification_line(messages, 'label'), file + '_features.csv')
# # Call to export all the extracted data
# to_file(text_messages_to_string(messages), file + '_full_extract')


In [None]:
# Write one CSV line per text message, tagged with the placeholder label
# 'label', to '<file>_featureees.csv'
to_file(create_classification_line(messages, 'label'), file + '_featureees.csv')

In [None]:
# IPython magic: run the script dede.py with 'testt.mbox' as its argument
%run dede.py testt.mbox

Extracting messages:   1%|▏                    | 8/1114 [00:00<00:14, 77.91it/s]

yes
yes
yes
yes
yes
yes
yes
yes


Extracting messages:   1%|▎                   | 16/1114 [00:00<00:33, 32.79it/s]

yes
yes
yes


Extracting messages:   4%|▊                   | 46/1114 [00:01<00:21, 49.72it/s]

yes
yes


Extracting messages:   6%|█▎                  | 71/1114 [00:01<00:15, 69.15it/s]

yes
yes


Extracting messages:   7%|█▍                  | 80/1114 [00:02<00:32, 32.21it/s]

yes


Extracting messages:   8%|█▌                  | 87/1114 [00:02<00:50, 20.24it/s]

yes
yes
yes


Extracting messages:  11%|██                 | 118/1114 [00:03<00:30, 33.00it/s]

yes


Extracting messages:  14%|██▌                | 151/1114 [00:04<00:19, 50.66it/s]

yes


Extracting messages:  16%|███                | 182/1114 [00:04<00:14, 62.43it/s]

yes
yes
yes
yes
yes
yes


Extracting messages:  19%|███▌               | 207/1114 [00:05<00:13, 68.70it/s]

yes


Extracting messages:  19%|███▋               | 215/1114 [00:05<00:16, 55.87it/s]

yes
yes


Extracting messages:  22%|████               | 241/1114 [00:05<00:16, 53.31it/s]

yes


Extracting messages:  22%|████▏              | 249/1114 [00:06<00:15, 55.66it/s]

yes
yes


Extracting messages:  24%|████▍              | 263/1114 [00:06<00:23, 36.64it/s]

yes


Extracting messages:  25%|████▋              | 275/1114 [00:07<00:34, 24.11it/s]

yes
yes


Extracting messages:  25%|████▊              | 283/1114 [00:07<00:33, 24.80it/s]

yes


Extracting messages:  27%|█████              | 299/1114 [00:09<00:41, 19.80it/s]

yes


Extracting messages:  30%|█████▋             | 334/1114 [00:09<00:22, 34.56it/s]

yes


Extracting messages:  36%|██████▊            | 402/1114 [00:10<00:10, 68.62it/s]

yes
yes
yes
yes
yes


Extracting messages:  38%|███████▎           | 427/1114 [00:11<00:13, 52.29it/s]

yes
yes
yes
yes
yes


Extracting messages:  41%|███████▋           | 454/1114 [00:11<00:11, 56.34it/s]

yes
yes


Extracting messages:  41%|███████▊           | 460/1114 [00:11<00:11, 56.31it/s]

yes


Extracting messages:  43%|████████           | 474/1114 [00:12<00:15, 42.37it/s]

yes
yes
yes
yes


Extracting messages:  43%|████████▏          | 482/1114 [00:12<00:20, 31.05it/s]

yes


Extracting messages:  48%|█████████          | 534/1114 [00:14<00:14, 38.67it/s]

yes


Extracting messages:  49%|█████████▏         | 541/1114 [00:14<00:19, 28.78it/s]

yes


Extracting messages:  49%|█████████▎         | 545/1114 [00:15<00:29, 19.18it/s]

yes


Extracting messages:  51%|█████████▋         | 568/1114 [00:16<00:18, 29.18it/s]

yes
yes
yes


Extracting messages:  52%|█████████▉         | 580/1114 [00:16<00:18, 28.56it/s]

yes
yes
yes


Extracting messages:  54%|██████████▏        | 596/1114 [00:17<00:17, 30.31it/s]

yes


Extracting messages:  54%|██████████▎        | 606/1114 [00:17<00:22, 22.72it/s]

yes


Extracting messages:  55%|██████████▍        | 610/1114 [00:17<00:20, 24.27it/s]

yes
yes
yes


Extracting messages:  57%|██████████▉        | 639/1114 [00:18<00:13, 35.65it/s]

yes
yes
yes


Extracting messages:  60%|███████████▍       | 668/1114 [00:19<00:06, 69.52it/s]

yes
yes


Extracting messages:  62%|███████████▊       | 691/1114 [00:19<00:06, 70.42it/s]

yes


Extracting messages:  63%|███████████▉       | 701/1114 [00:19<00:07, 52.32it/s]

yes


Extracting messages:  66%|████████████▌      | 737/1114 [00:20<00:05, 69.55it/s]

yes
yes
yes
yes


Extracting messages:  67%|████████████▋      | 746/1114 [00:20<00:06, 52.95it/s]

yes
yes


Extracting messages:  68%|████████████▉      | 759/1114 [00:21<00:15, 22.70it/s]

yes
yes


Extracting messages:  69%|█████████████      | 768/1114 [00:22<00:15, 22.42it/s]

yes


Extracting messages:  73%|█████████████▊     | 809/1114 [00:22<00:05, 59.95it/s]

yes
yes
yes
yes
yes


Extracting messages:  75%|██████████████▏    | 834/1114 [00:22<00:03, 74.49it/s]

yes
yes
yes


Extracting messages:  77%|██████████████▋    | 861/1114 [00:23<00:02, 96.06it/s]

yes
yes
yes


Extracting messages:  79%|███████████████    | 883/1114 [00:23<00:03, 72.43it/s]

yes
yes
yes
yes


Extracting messages:  80%|███████████████▎   | 896/1114 [00:23<00:02, 80.23it/s]

yes
yes


Extracting messages:  82%|███████████████▌   | 914/1114 [00:23<00:03, 59.63it/s]

yes
yes
yes


Extracting messages:  86%|████████████████▎  | 954/1114 [00:24<00:02, 58.46it/s]

yes
yes
yes


Extracting messages:  88%|████████████████▋  | 975/1114 [00:26<00:04, 29.05it/s]

yes
yes


Extracting messages:  90%|████████████████▏ | 1002/1114 [00:26<00:02, 44.60it/s]

yes
yes
yes
yes
yes
yes


Extracting messages:  91%|████████████████▎ | 1012/1114 [00:26<00:01, 51.34it/s]

yes
yes
yes
yes


Extracting messages:  95%|█████████████████▏| 1061/1114 [00:27<00:01, 52.64it/s]

yes
yes
yes
yes
yes
yes
yes
yes
yes


Extracting messages:  98%|█████████████████▌| 1090/1114 [00:28<00:00, 63.52it/s]

yes


Extracting messages: 100%|██████████████████| 1114/1114 [00:28<00:00, 38.65it/s]
Reconstructing threads: 100%|█████████████| 132/132 [00:00<00:00, 625591.11it/s]
Extracting email bodies: 100%|█████████████| 128/128 [00:00<00:00, 57193.02it/s]

                                            ThreadID  \
0  <CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DBga...   
1  <0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2bd...   
2   <be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit.in>   
3  <CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVHNV...   
4  <CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYMU...   

                                            Messages  \
0  [<CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DBg...   
1  [<0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2b...   
2  [<be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit....   
3  [<CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVHN...   
4  [<CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYM...   

                        From                             Date  \
0        placement@nmamit.in  Mon, 18 Mar 2024 12:10:10 +0530   
1  updates@academia-mail.com   Wed, 3 Apr 2024 00:28:09 +0000   
2         circular@nmamit.in  Thu, 21 Mar 2024 12:44:07 +0530   
3        placement@nmamit.in  Fri, 22 Mar 2024 16:33:34 +0530   
4




In [None]:
# Scratch cell: prints the literal 6
print(6)

In [None]:
# IPython magic: run the script mail.py with 'testt.mbox' as its argument
%run mail.py testt.mbox

In [None]:
import pandas as pd
import mailbox
from email import message_from_string
from tqdm import tqdm
import re
import numpy as np
from bs4 import BeautifulSoup
# Load mbox file
from email.utils import parseaddr

mbox_file_path = '/Users/atheethpai/Desktop/ml/testt.mbox'
mbox = mailbox.mbox(mbox_file_path)

# Extract messages and their IDs
messages = {}
select_date = '2023-04-03'  # Replace with your desired date
select_date = pd.to_datetime(select_date).tz_localize('UTC')  # Convert select_date to UTC

for message in tqdm(mbox, desc='Extracting messages', total=len(mbox)):
    message_id = message.get('Message-ID')
    # Normalise every date to UTC so it can be compared with select_date
    message_date = pd.to_datetime(message['Date'])
    if message_date.tzinfo is not None:
        message_date = message_date.tz_convert('UTC')
    else:
        message_date = message_date.tz_localize('UTC')

    # Only messages newer than select_date are kept
    if message_date > select_date:
        # parseaddr handles 'Name <addr>', a bare 'addr' and a missing header;
        # None is stored when no address can be found
        from_email = parseaddr(str(message['From'] or ''))[1] or None
        messages[message_id] = {
            'Subject': message['Subject'],
            'From': from_email,
            'To': message['To'],
            'Date': message['Date'],
            # The whole message object is stored here, not just its payload
            'Body': message,
            'References': message.get('References', '').split(),
            'In-Reply-To': message.get('In-Reply-To', '')
        }


# Reconstruct conversation threads
threads = {}
for message_id, message_data in tqdm(messages.items(), desc='Reconstructing threads'):
    # Follow the In-Reply-To chain up to the oldest ancestor that is present
    # in `messages`; that ancestor's id names the thread.
    thread_id = message_id
    # Ids already visited: stops the walk when a message replies to itself or
    # the headers form a cycle, which previously looped forever
    seen = {thread_id}
    parent_id = message_data['In-Reply-To']
    while parent_id in messages and parent_id not in seen:
        seen.add(parent_id)
        thread_id = parent_id
        parent_id = messages[thread_id]['In-Reply-To']

    if thread_id not in threads:
        threads[thread_id] = {
            'ThreadID': thread_id,
            'Messages': []
        }

    threads[thread_id]['Messages'].append(message_id)


""" ____Custom Message class____ """

class CustomMessage():
    """
    The CustomMessage class represents an email message with three fields:
    - :body:
    - :subject:
    - :content_type: (document, plain text, HTML, image...)
    """
    def __init__(self, body, subject, content_type):
        """
        Constructor
        Decodes the subject (RFC 2047 encoded words in any charset) and
        decodes the body based on the content type

        :body: raw payload, normally the bytes from get_payload(decode=True)
        :subject: str, email.header.Header or None
        :content_type: MIME type string such as 'text/plain'
        """
        self.content_type = content_type

        decoded_subject = self._decode_subject(subject)
        # An empty subject needs no line-break clean-up
        self.subject = strip_payload(decoded_subject) if decoded_subject else ''

        self.body = self._decode_body(body, content_type)

    @staticmethod
    def _decode_chunk(raw, charset):
        """
        Turn one (value, charset) pair produced by decode_header into str
        """
        if isinstance(raw, str):
            # decode_header returns str when the header has no encoded words
            return raw
        # Try the declared charset first, then UTF-8; unknown or wrong
        # charsets fall through to latin-1, which accepts any byte sequence
        for candidate in (charset, 'utf-8'):
            if candidate is None:
                continue
            try:
                return raw.decode(candidate)
            except (LookupError, UnicodeDecodeError):
                continue
        return raw.decode('latin-1')

    @classmethod
    def _decode_subject(cls, subject):
        """
        Return :subject: as plain text ('' when there is no subject)
        """
        from email.errors import HeaderParseError

        if subject is None:
            # Empty subject
            return ''
        if isinstance(subject, Header):
            # Join every chunk of the Header, not only the first one
            subject = ''.join(cls._decode_chunk(raw, charset)
                              for raw, charset in decode_header(subject))
        try:
            # A subject can mix several parts encoded in different charsets;
            # a subject without encoded words comes back unchanged
            chunks = decode_header(subject)
        except HeaderParseError:
            # Malformed encoded word: keep the raw text instead of failing
            return subject
        return ''.join(cls._decode_chunk(raw, charset) for raw, charset in chunks)

    @staticmethod
    def _decode_body(body, content_type):
        """
        Return the body as cleaned text for text parts, unchanged otherwise
        """
        if 'text' not in content_type:
            # If not text, return the body as it is
            return body

        if body is None:
            # A text part without payload is treated as empty text
            body = b''
        if isinstance(body, bytes):
            try:
                decoded_body = body.decode('utf-8')
            except UnicodeDecodeError:
                decoded_body = body.decode('latin-1')
        else:
            # Already text, nothing to decode
            decoded_body = body

        if 'html' in content_type:
            # If it is an HTML message, remove HTML tags
            h = html2text.HTML2Text()

            h.ignore_links = True
            h.ignore_tables = True
            h.ignore_images = True
            h.ignore_anchors = True
            h.ignore_emphasis = True

            decoded_body = h.handle(decoded_body)
        return strip_payload(decoded_body)

    def __str__(self):
        body_length = 2000
        printed_body = self.body[:body_length]
        if 'text' in self.content_type:
            # Shorten long message bodies
            if len(self.body) > body_length:
                printed_body += "..."
        return " ---- Custom Message ---- \n  -- Content Type: {}\n  -- Subject: {}\n  -- Body --\n{}\n\n".format(self.content_type, self.subject, printed_body)

    def get_body(self):
        return self.body

    def get_subject(self):
        return self.subject

    def get_content_type(self):
        return self.content_type

    def create_vector_line(self, label):
        """
        Creates a CSV line with the message's body and given :label:
        Removes any commas from body and label
        """
        return '{body},{label}'.format(body=self.body.replace(',', ''),
                                       label=str(label).replace(',', ''))

    @staticmethod
    def extract_types_from_messages(messages):
        """
        Takes a list of CustomMessage and extracts all the existing values for content_type
        ['application/ics', 'application/octet-stream', 'application/pdf', 'image/gif', 'image/jpeg',
        'image/png', 'text/calendar', 'text/html', 'text/plain', 'text/x-amp-html']
        """
        return sorted({m.get_content_type() for m in messages})



""" ____Extraction utils____ """

# def extract_message_payload(mes, parent_subject=None):
#     """
#     Extracts recursively the payload of the messages contained in :mes:
#     When a message is embedded in another, it uses the parameter :parent_subject:
#     to set the subject properly (it uses the parent's subject)
#     """

#     if mes.is_multipart():
#         if parent_subject is None:
#             subject_for_child = mes.get('Subject')
#         else:
#             subject_for_child = parent_subject
#         for part in mes.get_payload():
#             extracted_messages.extend(extract_message_payload(part, subject_for_child))
#     else:
#         extracted_messages.append(CustomMessage(mes.get_payload(decode=True), parent_subject,  mes.get_content_type()))
#     return extracted_messages


def create_classification_line(mes):
    """
    Joins the bodies of the text messages in :mes: into one string,
    each body followed by a newline.
    Parts that are not text (images, documents) and calendar parts are skipped.
    :mes: should be a list of CustomMessage objects.
    """
    def wanted(content_type):
        # Keep text/* parts, except calendar invitations
        return content_type.startswith('text') and content_type != 'text/calendar'

    return ''.join(m.get_body() + '\n' for m in mes if wanted(m.get_content_type()))


def extract_message_payload(mes, parent_subject=None):
    """
    Extracts recursively the payload of the messages contained in :mes:
    When a message is embedded in another, it uses the parameter :parent_subject:
    to set the subject properly (it uses the parent's subject)

    :mes: an email message object, or a list of them
    :parent_subject: subject inherited from the enclosing message, if any
    Returns a list of CustomMessage, one per non-multipart part
    """
    extracted_messages = []

    if isinstance(mes, list):
        for part in mes:
            extracted_messages.extend(extract_message_payload(part, parent_subject))
        return extracted_messages

    # The enclosing message's subject wins; otherwise use the message's own.
    # Without this fallback a top-level single-part message lost its subject.
    if parent_subject is None:
        subject = mes.get('Subject')
    else:
        subject = parent_subject

    if mes.is_multipart():
        for part in mes.get_payload():
            extracted_messages.extend(extract_message_payload(part, subject))
    else:
        extracted_messages.append(CustomMessage(mes.get_payload(decode=True), subject, mes.get_content_type()))

    return extracted_messages

def clean_text(text):
    """
    Removes repeated words from :text:

    The text is split on whitespace; only the first occurrence of each word
    is kept, in its original position, and the survivors are joined with
    single spaces.
    """
    # dict.fromkeys keeps first-seen order and de-duplicates in linear time,
    # replacing the quadratic `word not in list` scan
    return ' '.join(dict.fromkeys(text.split()))


# Extract email bodies and concatenate them within threads.
# For every thread, each message is split into its parts, the text parts are
# joined, repeated words are removed, and the per-message results are stored
# newline-separated under the thread's 'email_body' key.
for thread_id, thread_data in tqdm(threads.items(), desc='Extracting email bodies'):
    email_bodies = []
    # NOTE(review): extracted_messages and date are assigned but never read in this loop
    extracted_messages=[]
    for msg_id in thread_data['Messages']:
        message = messages[msg_id]
        date = message['Date']
        # 'Body' holds the message object itself, so it is split into parts here
        body=extract_message_payload(message['Body'])
        body1=create_classification_line(body)
        body2=clean_text(body1)


#         body = body.replace('\r', '').replace('\n', '')
        email_bodies.append(body2)


    thread_data['email_body'] = '\n'.join(email_bodies)


# Convert threads to DataFrame
# thread_list = [thread for thread_id, thread in threads.items()]



# Build one row per thread; 'From' and 'Date' are taken from the first
# message id listed for the thread.
thread_list = []
for thread_id, thread_data in threads.items():
    thread = {
        'ThreadID': thread_id,
        'Messages': thread_data['Messages'],
        'From': messages[thread_data['Messages'][0]]['From'],
        'Date': messages[thread_data['Messages'][0]]['Date'],
        # 'From' and 'Date' above come from the first message of the thread
        'email_body': thread_data['email_body']
    }
    thread_list.append(thread)

df = pd.DataFrame(thread_list)
# df.to_csv('mamasaid2.csv', index=False)

# Display the DataFrame
print(df.head())

# Counts rows whose email_body is the empty string (the printed text calls
# them "null values in column 'A'")
total_null_in_column_A = (df['email_body']=='').sum()
print(f"Total null values in column 'A': {total_null_in_column_A}")

# Calculate total rows in a DataFrame
total_rows = len(df)
print(f"Total rows in DataFrame: {total_rows}")


Extracting messages: 100%|██████████████████| 1114/1114 [00:30<00:00, 36.97it/s]
Reconstructing threads: 100%|█████████████| 692/692 [00:00<00:00, 643846.13it/s]
Extracting email bodies: 100%|████████████████| 655/655 [00:10<00:00, 60.15it/s]

                                            ThreadID  \
0  <CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DBga...   
1  <0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2bd...   
2   <be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit.in>   
3  <CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVHNV...   
4  <CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYMU...   

                                            Messages  \
0  [<CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DBg...   
1  [<0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2b...   
2  [<be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit....   
3  [<CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVHN...   
4  [<CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYM...   

                        From                             Date  \
0        placement@nmamit.in  Mon, 18 Mar 2024 12:10:10 +0530   
1  updates@academia-mail.com   Wed, 3 Apr 2024 00:28:09 +0000   
2         circular@nmamit.in  Thu, 21 Mar 2024 12:44:07 +0530   
3        placement@nmamit.in  Fri, 22 Mar 2024 16:33:34 +0530   
4




In [None]:
# Show the first ten rows of the thread DataFrame
print(df.head(10))


                                            ThreadID  \
0  <CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DBga...   
1  <0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2bd...   
2   <be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit.in>   
3  <CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVHNV...   
4  <CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYMU...   
5  <CAMcJbs=NYxqB_E1PbGojwCjrEq3DHW4cjHFWM6aWdT-m...   
6  <CABDpMfH=ORoOVgQ6X5rPXzHJ0JW9uE68QAWT_BoOKTzQ...   
7  <CAB09PVxoPnAuddbyLBK87Rn7HLddFj85p515Rp+rVtyA...   
8  <CAOgqO=HU07VChrcT7+MnQP=J4Nto7Zb1a9NHfCjywWOi...   
9  <CALvxK0m95VXeu0c=FpzD7Xri9Z_t43rRqr-AETh+QyOw...   

                                            Messages  \
0  [<CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DBg...   
1  [<0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2b...   
2  [<be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit....   
3  [<CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVHN...   
4  [<CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYM...   
5  [<CAMcJbs=NYxqB_E1PbGojwCjrEq3DHW4cjHFWM6aWd

In [None]:
# Keep only the first row for each distinct email_body; df itself is not modified
data_no_duplicates = df.drop_duplicates(subset=['email_body'])

print("\nDataFrame after dropping duplicates:")
print(data_no_duplicates['email_body'])
# Save the de-duplicated rows without the index column
data_no_duplicates.to_csv('mamasaidd.csv', index=False)


DataFrame after dropping duplicates:
0     You read the paper <a href="https://www.academ...
1     Dear students,    Warm Greetings from Team Abh...
2                                              \n    \n
3     Greetings!  Karnataka's BIGGEST tech event of ...
4     *Hello Aspirants,*    *Greetings from Eduvetha...
5     Dear Students,    Warm Greetings from Team Abh...
6     Dear Students,    Indian Oil presents SUSTAIN-...
7     Dear Students,    Join us at the Sambhram Audi...
8     You read the paper <a href="https://www.academ...
9     Dear Students,    The Soft Skills and Communic...
10    \nLog in with your student email id.     This ...
11                                        Hello Athe...
12    \nRegister now & get a chance to win..        ...
13    Dear students,    Warm Greetings from Team Abh...
14      ---------- Forwarded message ---------  From...
15    \nUnlock with your work email id.     **T&C; A...
16                                       Hi Atheeth ...
17    Dear

In [None]:
# Counts rows whose email_body is exactly one space (the printed text calls
# them "null values in column 'A'")
total_null_in_column_A = (df['email_body']==' ').sum()
print(f"Total null values in column 'A': {total_null_in_column_A}")

# Calculate total rows in a DataFrame
total_rows = len(df)
print(f"Total rows in DataFrame: {total_rows}")


# row_with_date = df[df['Date'] == 'Sat, 02 Mar 2024 00:26:07 +0530']
# print(row_with_date)
from dateutil import parser
# Replace the raw 'Date' header strings with parsed datetime objects
df['Date'] = df['Date'].apply(parser.parse)

# Filter the row with the specified date.
# The target is parsed first: when the column holds datetime objects with
# mixed UTC offsets (object dtype), comparing it with a plain string matches
# nothing and always yields an empty DataFrame.
target_date = parser.parse('2024-03-02 00:26:07+05:30')
row_with_date = df[df['Date'] == target_date]
print(row_with_date)

Total null values in column 'A': 24
Total rows in DataFrame: 683
Empty DataFrame
Columns: [ThreadID, Messages, From, Date, email_body]
Index: []


In [None]:
# Save the current DataFrame, without the index column, to maaildata3.csv
df.to_csv('maaildata3.csv', index=False)

In [None]:
# Drop rows whose email_body is exactly one space
df = df[df['email_body'] != ' ']

# Counts rows whose email_body is the empty string (reported as "null values")
total_null_in_column_A = (df['email_body']=='').sum()
print(f"Total null values in column 'A': {total_null_in_column_A}")

# Calculate total rows in a DataFrame
total_rows = len(df)
print(f"Total rows in DataFrame: {total_rows}")



Total null values in column 'A': 1
Total rows in DataFrame: 45


In [None]:
# Save the filtered DataFrame, without the index column, to maaildata4.csv
df.to_csv('maaildata4.csv', index=False)

In [None]:
# Series.count() counts the non-missing entries of email_body
count_a = df['email_body'].count()
print("Number of entries in column 'A':", count_a)


Number of entries in column 'A': 45


In [None]:
# Report the number of empty-string bodies and the row count before cleaning
total_null_in_column_A = (df['email_body']=='').sum()
print(f"Total null values in column 'A': {total_null_in_column_A}")

# Calculate total rows in a DataFrame
total_rows = len(df)
print(f"Total rows in DataFrame: {total_rows}")

# Turn empty strings (in any column) into NaN, then drop the rows whose
# email_body is NaN; both operations modify df in place
df.replace('', np.nan, inplace=True)
df.dropna(subset=['email_body'], inplace=True)

# Show the modified DataFrame
print(df)
# Same two figures again, after cleaning
total_null_in_column_A = (df['email_body']=='').sum()
print(f"Total null values in column 'A': {total_null_in_column_A}")

# Calculate total rows in a DataFrame
total_rows = len(df)
print(f"Total rows in DataFrame: {total_rows}")



Total null values in column 'A': 0
Total rows in DataFrame: 44
                                             ThreadID  \
0   <0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2bd...   
1   <CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYMU...   
3   <CAOgqO=HU07VChrcT7+MnQP=J4Nto7Zb1a9NHfCjywWOi...   
4   <CAE70aBDBV59kLaVGS2hYsnh1JX9EAOZ5VMUWoe2XxunQ...   
5   <CADy1pOYyBpWM=yNqi=GMY_sCXYCpbXjBw_8MRsSZhK-n...   
6    <7f8791ff-ec5d-429d-8afe-81ef8626992d@nmamit.in>   
7    <30f2ac00-35ff-4608-800a-c2dfab7aa2c1@nmamit.in>   
8   <0100018ec0415075-5170e77a-d3ee-457c-84e3-2b65...   
9   <CAHgU1Z1vO-NUtjMYWhLmkhoT1MQPD_xMpPE8nTyWva7g...   
10  <0100018ec2884a63-33ca3c39-738c-483c-a6cc-22cd...   
11  <010e018ed8f8a0f6-87382f0f-a9d0-41aa-b10f-4f6a...   
12  <0100018eaa1e5f4f-21b7ce20-4cb5-42f2-8129-b4db...   
13  <CADy1pObP1C35YCXCQgpafQPmK0eAigxxLdJ_mZtDOXPg...   
14   <ab23254d-76ba-4298-a530-1854280de713@nmamit.in>   
15  <0100018ec7db3589-09f12514-9abf-4376-b413-26e7...   
16  <010e018f1c06a273-7d5

In [None]:
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset
import tensorflow as tf

In [None]:
# Load the pretrained "bert-base-uncased" checkpoint through the TensorFlow auto-model class
model=TFAutoModel.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Load the tokenizer published with the "bert-base-uncased" checkpoint
tokenizer=AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import pandas as pd

In [None]:
# Read the mail export; later cells feed its 'email_body' column to BERT.
data=pd.read_csv('/content/sample_data/maildataimp.csv')


In [None]:
import torch

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:

# Tokenizer plus a BERT sequence-classification model with a 3-way output head,
# both created from the same pretrained checkpoint.
checkpoint_name = 'bert-base-uncased'
tokenizer1 = BertTokenizer.from_pretrained(checkpoint_name)
model1 = BertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=3,  # Assuming 3 priority levels
)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Move the classifier's weights to the GPU (needs a CUDA-enabled runtime).
model1=model1.to('cuda')

In [None]:
# Read the mail export and keep only its first 300 rows as the working frame.
data = pd.read_csv('/content/sample_data/maildataimp.csv')
df = pd.DataFrame(data).iloc[:300]
df.head()

Unnamed: 0,ThreadID,Messages,From,Date,email_body
0,<CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DBga...,['<CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DB...,placement@nmamit.in,"Mon, 18 Mar 2024 12:10:10 +0530","Dear students, Warm greetings from Team Abhyud..."
1,<0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2bd...,['<0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2...,updates@academia-mail.com,"Wed, 3 Apr 2024 00:28:09 +0000","You read the paper <a href=""https://www.academ..."
2,<be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit.in>,['<be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit...,circular@nmamit.in,"Thu, 21 Mar 2024 12:44:07 +0530","Dear all, Greetings from the Department of Lib..."
3,<CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVHNV...,['<CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVH...,placement@nmamit.in,"Fri, 22 Mar 2024 16:33:34 +0530","Dear students, Warm Greetings from Team Abhyud..."
4,<CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYMU...,['<CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FY...,placement@nmamit.in,"Wed, 17 Apr 2024 14:01:00 +0530","Dear students, Warm Greetings from Team Abhyud..."


In [None]:
# Report how many emails are in the working frame.
row_count = df.shape[0]
print("Number of rows:", row_count)

Number of rows: 300


In [None]:
def _encode_body(body):
    """Turn one email body into BERT token ids (special tokens added, at most 512 ids)."""
    return tokenizer1.encode(body, add_special_tokens=True, max_length=512, truncation=True)


tokenized_texts = df['email_body'].apply(_encode_body)

In [None]:
# Right-pad every id list with the tokenizer's pad id up to the longest sequence
max_len = max(len(token_ids) for token_ids in tokenized_texts)
padded_tokenized_texts = [
    token_ids + [tokenizer1.pad_token_id] * (max_len - len(token_ids))
    for token_ids in tokenized_texts
]

In [None]:
# Build the id tensor, then mark real tokens with 1 and padding positions with 0
input_ids = torch.tensor(padded_tokenized_texts)
attention_masks = input_ids.ne(tokenizer1.pad_token_id).long()

In [None]:
# Move the token ids and the attention mask to the GPU.
input_ids = input_ids.to('cuda')
attention_masks = attention_masks.to('cuda')

In [None]:
# Sanity check: print the device of both input tensors and of the model.
print(input_ids.device)
print(attention_masks.device)
print(model1.device)


cuda:0
cuda:0
cuda:0


In [None]:
# Keyword arguments for the model's forward call
inputs = dict(input_ids=input_ids, attention_mask=attention_masks)

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before the forward pass.
torch.cuda.empty_cache()

In [None]:
# Forward pass with autograd disabled: inference only, no gradients are tracked
with torch.no_grad():
    outputs = model1(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
    )

In [None]:
# Highest-scoring class index for each email
predicted_labels = outputs.logits.argmax(dim=1).tolist()

# Class indices 0, 1, 2 correspond to priority levels 1, 2, 3
priority_map = {class_index: class_index + 1 for class_index in range(3)}
df['predicted_priority'] = pd.Series(predicted_labels, index=df.index).map(priority_map)

# Show sender, date, body and the assigned priority
print(df[['From', 'Date', 'email_body', 'predicted_priority']])


                                  From                             Date  \
0                  placement@nmamit.in  Mon, 18 Mar 2024 12:10:10 +0530   
1            updates@academia-mail.com   Wed, 3 Apr 2024 00:28:09 +0000   
2                   circular@nmamit.in  Thu, 21 Mar 2024 12:44:07 +0530   
3                  placement@nmamit.in  Fri, 22 Mar 2024 16:33:34 +0530   
4                  placement@nmamit.in  Wed, 17 Apr 2024 14:01:00 +0530   
..                                 ...                              ...   
295  forms-receipts-noreply@google.com  Sat, 16 Mar 2024 05:42:09 +0000   
296                 circular@nmamit.in  Mon, 19 Feb 2024 15:19:13 +0530   
297                team@mail.notion.so  Fri, 13 Oct 2023 10:38:03 +0000   
298                 circular@nmamit.in  Tue, 26 Sep 2023 09:17:14 +0530   
299                placement@nmamit.in  Tue, 16 May 2023 15:56:11 +0530   

                                            email_body  predicted_priority  
0    Dear students, Wa

In [None]:
# Write the working frame to modelout1.csv, omitting the row index.
df.to_csv('modelout1.csv', index=False)

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
import torch

# Tokenizer and bare BERT encoder from the same checkpoint; the encoder is moved to the GPU
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to('cuda')

# Read the mail export and keep its first 300 rows as `df`
data = pd.read_csv('/content/sample_data/maildataimp.csv')
df = pd.DataFrame(data).iloc[:300]
df.head()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Unnamed: 0,ThreadID,Messages,From,Date,email_body
0,<CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DBga...,['<CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DB...,placement@nmamit.in,"Mon, 18 Mar 2024 12:10:10 +0530","Dear students, Warm greetings from Team Abhyud..."
1,<0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2bd...,['<0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2...,updates@academia-mail.com,"Wed, 3 Apr 2024 00:28:09 +0000","You read the paper <a href=""https://www.academ..."
2,<be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit.in>,['<be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit...,circular@nmamit.in,"Thu, 21 Mar 2024 12:44:07 +0530","Dear all, Greetings from the Department of Lib..."
3,<CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVHNV...,['<CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVH...,placement@nmamit.in,"Fri, 22 Mar 2024 16:33:34 +0530","Dear students, Warm Greetings from Team Abhyud..."
4,<CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYMU...,['<CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FY...,placement@nmamit.in,"Wed, 17 Apr 2024 14:01:00 +0530","Dear students, Warm Greetings from Team Abhyud..."


In [None]:
# Count email bodies with no usable text. pandas loads empty CSV cells as NaN,
# so an equality test against '' alone never sees them; treat NaN as well as
# empty or whitespace-only strings as missing.
# NOTE: the printed label says column 'A'; the column inspected is 'email_body'.
total_null_in_column_A = (
    df['email_body'].isna()
    | df['email_body'].astype(str).str.strip().eq('')
).sum()
print(f"Total null values in column 'A': {total_null_in_column_A}")

# Calculate total rows in a DataFrame
total_rows = len(df)
print(f"Total rows in DataFrame: {total_rows}")


Total null values in column 'A': 0
Total rows in DataFrame: 300


In [None]:
# One fixed-size vector per email: the mean of BERT's last-layer token vectors.
embeddings = []
with torch.no_grad():
    for body in data['email_body']:
        token_ids = tokenizer.encode(body, add_special_tokens=True, max_length=512, truncation=True)
        batch = torch.tensor([token_ids]).to('cuda')  # batch of one sequence
        hidden_states = model(batch).last_hidden_state
        pooled = hidden_states.mean(dim=1).squeeze(0)  # average over the token axis
        embeddings.append(pooled.cpu().numpy())

In [None]:
# Partition the embeddings into three clusters; the fixed seed keeps runs repeatable.
kmeans = KMeans(n_clusters=3, random_state=42).fit(embeddings)
clusters = kmeans.labels_

# Keep the raw cluster id next to each email
data['cluster'] = clusters

# Translate cluster ids into priority levels: cluster 0 -> 2, 1 -> 1, 2 -> 3.
# Adjust this table after inspecting what each cluster actually contains.
cluster_map = {0: 2, 1: 1, 2: 3}
data['predicted_priority'] = data['cluster'].map(cluster_map)

# Show sender, date, body and the derived priority
print(data[['From', 'Date', 'email_body', 'predicted_priority']])



                          From                             Date  \
0          placement@nmamit.in  Mon, 18 Mar 2024 12:10:10 +0530   
1    updates@academia-mail.com   Wed, 3 Apr 2024 00:28:09 +0000   
2           circular@nmamit.in  Thu, 21 Mar 2024 12:44:07 +0530   
3          placement@nmamit.in  Fri, 22 Mar 2024 16:33:34 +0530   
4          placement@nmamit.in  Wed, 17 Apr 2024 14:01:00 +0530   
..                         ...                              ...   
635         circular@nmamit.in  Thu, 10 Aug 2023 11:39:40 +0530   
636       asha.s@teachnook.com   Sat, 8 Jul 2023 17:18:04 +0530   
637        placement@nmamit.in  Fri, 13 Oct 2023 14:35:12 +0530   
638        team@mail.notion.so  Mon, 16 Oct 2023 07:04:58 +0000   
639      13484@yenepoya.edu.in   Wed, 3 Jan 2024 21:45:00 +0530   

                                            email_body  predicted_priority  
0    Dear students, Warm greetings from Team Abhyud...                   2  
1    You read the paper <a href="https://

In [None]:
# Re-apply the cluster -> priority lookup to the 'cluster' column.
cluster_map = {0: 2, 1: 1, 2: 3}  # cluster 0 -> priority 2, 1 -> 1, 2 -> 3
data['predicted_priority'] = data['cluster'].map(cluster_map)

NameError: name 'data' is not defined

In [None]:
# Reload the mail export into `d1f`.
# NOTE(review): `d1f` is not referenced by the surrounding cells, which use `data`/`df`;
# this may be a typo for one of those names — confirm.
d1f=pd.read_csv('/content/sample_data/maildataimp.csv')

In [None]:
# The column count equals the number of column labels.
num_columns = len(data.columns)
print("Number of columns in 'data':", num_columns)

Number of columns in 'data': 7


In [None]:
# List the column labels of `data`.
print("Column names:", list(data.columns))

Column names: ['ThreadID', 'Messages', 'From', 'Date', 'email_body', 'cluster', 'predicted_priority']


In [None]:
# Preview the first rows of `data`.
data.head()

Unnamed: 0,ThreadID,Messages,From,Date,email_body,cluster,predicted_priority
0,<CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DBga...,['<CADy1pOb-JRkLQsMT+GtXoF_C=a7Nb3F3toAWxTU+DB...,placement@nmamit.in,"Mon, 18 Mar 2024 12:10:10 +0530","Dear students, Warm greetings from Team Abhyud...",1,2
1,<0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2bd...,['<0100018ea15b2778-f2a37fb0-96d9-49d2-b416-d2...,updates@academia-mail.com,"Wed, 3 Apr 2024 00:28:09 +0000","You read the paper <a href=""https://www.academ...",0,1
2,<be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit.in>,['<be8259d6-97e9-4188-99d1-185e1c3c353f@nmamit...,circular@nmamit.in,"Thu, 21 Mar 2024 12:44:07 +0530","Dear all, Greetings from the Department of Lib...",1,2
3,<CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVHNV...,['<CADy1pOaKZA6J+61fTM+yGoZkrcZ2cuAvUed6pjuSVH...,placement@nmamit.in,"Fri, 22 Mar 2024 16:33:34 +0530","Dear students, Warm Greetings from Team Abhyud...",1,2
4,<CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FYMU...,['<CADy1pOYSgssH+9fDui_rRmMw5Kp7MrJmgky_ycW7FY...,placement@nmamit.in,"Wed, 17 Apr 2024 14:01:00 +0530","Dear students, Warm Greetings from Team Abhyud...",0,1


In [None]:
# Write `data` to mailout3.csv (relative to the working directory), omitting the row index.
data.to_csv('mailout3.csv', index=False)

In [None]:
import pandas as pd
# Load the clustered results.
# NOTE(review): the export cell writes 'mailout3.csv' with a relative path — confirm the
# file really exists under /content/sample_data/.
data1=pd.read_csv('/content/sample_data/mailout3.csv')
# Swap priority levels 1 and 2. The result is assigned back to the column:
# calling replace(..., inplace=True) on a selected column is chained assignment,
# which pandas deprecates and which leaves `data1` unchanged under Copy-on-Write.
data1['predicted_priority'] = data1['predicted_priority'].replace({1: 2, 2: 1})
data1.to_csv('mailout4.csv', index=False)


In [None]:
# Count the rows assigned to one priority level. The lookup has always used level 3,
# while the message claimed level 1; the message now reports the level that is counted.
# The variable keeps its historical name so cells referring to it keep working.
target_priority = 3
count_priority_1 = data1['predicted_priority'].value_counts().get(target_priority, 0)

print(f"Number of fields in the 'priority' column with value {target_priority}:", count_priority_1)

Number of fields in the 'priority' column with value 1: 149


In [None]:
# List the transformers cache directory under the home directory.
ls ~/.cache/huggingface/transformers


ls: cannot access '/root/.cache/huggingface/transformers': No such file or directory


In [None]:

import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import torch
import numpy as np
from scipy.stats import entropy

# Tokenizer and bare BERT encoder (moved to the GPU) used to embed email bodies
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.to('cuda')

# Read the labeled set first, then the unlabeled pool
labeled_data, unlabeled_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/sample_data/marged.csv',  # Your labeled dataset
        '/content/sample_data/maildataimp copy.csv',  # Unlabeled dataset
    )
)

# Function to get BERT embeddings
def get_embeddings(texts):
    """Embed each text as the mean of BERT's last-layer token vectors.

    Uses the notebook-level ``tokenizer`` and ``model``. Each text is tokenized
    on its own (special tokens added, truncated to 512 tokens), so no padding
    takes part in the average.

    Args:
        texts: Iterable of email bodies. Missing values (None/NaN, which pandas
            produces for empty CSV cells) are embedded as the empty string;
            other non-string values are converted with ``str``.

    Returns:
        A NumPy array with one embedding row per input text.
    """
    # Follow whatever device the model lives on instead of hard-coding 'cuda'.
    device = model.device
    embeddings = []
    with torch.no_grad():
        for text in texts:
            if not isinstance(text, str):
                # tokenizer.encode rejects floats such as NaN, so normalise first.
                text = '' if pd.isna(text) else str(text)
            tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
            input_ids = torch.tensor([tokenized_text], device=device)  # batch of one
            last_hidden_state = model(input_ids).last_hidden_state
            mean_embedding = last_hidden_state.mean(dim=1).squeeze(0)
            embeddings.append(mean_embedding.cpu().numpy())
    return np.array(embeddings)

# Get embeddings for labeled data
X_labeled = get_embeddings(labeled_data['email_body'])
y_labeled = labeled_data['label']

# Train initial model
classifier = RandomForestClassifier()
classifier.fit(X_labeled, y_labeled)


def _prompt_for_label():
    """Ask the annotator for a label until the answer parses as an integer."""
    while True:
        answer = input("Assign the true label (e.g., 1, 2, 3): ")
        try:
            return int(answer)
        except ValueError:
            print(f"'{answer}' is not an integer label, please try again.")


# Embed the unlabeled pool once. Rows of X_unlabeled stay aligned by position with
# rows of unlabeled_data, so nothing has to be re-embedded inside the loop.
unlabeled_data = unlabeled_data.reset_index(drop=True)
X_unlabeled = get_embeddings(unlabeled_data['email_body'])

# Active learning loop
for iteration in range(5):  # Loop 5 times or as needed
    if unlabeled_data.empty:
        break  # the pool is exhausted, nothing left to label

    # Predict probabilities on unlabeled data
    probs = classifier.predict_proba(X_unlabeled)

    # Calculate uncertainty (using entropy here as an example)
    uncertainties = entropy(probs, axis=1)

    # Positions of the (up to) 10 most uncertain samples
    uncertain_indices = np.argsort(uncertainties)[-10:]
    # Copy so that adding the label column below does not write into a slice of the pool.
    uncertain_samples = unlabeled_data.iloc[uncertain_indices].copy()

    # Manually label these samples
    true_labels = []
    for _, row in uncertain_samples.iterrows():
        print(f"\nEmail Body:\n{row['email_body']}")
        true_labels.append(_prompt_for_label())

    # Store the answers in 'label', the column the classifier is trained on.
    uncertain_samples['label'] = true_labels

    # Add newly labeled data (and their already computed embeddings) to the labeled set
    labeled_data = pd.concat([labeled_data, uncertain_samples])
    X_labeled = np.vstack([X_labeled, X_unlabeled[uncertain_indices]])
    y_labeled = labeled_data['label']

    # Remove the labeled samples from the pool by position, keeping frame and matrix aligned
    keep = np.ones(len(unlabeled_data), dtype=bool)
    keep[uncertain_indices] = False
    unlabeled_data = unlabeled_data[keep].reset_index(drop=True)
    X_unlabeled = X_unlabeled[keep]

    # Retrain the model
    classifier.fit(X_labeled, y_labeled)

    # Evaluate the model (optional). This is accuracy on the training data itself,
    # so it is not an estimate of generalisation.
    accuracy = accuracy_score(y_labeled, classifier.predict(X_labeled))
    print(f'Iteration {iteration + 1}, Accuracy: {accuracy}')

# Final model is trained with active learning


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


Email Body:
AICTE ACTIVITY POINT PROGRAMME * CIRCULAR *(05-09-2023) All the students of III year B.E. have to complete their activity given below earn Activity points during V and VI Semester. This is performed any time semester weekends holidays (80 - 90 hours) in own place/village/town on various activities collecting data from neighbourhood. They must submit a report prescribed format attached (maximum two pages only) respective Department NSS Coordinator through Class Representative before *30th January 2024*. Check attachment for more details. Dr. Janardhana Nayak Professor Chemistry, NMAM Institute Technology, Nitte-574 110, Karkala Taluk, Udupi District, Karnataka, India Mobile No. +91 9448101750 Email: jnayak@nitte.edu.in (05-09-2023) 30th 2024.
AICTE ACTIVITY POINT PROGRAMME * CIRCULAR (05-09-2022)* All the students of III year B.E. have to complete their activity given below earn Activity points during V and VI Semester. This is performed any time semester weekends holidays 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uncertain_samples['true_label'] = true_labels


ValueError: Input contains NaN

In [None]:
from transformers import BertTokenizer

# Force a fresh download of the tokenizer files, overriding any cached copies.
# (cache_dir=None only selects the default cache location; it does not bypass the cache.)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', force_download=True)


OSError: Can't load tokenizer for 'bert-base-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'bert-base-uncased' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import torch
import numpy as np
from scipy.stats import entropy

# Tokenizer and bare BERT encoder (moved to the GPU) used to embed email bodies
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.to('cuda')

# Read the labeled set first, then the unlabeled pool
labeled_data, unlabeled_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/sample_data/marged.csv',  # Your labeled dataset
        '/content/sample_data/maildataimp copy.csv',  # Unlabeled dataset
    )
)

def clean_labels(df):
    """Return a copy of ``df`` keeping only rows whose 'label' is numeric.

    Labels are parsed with ``pd.to_numeric``; rows whose label cannot be parsed
    (or is missing) are dropped, and the surviving labels are cast to ``int``.
    The input frame is left untouched and the original index values are kept.

    Args:
        df: DataFrame with a 'label' column.

    Returns:
        A new DataFrame with integer labels.
    """
    # Convert to numeric, forcing errors to NaN
    numeric_labels = pd.to_numeric(df['label'], errors='coerce')
    valid = numeric_labels.notna()
    # Work on an explicit copy so the caller's frame is not modified and the
    # column assignment below does not write into a slice.
    cleaned = df.loc[valid].copy()
    cleaned['label'] = numeric_labels[valid].astype(int)
    return cleaned

# Replace labeled_data with the frame returned by clean_labels.
labeled_data = clean_labels(labeled_data)


# Function to get BERT embeddings
def get_embeddings(texts):
    """Embed each text as the mean of BERT's last-layer token vectors.

    Uses the notebook-level ``tokenizer`` and ``model``. Each text is tokenized
    on its own (special tokens added, truncated to 512 tokens), so no padding
    takes part in the average.

    Args:
        texts: Iterable of email bodies. Missing values (None/NaN, which pandas
            produces for empty CSV cells) are embedded as the empty string;
            other non-string values are converted with ``str``.

    Returns:
        A NumPy array with one embedding row per input text.
    """
    # Follow whatever device the model lives on instead of hard-coding 'cuda'.
    device = model.device
    embeddings = []
    with torch.no_grad():
        for text in texts:
            if not isinstance(text, str):
                # tokenizer.encode rejects floats such as NaN, so normalise first.
                text = '' if pd.isna(text) else str(text)
            tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
            input_ids = torch.tensor([tokenized_text], device=device)  # batch of one
            last_hidden_state = model(input_ids).last_hidden_state
            mean_embedding = last_hidden_state.mean(dim=1).squeeze(0)
            embeddings.append(mean_embedding.cpu().numpy())
    return np.array(embeddings)

# Get embeddings for labeled data
X_labeled = get_embeddings(labeled_data['email_body'])
y_labeled = labeled_data['label']

# Train initial model
classifier = RandomForestClassifier()
classifier.fit(X_labeled, y_labeled)


def _prompt_for_label():
    """Ask the annotator for a label until the answer parses as an integer."""
    while True:
        answer = input("Assign the true label (e.g., 1, 2, 3): ")
        try:
            return int(answer)
        except ValueError:
            print(f"'{answer}' is not an integer label, please try again.")


# Embed the unlabeled pool once. Rows of X_unlabeled stay aligned by position with
# rows of unlabeled_data, so nothing has to be re-embedded inside the loop.
unlabeled_data = unlabeled_data.reset_index(drop=True)
X_unlabeled = get_embeddings(unlabeled_data['email_body'])

# Active learning loop
for iteration in range(5):  # Loop 5 times or as needed
    if unlabeled_data.empty:
        break  # the pool is exhausted, nothing left to label

    # Predict probabilities on unlabeled data
    probs = classifier.predict_proba(X_unlabeled)

    # Calculate uncertainty (using entropy here as an example)
    uncertainties = entropy(probs, axis=1)

    # Positions of the (up to) 10 most uncertain samples
    uncertain_indices = np.argsort(uncertainties)[-10:]
    # Copy so that adding the label column below does not write into a slice of the pool.
    uncertain_samples = unlabeled_data.iloc[uncertain_indices].copy()

    # Manually label these samples. The row index is deliberately not bound to the
    # outer loop variable, otherwise the iteration number printed below is wrong.
    true_labels = []
    for _, row in uncertain_samples.iterrows():
        print(f"\nEmail Body:\n{row['email_body']}")
        true_labels.append(_prompt_for_label())

    # Add the manually inputted labels to the uncertain samples dataframe
    uncertain_samples['label'] = true_labels

    # Add newly labeled data (and their already computed embeddings) to the labeled set
    labeled_data = pd.concat([labeled_data, uncertain_samples])
    X_labeled = np.vstack([X_labeled, X_unlabeled[uncertain_indices]])
    y_labeled = labeled_data['label']

    # Remove the labeled samples from the pool by position, keeping frame and matrix aligned
    keep = np.ones(len(unlabeled_data), dtype=bool)
    keep[uncertain_indices] = False
    unlabeled_data = unlabeled_data[keep].reset_index(drop=True)
    X_unlabeled = X_unlabeled[keep]

    # Retrain the model
    classifier.fit(X_labeled, y_labeled)

    # Optional evaluation. This is accuracy on the training data itself; use a separate
    # validation set for an estimate of generalisation.
    y_pred = classifier.predict(X_labeled)
    accuracy = accuracy_score(y_labeled, y_pred)
    print(f'Iteration {iteration + 1}, Accuracy: {accuracy}')

# Final model is trained with active learning


OSError: Can't load tokenizer for 'bert-base-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'bert-base-uncased' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [None]:
# Preview the first five feature rows
print("First 5 elements of X_labeled:")
print(X_labeled[0:5])

# Preview the first five labels
print("First 5 elements of y_labeled:")
print(y_labeled[0:5])


First 5 elements of X_labeled:


NameError: name 'X_labeled' is not defined

In [None]:
import pandas as pd
# Score a fresh copy of the unlabeled mail export with the trained classifier
new_unlabeled_data = pd.read_csv('/content/sample_data/maildataimp copy.csv')
X_new_unlabeled = get_embeddings(new_unlabeled_data['email_body'])
predictions = classifier.predict(X_new_unlabeled)

# Pair every email body with its predicted label
results_df = new_unlabeled_data[['email_body']].assign(predictions=predictions)

# Display the DataFrame
print(results_df.head())

NameError: name 'get_embeddings' is not defined