In [None]:
!pip install bertopic

In [None]:
from bertopic import BERTopic

In [7]:
import pandas as pd

# Sample text data
texts = [
    "I love playing sports and staying active.",
    "The economy is experiencing a downturn.",
    "The latest movie is a must-watch for all film enthusiasts.",
    "I enjoy reading books and learning new things."
]

# Create DataFrame
data = pd.DataFrame({"text": texts})

data


Unnamed: 0,text
0,I love playing sports and staying active.
1,The economy is experiencing a downturn.
2,The latest movie is a must-watch for all film ...
3,I enjoy reading books and learning new things.


In [9]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(data['text'])



TypeError: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.

In [None]:
from bertopic import BERTopic
import pandas as pd


# Create a BERTopic instance
topic_model = BERTopic()

# Fit the BERTopic model
topics, _ = topic_model.fit_transform(data["text"])

# Get the dominant topics and their associated keywords
topics_keywords = topic_model.get_topic_info()

# Print the topics and their keywords
for topic_id, topic_info in topics_keywords.iterrows():
    print(f"Topic {topic_id}: {topic_info['Keywords']}")

# Assign topics to documents
topics_assigned = topic_model.transform(data["text"])

# Print the topics assigned to each document
for doc_id, topic_id in enumerate(topics_assigned):
    print(f"Document {doc_id}: Topic {topic_id}")

In [1]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
 
docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']

#topic_model = BERTopic()
#topics, probs = topic_model.fit_transform(docs)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
topic_model.fit_transform(docs)(docs[0:5])

KeyboardInterrupt: 

In [10]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer, AdamW

# Step 1: Dataset Preparation

train_data = {
    'text': [
        "I loved the movie. It was amazing!",
        "The acting was superb.",
        "The plot twist at the end caught me by surprise.",
        "I would highly recommend this film.",
        "The movie was a bit slow in the beginning, but it picked up later."
    ],
    'labels': [1, 1, 1, 1, 0]  # 1 represents positive sentiment, 0 represents negative sentiment
}

val_data = {
    'text': [
        "The cinematography was excellent.",
        "I didn't enjoy the movie. It was boring.",
        "The characters were well-developed.",
        "The story lacked depth.",
        "The film had a great soundtrack."
    ],
    'labels': [1, 0, 1, 0, 1]
}

test_data = {
    'text': [
        "The movie exceeded my expectations.",
        "I found the film to be disappointing.",
        "The special effects were impressive.",
        "The movie didn't live up to the hype.",
        "I was completely engrossed in the storyline."
    ],
    'labels': [1, 0, 1, 0, 1]
}

# Step 2: Tokenization

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step 3: Preparing Input Features

def tokenize_data(data):
    return tokenizer.batch_encode_plus(
        data,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

train_encodings = tokenize_data(train_data['text'])
val_encodings = tokenize_data(val_data['text'])
test_encodings = tokenize_data(test_data['text'])

train_labels = torch.tensor(train_data['labels'])
val_labels = torch.tensor(val_data['labels'])
test_labels = torch.tensor(test_data['labels'])

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Step 4: Model Architecture

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 5: Fine-tuning

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = AdamW(model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

# Train the model for multiple epochs
for epoch in range(5):
    train_loss = train_epoch(model, train_loader, optimizer)
    print(f"Epoch {epoch+1} - Train Loss: {train_loss}")

# Step 6: Evaluation

def evaluate(model, data_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)

            total += labels.size(0)
            correct += (predictions == labels).sum().item()

    return correct / total

# Evaluate on the validation set
val_accuracy = evaluate(model, val_loader)
print(f"Validation Accuracy: {val_accuracy}")

# Evaluate on the test set
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
test_accuracy = evaluate(model, test_loader)
print(f"Test Accuracy: {test_accuracy}")


Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████| 232k/232k [00:00<00:00, 676kB/s]
Downloading (…)okenizer_config.json: 100%|██████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 5.60kB/s]
Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████████| 570/570 [00:00<00:00, 114kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████████████████████████████████████| 440M/440M [00:11<00:00, 39.0MB/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trai

Epoch 1 - Train Loss: 0.7631610631942749
Epoch 2 - Train Loss: 0.6757266521453857
Epoch 3 - Train Loss: 0.6390253305435181
Epoch 4 - Train Loss: 0.5525497198104858
Epoch 5 - Train Loss: 0.4450497627258301
Validation Accuracy: 0.6
Test Accuracy: 0.6


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer, AdamW

# Step 1: Dataset Preparation

train_data = {
    'text': [
        "I loved the movie. It was amazing!",
        "The acting was superb.",
        "The plot twist at the end caught me by surprise.",
        "I would highly recommend this film.",
        "The movie was a bit slow in the beginning, but it picked up later."
    ],
    'labels': [1, 1, 1, 1, 0]  # 1 represents positive sentiment, 0 represents negative sentiment
}

val_data = {
    'text': [
        "The cinematography was excellent.",
        "I didn't enjoy the movie. It was boring.",
        "The characters were well-developed.",
        "The story lacked depth.",
        "The film had a great soundtrack."
    ],
    'labels': [1, 0, 1, 0, 1]
}

test_data = {
    'text': [
        "The movie exceeded my expectations.",
        "I found the film to be disappointing.",
        "The special effects were impressive.",
        "The movie didn't live up to the hype.",
        "I was completely engrossed in the storyline."
    ],
    'labels': [1, 0, 1, 0, 1]
}

# Step 2: Tokenization

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step 3: Preparing Input Features

def tokenize_data(data):
    return tokenizer.batch_encode_plus(
        data,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

train_encodings = tokenize_data(train_data['text'])
val_encodings = tokenize_data(val_data['text'])
test_encodings = tokenize_data(test_data['text'])

train_labels = torch.tensor(train_data['labels'])
val_labels = torch.tensor(val_data['labels'])
test_labels = torch.tensor(test_data['labels'])

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Step 4: Model Architecture

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 5: Fine-tuning

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = AdamW(model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

# Train the model for multiple epochs
for epoch in range(5):
    train_loss = train_epoch(model, train_loader, optimizer)
    print(f"Epoch {epoch+1}


In [None]:
https://github.com/prateekjoshi565/Fine-Tuning-BERT/blob/master/Fine_Tuning_BERT_for_Spam_Classification.ipynb

In [11]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import TinyBertForSequenceClassification, TinyBertTokenizer, AdamW

# Step 1: Dataset Preparation

train_data = {
    'text': [
        "I loved the movie. It was amazing!",
        "The acting was superb.",
        "The plot twist at the end caught me by surprise.",
        "I would highly recommend this film.",
        "The movie was a bit slow in the beginning, but it picked up later."
    ],
    'labels': [1, 1, 1, 1, 0]  # 1 represents positive sentiment, 0 represents negative sentiment
}

val_data = {
    'text': [
        "The cinematography was excellent.",
        "I didn't enjoy the movie. It was boring.",
        "The characters were well-developed.",
        "The story lacked depth.",
        "The film had a great soundtrack."
    ],
    'labels': [1, 0, 1, 0, 1]
}

test_data = {
    'text': [
        "The movie exceeded my expectations.",
        "I found the film to be disappointing.",
        "The special effects were impressive.",
        "The movie didn't live up to the hype.",
        "I was completely engrossed in the storyline."
    ],
    'labels': [1, 0, 1, 0, 1]
}

# Step 2: Tokenization

tokenizer = TinyBertTokenizer.from_pretrained('bert-base-uncased')

# Step 3: Preparing Input Features

def tokenize_data(data):
    return tokenizer.batch_encode_plus(
        data,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

train_encodings = tokenize_data(train_data['text'])
val_encodings = tokenize_data(val_data['text'])
test_encodings = tokenize_data(test_data['text'])

train_labels = torch.tensor(train_data['labels'])
val_labels = torch.tensor(val_data['labels'])
test_labels = torch.tensor(test_data['labels'])

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Step 4: Model Architecture

model = TinyBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 5: Fine-tuning

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = AdamW(model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

# Train the model for multiple epochs


ImportError: cannot import name 'TinyBertForSequenceClassification' from 'transformers' (C:\Users\vikra\anaconda3\envs\new\lib\site-packages\transformers\__init__.py)

In [None]:
import re

email_text = "Morning BSC,_x000D_\n_x000D_\n _x000D_\n_x000D_AnCan you please update the customer type for BLVD Construction Inc. from Co mmercial to small business. This was authorized in March 2021. Please see SDR below for authorization. x000D_\n_x000D_\n _x0 00D_\n_x000D_\n_x000D_\n_x000D_\n _x000D_\n_x000D_\nhttps://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx? RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlvd%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20D=als%2FTransfer7.20t o%20Small%20Business&FolderCTID=Ox0120003A06EA04D6EED8469261EACEBO8C83FE&View=%7B55641070%2D726F%2D4881%2DAFBD%2D4C1A19B6613 8%7D <https://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx?RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlv d%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20Deals%2FTransfer%20to%20Small%20Business&FolderCTID=0x0120003A06E A04D6EBD8469261EACEBO8C83FE&View7%7B55B41070%2D726F%2D4881%2DAFBD%2D4C1A19B66138%7D> _x000D_\n_x000D_\n _x000D_\n_x000D_\n _ x000D_\n_x000D_\nThank you,_x00610_\n_x000D_\n _x000D_\n_x000D_\n _x000D_\n_x000D_\nNicole Logtenberg 1They/Theml Client Sery ice Associate_x000D_\n   x000D _ \n _x000D \n _x000D \n_x000D_\n_x000D_\nScotiabank 1 Commercial Banking Distribution_x000D_\n_x000D_\n2 Robert Speck Parkway, Mississauga Executive Centre, 4th Fl oor _x000D_\n_x000D_\nflississauga, ON L4Z1H8 Canada _x000D_\n_x000D_\nT n/a M n/a_x000D_\n_x000D_\nnicole.logtenberg@scoti abank.com <mailto:nicole.logtenberg@scotiabank.com> _x000D_\n_x000D_\nwww.scotiabank.com <https://owa.scotiamail.bns/owa/red ir.aspx?C=E1319A6qpB0GBkyogT8EbXYh6hUfHwNEIgLPPQJrM2FTEKz8rWLn3DnWezOo2s6MYDuciDE53hilSg.8tURL=file%3a%2f%2f%2fC%3a%2fUsers%2fs4 774757%2fAppData%2-fRoaming%2-Ftlicroso-Ft%2fWord%2fwww. scotiabank. corn> _x000D_\n_x000D_\nScotiabank is a business name used by The Bank of Nova Scotia_x000D_\n_x000D_\n _x000D_\n"

# Regular expression pattern
pattern = r"Morning BSC,_x000D_\n_x000D_\n _x000D_\n_x000D_An(.*?)from Co mmercial to small business\.(.*?)Please see SDR below for authorization\._x000D_\n_x000D_\n _x0 00D_\n_x000D_\n_x000D_\n_x000D_\n _x000D_\n_x000D_\n(.*?)<.*?> _x000D_\n_x000D_\n _x000D_\n_x000D_\n _ x000D_\n_x000D_\nThank you,_x00610_\n_x000D_\n _x000D_\n_x000D_\n _x000D_\n_x000D_\n(.*?)1They/Theml Client Sery ice Associate_x000D_\n   x000D _ \n _x000D \n(.*?)1 Commercial Banking Distribution_x000D_\n_x000D_\n2 Robert Speck Parkway, Mississauga Executive Centre, 4th Fl oor _x000D_\n_x000D_\nflississauga, ON L4Z1H8 Canada _x000D_\n_x000D_\nT n/a M n/a_x000D_\n_x000D_\n(.*?)@scoti abank\.com <mailto:(.*?)> _x000D_\n_x000D_\n(.*?)www\.scotiabank\.com <.*?> _x000D_\n_x000D_\nScotiabank is a business name used by The Bank of Nova Scotia_x000D_\n_x000D_\n _x000D_\n"

# Clean the email text
cleaned_text = re.sub(pattern, "", email_text)

print(cleaned_text)


In [12]:
import re

def clean_email_text(email_text):
    # Remove email addresses
    email_text = re.sub(r'\S+@\S+', '', email_text)
    
    # Remove website addresses
    email_text = re.sub(r'http\S+|www\.\S+', '', email_text)
    
    return email_text

# Example usage
email_text = "Morning BSC,_x000D_\n_x000D_\n _x000D_\n_x000D_AnCan you please update the customer type for BLVD Construction Inc. from Co mmercial to small business. This was authorized in March 2021. Please see SDR below for authorization. x000D_\n_x000D_\n _x0 00D_\n_x000D_\n_x000D_\n_x000D_\n _x000D_\n_x000D_\nhttps://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx? RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlvd%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20D=als%2FTransfer7.20t o%20Small%20Business&FolderCTID=Ox0120003A06EA04D6EED8469261EACEBO8C83FE&View=%7B55641070%2D726F%2D4881%2DAFBD%2D4C1A19B6613 8%7D <https://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx?RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlv d%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20Deals%2FTransfer%20to%20Small%20Business&FolderCTID=0x0120003A06E A04D6EBD8469261EACEBO8C83FE&View7%7B55B41070%2D726F%2D4881%2DAFBD%2D4C1A19B66138%7D> _x000D_\n_x000D_\n _x000D_\n_x000D_\n _ x000D_\n_x000D_\nThank you,_x00610_\n_x000D_\n _x000D_\n_x000D_\n _x000D_\n_x000D_\nNicole Logtenberg 1They/Theml Client Sery ice Associate_x000D_\n   x000D _ \n _x000D \n _x000D \n_x000D_\n_x000D_\nScotiabank 1 Commercial Banking Distribution_x000D_\n_x000D_\n2 Robert Speck Parkway, Mississauga Executive Centre, 4th Fl oor _x000D_\n_x000D_\nflississauga, ON L4Z1H8 Canada _x000D_\n_x000D_\nT n/a M n/a_x000D_\n_x000D_\nnicole.logtenberg@scoti abank.com <mailto:nicole.logtenberg@scotiabank.com> _x000D_\n_x000D_\nwww.scotiabank.com <https://owa.scotiamail.bns/owa/red ir.aspx?C=E1319A6qpB0GBkyogT8EbXYh6hUfHwNEIgLPPQJrM2FTEKz8rWLn3DnWezOo2s6MYDuciDE53hilSg.8tURL=file%3a%2f%2f%2fC%3a%2fUsers%2fs4 774757%2fAppData%2-fRoaming%2-Ftlicroso-Ft%2fWord%2fwww. scotiabank. corn> _x000D_\n_x000D_\nScotiabank is a business name used by The Bank of Nova Scotia_x000D_\n_x000D_\n _x000D_\n"


cleaned_text = clean_email_text(email_text)
print(cleaned_text)


Morning BSC,_x000D_
_x000D_
 _x000D_
_x000D_AnCan you please update the customer type for BLVD Construction Inc. from Co mmercial to small business. This was authorized in March 2021. Please see SDR below for authorization. x000D_
_x000D_
 _x0 00D_
_x000D_
_x000D_
_x000D_
 _x000D_
_x000D_
 RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlvd%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20D=als%2FTransfer7.20t o%20Small%20Business&FolderCTID=Ox0120003A06EA04D6EED8469261EACEBO8C83FE&View=%7B55641070%2D726F%2D4881%2DAFBD%2D4C1A19B6613 8%7D < d%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20Deals%2FTransfer%20to%20Small%20Business&FolderCTID=0x0120003A06E A04D6EBD8469261EACEBO8C83FE&View7%7B55B41070%2D726F%2D4881%2DAFBD%2D4C1A19B66138%7D> _x000D_
_x000D_
 _x000D_
_x000D_
 _ x000D_
_x000D_
Thank you,_x00610_
_x000D_
 _x000D_
_x000D_
 _x000D_
_x000D_
Nicole Logtenberg 1They/Theml Client Sery ice Associate_x000D_
   x000D _ 
 _x000D 
 _x000D 
_x000D_
_x000D_
Scotiabank 1 Commerc

In [13]:
email_text

'Morning BSC,_x000D_\n_x000D_\n _x000D_\n_x000D_AnCan you please update the customer type for BLVD Construction Inc. from Co mmercial to small business. This was authorized in March 2021. Please see SDR below for authorization. x000D_\n_x000D_\n _x0 00D_\n_x000D_\n_x000D_\n_x000D_\n _x000D_\n_x000D_\nhttps://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx? RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlvd%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20D=als%2FTransfer7.20t o%20Small%20Business&FolderCTID=Ox0120003A06EA04D6EED8469261EACEBO8C83FE&View=%7B55641070%2D726F%2D4881%2DAFBD%2D4C1A19B6613 8%7D <https://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx?RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlv d%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20Deals%2FTransfer%20to%20Small%20Business&FolderCTID=0x0120003A06E A04D6EBD8469261EACEBO8C83FE&View7%7B55B41070%2D726F%2D4881%2DAFBD%2D4C1A19B66138%7D> _x000D_\n_x000D_\n _x000D_\n_x000D_\n _ 

In [None]:
import re

def clean_email(email_text):
    # Remove the pattern '_x000D_\n' and its variations
    cleaned_text = re.sub(r'(_x000D_\\n\s*)+', '', email_text)
    
    # Remove any remaining '_x000D_' occurrences
    cleaned_text = cleaned_text.replace('_x000D_', '')
    
    return cleaned_text

# Example usage
email_text = "Morning BSC,_x000D_\n_x000D_\n _x000D_\n_x000D_AnCan you please update the customer type for BLVD Construction Inc. from Co mmercial to small business. This was authorized in March 2021. Please see SDR below for authorization. x000D_\n_x000D_\n _x0 00D_\n_x000D_\n_x000D_\n_x000D_\n _x000D_\n_x000D_\nhttps://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx? RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlvd%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20D=als%2FTransfer7.20t o%20Small%20Business&FolderCTID=Ox0120003A06EA04D6EED8469261EACEBO8C83FE&View=%7B55641070%2D726F%2D4881%2DAFBD%2D4C1A19B6613 8%7D <https://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx?RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlv d%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20Deals%2FTransfer%20to%20Small%20Business&FolderCTID=0x0120003A06E A04D6EBD8469261EACEBO8C83FE&View7%7B55B41070%2D726F%2D4881%2DAFBD%2D4C1A19B66138%7D> _x000D_\n_x000


In [14]:
import re

def clean_email(email_text):
    # Remove the pattern '_x000D_\n' and its variations
    cleaned_text = re.sub(r'(_x000D_\\n\s*)+', '', email_text)
    
    # Remove any remaining '_x000D_' occurrences
    cleaned_text = cleaned_text.replace('_x000D_', '')
    
    return cleaned_text

# Example usage
email_text = "Morning BSC,_x000D_\n_x000D_\n _x000D_\n_x000D_AnCan you please update the customer type for BLVD Construction Inc. from Co mmercial to small business. This was authorized in March 2021. Please see SDR below for authorization. x000D_\n_x000D_\n _x0 00D_\n_x000D_\n_x000D_\n_x000D_\n _x000D_\n_x000D_\nhttps://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx? RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlvd%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20D=als%2FTransfer7.20t o%20Small%20Business&FolderCTID=Ox0120003A06EA04D6EED8469261EACEBO8C83FE&View=%7B55641070%2D726F%2D4881%2DAFBD%2D4C1A19B6613 8%7D <https://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx?RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlv d%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20Deals%2FTransfer%20to%20Small%20Business&FolderCTID=0x0120003A06E A04D6EBD8469261EACEBO8C83FE&View7%7B55B41070%2D726F%2D4881%2DAFBD%2D4C1A19B66138%7D> _x000D_\n_x000D_\n _x000D_\n_x000D_\n _ x000D_\n_x000D_\nThank you,_x00610_\n_x000D_\n _x000D_\n_x000D_\n _x000D_\n_x000D_\nNicole Logtenberg 1They/Theml Client Sery ice Associate_x000D_\n   x000D _ \n _x000D \n _x000D \n_x000D_\n_x000D_\nScotiabank 1 Commercial Banking Distribution_x000D_\n_x000D_\n2 Robert Speck Parkway, Mississauga Executive Centre, 4th Fl oor _x000D_\n_x000D_\nflississauga, ON L4Z1H8 Canada _x000D_\n_x000D_\nT n/a M n/a_x000D_\n_x000D_\nnicole.logtenberg@scoti abank.com <mailto:nicole.logtenberg@scotiabank.com> _x000D_\n_x000D_\nwww.scotiabank.com <https://owa.scotiamail.bns/owa/red ir.aspx?C=E1319A6qpB0GBkyogT8EbXYh6hUfHwNEIgLPPQJrM2FTEKz8rWLn3DnWezOo2s6MYDuciDE53hilSg.8tURL=file%3a%2f%2f%2fC%3a%2fUsers%2fs4 774757%2fAppData%2-fRoaming%2-Ftlicroso-Ft%2fWord%2fwww. scotiabank. corn> _x000D_\n_x000D_\nScotiabank is a business name used by The Bank of Nova Scotia_x000D_\n_x000D_\n _x000D_\n"

cleaned_email = clean_email(email_text)
print(cleaned_email)


Morning BSC,

 
AnCan you please update the customer type for BLVD Construction Inc. from Co mmercial to small business. This was authorized in March 2021. Please see SDR below for authorization. x000D_

 _x0 00D_



 

https://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx? RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlvd%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20D=als%2FTransfer7.20t o%20Small%20Business&FolderCTID=Ox0120003A06EA04D6EED8469261EACEBO8C83FE&View=%7B55641070%2D726F%2D4881%2DAFBD%2D4C1A19B6613 8%7D <https://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx?RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlv d%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20Deals%2FTransfer%20to%20Small%20Business&FolderCTID=0x0120003A06E A04D6EBD8469261EACEBO8C83FE&View7%7B55B41070%2D726F%2D4881%2DAFBD%2D4C1A19B66138%7D> 

 

 _ x000D_

Thank you,_x00610_

 

 

Nicole Logtenberg 1They/Theml Client Sery ice Associate
   x000D _ 
 _x000D 
 _x

In [18]:
def remove_lines_with_percentage(text):
    lines = text.split("\n")
    lines_without_percentage = [line for line in lines if "%" not in line]
    cleaned_text = "\n".join(lines_without_percentage)
    return cleaned_text



cleaned_text = remove_lines_with_percentage(remove_text_after_thank_you(email_text))
print(cleaned_text)


Morning BSC,_x000D_
_x000D_
 _x000D_
_x000D_AnCan you please update the customer type for BLVD Construction Inc. from Co mmercial to small business. This was authorized in March 2021. Please see SDR below for authorization. x000D_
_x000D_
 _x0 00D_
_x000D_
_x000D_
_x000D_
 _x000D_
_x000D_
_x000D_
 _x000D_
_x000D_
 _ x000D_
_x000D_
Thank you


In [16]:
def remove_text_after_thank_you(text):
    patterns = ["thank you", "thanks"]
    for pattern in patterns:
        index = text.lower().find(pattern)
        if index != -1:
            text = text[:index + len(pattern)]
    return text

# Example usage
#email_text = "Morning BSC, ... Thank you for your assistance. Have a great day!"

cleaned_text = remove_text_after_thank_you(email_text)
print(cleaned_text)


Morning BSC,_x000D_
_x000D_
 _x000D_
_x000D_AnCan you please update the customer type for BLVD Construction Inc. from Co mmercial to small business. This was authorized in March 2021. Please see SDR below for authorization. x000D_
_x000D_
 _x0 00D_
_x000D_
_x000D_
_x000D_
 _x000D_
_x000D_
https://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx? RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlvd%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20D=als%2FTransfer7.20t o%20Small%20Business&FolderCTID=Ox0120003A06EA04D6EED8469261EACEBO8C83FE&View=%7B55641070%2D726F%2D4881%2DAFBD%2D4C1A19B6613 8%7D <https://scotiabank.sharepoint.com/sites/CBSDR21/tor/Forms/AllItems.aspx?RootFolder=%2Fsites%2FCBSDR21%2Ftor%2FB%2FBlv d%20Construction%20Inc%2F1%2DAnnual%20Review%20and%20New%20Deals%2FTransfer%20to%20Small%20Business&FolderCTID=0x0120003A06E A04D6EBD8469261EACEBO8C83FE&View7%7B55B41070%2D726F%2D4881%2DAFBD%2D4C1A19B66138%7D> _x000D_
_x000D_
 _x000D_
_x000D_
 _ x000D_
_x000D_
T

In [21]:
from gensim import corpora, models

def perform_tf_idf_lda_topic_modeling(documents, num_topics):
    # Create a dictionary from the documents
    dictionary = corpora.Dictionary(documents)
    
    # Create a corpus (vector representation of the documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    
    # Compute TF-IDF weights for the corpus
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    # Build the LDA model using TF-IDF weights
    lda_model = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=10)
    
    # Get the topics and their keywords
    topics = []
    for topic_id, topic in lda_model.print_topics(num_topics=num_topics):
        keywords = topic.split('"')[1::2]
        topics.append(keywords)
    
    return topics

# Example usage
documents = [
    ["apple", "banana", "fruit", "juice"],
    ["orange", "fruit", "juice"],
    ["apple", "fruit", "pie"],
    ["banana", "fruit"],
    ["orange", "juice"]
]

num_topics = 2

topics = perform_tf_idf_lda_topic_modeling(documents, num_topics)
for topic_id, keywords in enumerate(topics):
    print(f"Topic {topic_id+1}: {keywords}")


Topic 1: ['orange', 'juice', 'fruit', 'banana', 'apple', 'pie']
Topic 2: ['banana', 'apple', 'pie', 'fruit', 'juice', 'orange']


In [20]:
!pip install gensim


Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-win_amd64.whl (24.0 MB)
Collecting Cython==0.29.28
  Downloading Cython-0.29.28-py2.py3-none-any.whl (983 kB)
Installing collected packages: Cython, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.34
    Uninstalling Cython-0.29.34:
      Successfully uninstalled Cython-0.29.34
Successfully installed Cython-0.29.28 gensim-4.2.0


In [22]:
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    filtered_tokens = [token for token in tokens if token not in stop_words and token not in punctuation]
    
    return filtered_tokens

def perform_tf_idf_lda_topic_modeling(documents, num_topics):
    # Preprocess the documents
    processed_documents = [preprocess_text(doc) for doc in documents]
    
    # Create a dictionary from the processed documents
    dictionary = corpora.Dictionary(processed_documents)
    
    # Create a corpus (vector representation of the documents)
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]
    
    # Compute TF-IDF weights for the corpus
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    # Build the LDA model using TF-IDF weights
    lda_model = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=10)
    
    # Assign topics to each document
    document_topics = []
    for doc_bow in corpus:
        topic_dist = lda_model.get_document_topics(doc_bow)
        document_topics.append(topic_dist)
    
    return document_topics

# Example usage
documents = [
    "I love to play soccer",
    "Soccer is a popular sport",
    "I enjoy watching soccer matches",
    "Basketball is also a great sport",
    "I prefer soccer over basketball",
    "Music is my passion",
    "I play the guitar and piano",
    "Singing brings me joy",
    "I listen to different genres of music"
]

num_topics = 2

document_topics = perform_tf_idf_lda_topic_modeling(documents, num_topics)

# Print topics for each document
for doc_id, topics in enumerate(document_topics):
    print(f"Document {doc_id+1}:")
    for topic, prob in topics:
        print(f"Topic {topic+1} - Probability: {prob}")
    print()


Document 1:
Topic 1 - Probability: 0.8138322234153748
Topic 2 - Probability: 0.18616776168346405

Document 2:
Topic 1 - Probability: 0.8543345332145691
Topic 2 - Probability: 0.14566552639007568

Document 3:
Topic 1 - Probability: 0.8840864300727844
Topic 2 - Probability: 0.11591357737779617

Document 4:
Topic 1 - Probability: 0.8526530265808105
Topic 2 - Probability: 0.14734697341918945

Document 5:
Topic 1 - Probability: 0.176798015832901
Topic 2 - Probability: 0.8232020139694214

Document 6:
Topic 1 - Probability: 0.8209992051124573
Topic 2 - Probability: 0.17900077998638153

Document 7:
Topic 1 - Probability: 0.14182975888252258
Topic 2 - Probability: 0.8581702709197998

Document 8:
Topic 1 - Probability: 0.13258852064609528
Topic 2 - Probability: 0.8674114942550659

Document 9:
Topic 1 - Probability: 0.8889032602310181
Topic 2 - Probability: 0.11109672486782074



In [23]:
document_topics

[[(0, 0.8138322), (1, 0.18616776)],
 [(0, 0.85433453), (1, 0.14566553)],
 [(0, 0.88408643), (1, 0.11591358)],
 [(0, 0.852653), (1, 0.14734697)],
 [(0, 0.17679802), (1, 0.823202)],
 [(0, 0.8209992), (1, 0.17900078)],
 [(0, 0.14182976), (1, 0.8581703)],
 [(0, 0.13258852), (1, 0.8674115)],
 [(0, 0.88890326), (1, 0.111096725)]]

In [24]:
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    filtered_tokens = [token for token in tokens if token not in stop_words and token not in punctuation]
    
    return filtered_tokens

def perform_tf_idf_lda_topic_modeling(documents, num_topics):
    # Preprocess the documents
    processed_documents = [preprocess_text(doc) for doc in documents]
    
    # Create a dictionary from the processed documents
    dictionary = corpora.Dictionary(processed_documents)
    
    # Create a corpus (vector representation of the documents)
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]
    
    # Compute TF-IDF weights for the corpus
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    # Build the LDA model using TF-IDF weights
    lda_model = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=10)
    
    # Get the topics and their keywords
    topics = []
    for topic_id, topic in lda_model.print_topics(num_topics=num_topics):
        keywords = topic.split('"')[1::2]
        topics.append(keywords)
    
    return topics

# Example usage
documents = [
    "I love to play soccer",
    "Soccer is a popular sport",
    "I enjoy watching soccer matches",
    "Basketball is also a great sport",
    "I prefer soccer over basketball",
    "Music is my passion",
    "I play the guitar and piano",
    "Singing brings me joy",
    "I listen to different genres of music"
]

num_topics = 2

topics = perform_tf_idf_lda_topic_modeling(documents, num_topics)
for topic_id, keywords in enumerate(topics):
    print(f"Topic {topic_id+1}: {keywords}")


Topic 1: ['play', 'passion', 'love', 'piano', 'guitar', 'music', 'brings', 'joy', 'singing', 'soccer']
Topic 2: ['basketball', 'sport', 'soccer', 'prefer', 'popular', 'also', 'great', 'enjoy', 'watching', 'matches']


In [26]:
import pandas as pd
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess

import pandas as pd

# Create a sample DataFrame
df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'text': [
        'This is the first document.',
        'The second document is here.',
        'And this is the third document.',
        'Another document for analysis.',
        'The last document in the set.'
    ]
})

# Print the DataFrame
print(df)
# Preprocess the text data
texts = [simple_preprocess(text) for text in df['text']]

# Create a dictionary from the preprocessed texts
dictionary = Dictionary(texts)

# Create a Bag-of-Words representation for each document
corpus = [dictionary.doc2bow(text) for text in texts]

# Define the number of topics for topic modeling
num_topics = 5

# Train the LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)

# Get the topic distribution for each document
topic_predictions = [lda_model[doc] for doc in corpus]

# Function to get the most probable topic for a document
def get_most_probable_topic(topic_distribution):
    return max(topic_distribution, key=lambda item: item[1])[0]

# Add a new column to the DataFrame with the assigned topics
df['topics'] = [get_most_probable_topic(topic_dist) for topic_dist in topic_predictions]

# Print the DataFrame with assigned topics
print(df)


   id                             text
0   1      This is the first document.
1   2     The second document is here.
2   3  And this is the third document.
3   4   Another document for analysis.
4   5    The last document in the set.
   id                             text  topics
0   1      This is the first document.       2
1   2     The second document is here.       2
2   3  And this is the third document.       2
3   4   Another document for analysis.       1
4   5    The last document in the set.       3
