In [1]:
import numpy as np
import pandas as pd
import nltk
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sent_tokenize

import os
import re

In [2]:
filenames, file_contents, year, month, location = [],[],[],[],[]

for filename in os.listdir("D:\Data's\Donald Trump"):
    filenames.append(filename)

    with open(f"D:\\Data's\\Donald Trump\\{filename}", encoding='utf-8', errors='ignore') as f:
        file_contents.append(f.read())
    
    year.append(filename[-8:-4])
    x = re.search("([A-z]+)([A-z]{3})([0-9]+)\_", filename)
    month.append(x.group(2))
    location.append(''.join(map(lambda x: x if x.islower() else " "+x, x.group(1)))) 

In [3]:
df = pd.DataFrame({
    'Location': location,
    'Month': month,
    'Year':year,
    'filename':filenames,
    'content': file_contents
})

In [4]:
df.shape

(35, 5)

In [5]:
df.head(2)

Unnamed: 0,Location,Month,Year,filename,content
0,Battle Creek,Dec,2019,BattleCreekDec19_2019.txt,Thank you. Thank you. Thank you to Vice Presid...
1,Bemidji,Sep,2020,BemidjiSep18_2020.txt,There's a lot of people. That's great. Thank y...


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
stopwords = list (STOP_WORDS)
punctuation += '\n'
from heapq import nlargest
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

def WordFrequencyCalculator(Text):
    mWordFrequencies ={}
    words = Text.split() # split the text into words
    for word in words:
        wordInLowerCase = word.lower() 
            
        if (wordInLowerCase not in stopwords) and (wordInLowerCase not in punctuation):
            if wordInLowerCase not in mWordFrequencies.keys():
                mWordFrequencies[wordInLowerCase] = 1
            else:
                mWordFrequencies[wordInLowerCase] += 1 # finding the freq of each word in the text
    
    return mWordFrequencies

def CalSentScore(content):
    doc = nlp(content)
    sentences = [sent for sent in doc.sents]
    mNumSentences = len(sentences)
    
    mWordFrequencies = WordFrequencyCalculator(content)
    mSentScore = {}
    
    for sent in sentences:
        for word in sent:
            wordInLowerCase = word.text.lower()
            
            if wordInLowerCase in mWordFrequencies.keys():
                if sent not in mSentScore.keys():
                    mSentScore[sent] = mWordFrequencies[wordInLowerCase]
                else:
                    mSentScore[sent] += mWordFrequencies[wordInLowerCase]
    
    return mSentScore, mNumSentences

def SummarizeMyText(content, fractionToReduce=0.01):
    mSentScore, mNumSentences = CalSentScore(content)
        
    reducedSentNum = int(mNumSentences * fractionToReduce)
    
        
    summaryList = nlargest(reducedSentNum, mSentScore, key=mSentScore.get)
    summary = ' '.join([sent.text for sent in summaryList])
    #for sent in summaryList:
        #print(sent, end='')
    return summary
def process_batch(batch):
    summaries = []
    for text in batch:
        summary = SummarizeMyText(text)
        summaries.append(summary)
    return summaries


# Create dataset and dataloader
dataset = df['content']
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

# Process texts using the dataloader
summaries = []    #concurrent.futures module to parallelize the batch processing of texts. Th
with ThreadPoolExecutor() as executor: #This allows multiple batches to be processed simultaneously
    for batch in dataloader: #, leveraging multiple CPU cores.
        result = executor.submit(process_batch, batch)
        summaries.extend(result.result())


In [7]:
df['summaries'] = summaries

In [8]:
df.head()

Unnamed: 0,Location,Month,Year,filename,content,summaries
0,Battle Creek,Dec,2019,BattleCreekDec19_2019.txt,Thank you. Thank you. Thank you to Vice Presid...,"We need more people coming into the country, a..."
1,Bemidji,Sep,2020,BemidjiSep18_2020.txt,There's a lot of people. That's great. Thank y...,"With your help, your devotion and your drive, ..."
2,Charleston,Feb,2020,CharlestonFeb28_2020.txt,Thank you. Thank you. Thank you. All I can say...,"One of the things I asked them, and I've been ..."
3,Charlotte,Mar,2020,CharlotteMar2_2020.txt,"I want to thank you very much. North Carolina,...","A lot of people don't even know that, but that..."
4,Cincinnati,Aug,2019,CincinnatiAug1_2019.txt,Thank you all. Thank you very much. Thank you ...,Do you think they're going to put great citize...


In [9]:
df.shape

(35, 6)

# Using BART model for Summarizing

In [12]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
from concurrent.futures import ThreadPoolExecutor, as_completed
from torch.utils.data import DataLoader, Dataset
import torch

# Initialize the tokenizer and model for DistilBART
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
device = torch.device("cpu")
model.to(device)

# Function to split text into chunks of a specified max length
def split_text(text, max_length=1024):
    tokens = tokenizer.encode(text)
    return [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]

# Custom dataset to handle the text data
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        chunks = split_text(text, max_length=self.max_length)
        return chunks

def collate_fn(batch):
    flat_list = [item for sublist in batch for item in sublist]
    return torch.nn.utils.rnn.pad_sequence([torch.tensor(chunk) for chunk in flat_list], batch_first=True, padding_value=tokenizer.pad_token_id).to(device)

def summarize_chunk(chunk):
    summary_ids = model.generate(chunk.unsqueeze(0), max_length=100, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


# Parameters
batch_size = 1
max_length = 1024

# Create the dataset and dataloader
dataset = TextDataset(df['content'], tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

def process_batches(dataloader):
    summaries = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_batch = {executor.submit(summarize_chunk, chunk): chunk for batch in dataloader for chunk in batch}
        for future in as_completed(future_to_batch):
            summaries.append(future.result())
    return summaries

# Process the DataLoader in batches
summaries = process_batches(dataloader)

# Combine summaries for each original text
final_summaries = []
current_summary = []
chunk_counter = 0
for idx in range(len(df)):
    chunks = split_text(df['content'][idx], max_length)
    num_chunks = len(chunks)
    for _ in range(num_chunks):
        current_summary.append(summaries[chunk_counter])
        chunk_counter += 1
    final_summaries.append(" ".join(current_summary))
    current_summary = []

# Add summaries to the DataFrame
df['summary'] = final_summaries

# Post-processing to ensure summaries are single-line
df['summary'] = df['summary'].apply(lambda x: ' '.join(x.split()))
print(df.head())


Token indices sequence length is longer than the specified maximum sequence length for this model (22894 > 1024). Running this sequence through the model will result in indexing errors


        Location Month  Year                   filename  \
0   Battle Creek   Dec  2019  BattleCreekDec19_2019.txt   
1        Bemidji   Sep  2020      BemidjiSep18_2020.txt   
2     Charleston   Feb  2020   CharlestonFeb28_2020.txt   
3      Charlotte   Mar  2020     CharlotteMar2_2020.txt   
4     Cincinnati   Aug  2019    CincinnatiAug1_2019.txt   

                                             content  \
0  Thank you. Thank you. Thank you to Vice Presid...   
1  There's a lot of people. That's great. Thank y...   
2  Thank you. Thank you. Thank you. All I can say...   
3  I want to thank you very much. North Carolina,...   
4  Thank you all. Thank you very much. Thank you ...   

                                           summaries  \
0  We need more people coming into the country, a...   
1  With your help, your devotion and your drive, ...   
2  One of the things I asked them, and I've been ...   
3  A lot of people don't even know that, but that...   
4  Do you think they're goin

In [13]:
df['summary'][1]

'Donald Trump says the media is making things up about Social Security. He says he\'s never seen anything like it: "I\'m the one protecting your Social Security. They\'re going to destroy your social Security" The former New York Gov. said he doesn\'t believe in polls, but poll everything nowadays. President Trump says he\'s thrilled to be here with the \'beautiful, great, hardworking people of this incredible state\' 46 days from now, we\'re going to win Minnesota, and we\'re winning four more years in the White House, he says. Trump: "I\'ve been watching it for years. They haven\'t treated you right." The New York Times reporter spent $2,000,000 on a campaign against Joe Crowley, who was supposed to be speaker of the house. Donald Trump says he\'s not fit to be your President, but he\'s mentally fit. The narrative seems to be Russia, he says, but the FBI has started going Russia. President says Hillary Clinton deleted 33,000 emails, acid washed her phones and acid washed them. He say

In [17]:
df['summaries'][1]

'With your help, your devotion and your drive, we are going to keep on working, we\'re going to keep on fighting, and we are going to keep on winning, winning, winning. Minnesota is going to keep on winning and you\'re going to get tired of winning because Minnesota doesn\'t want to win all the time. We\'re rounding it with or without, but we\'re going to have with, but you know, with his better, but with or without, we\'re rounding that corner, and we\'re rounding it fast and they don\'t want to give us any credit because they want to keep this up \'til November 4th. And you know he\'s got a lot of people that are deep staters, whatever you want to call them. All the guys that got it right, that really got it right, they all got it wrong, totally wrong. We\'re going to keep on going, Jason, we\'re going to keep on winning. But they did it, and we\'ve done a great job and the people of that great state, they understand it, and I just hope the people of your state of Minnesota understan