In [26]:
import pandas as pd
from collections import Counter
import requests
from bs4 import BeautifulSoup

# Location of the tweet dataset on disk: <directory><file>.csv
directory = "../datasets/"
file = "second-debate"

# With chunksize set, read_csv hands back a reader that yields the CSV
# 2000 rows at a time instead of loading the whole file at once.
dataset_path = f"{directory}{file}.csv"
elections_dataset = pd.read_csv(dataset_path, chunksize=2000)


In [27]:
# Scrape a fact-checking web page to use as context.

fact_check_url = "https://www.factcheck.org/2016/07/clintons-handling-of-classified-information/"
# Always pass a timeout: without one, requests may wait indefinitely on a stalled server.
response = requests.get(fact_check_url, timeout=30)
response.raise_for_status()  # Fail loudly on HTTP error responses

soup = BeautifulSoup(response.text, "html.parser")

scrapped_data = {}

# find() returns None when the tag is absent, so check before reading .text.
title_tag = soup.find("h1")
if title_tag is None:
    raise ValueError(f"No <h1> title found at {fact_check_url}")
title = title_tag.text
scrapped_data["title"] = title

# Change this to match the article container class of the site being scraped.
site_class = "entry-content"

div = soup.find("div", {"class": site_class})
if div is None:
    raise ValueError(f"No <div class={site_class!r}> found at {fact_check_url}")
info = div.find_all("p")

# Text of every <p> inside the article container, in document order.
content = [p.text for p in info]

scrapped_data["content"] = content

print(scrapped_data)


{'title': 'Clinton’s Handling of Classified Information', 'content': ['An FBI investigation into former Secretary of State Hillary Clinton’s mishandling of classified information resulted in no criminal charges, but it revealed that Clinton and her campaign made statements in the past about her email use that have turned out to be false or misleading:', 'As we did in May, when the State Department’s inspector general issued a report on Clinton’s unusual email arrangement, we will take a look at past statements Clinton has made about her personal emails and private server and how they square with the results of the FBI investigation announced on July 5.', 'Clinton, the presumptive Democratic presidential nominee, has denied mishandling classified information ever since the\xa0New York Times\xa0on March 2, 2015, disclosed that Clinton “exclusively used a personal email account to conduct government business as secretary of state.”', 'At a March 10 press conference, Clinton addressed her 

In [42]:
# Text normalization of the scraped data
import re  # For regular expressions
import nltk  # Natural Language Toolkit (if you need stop words or advanced features)

# Download the NLTK resource packages used by this notebook
# (punkt is the tokenizer data, stopwords the stop-word lists).
for nltk_package in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

# Paragraph texts stored under the "content" key by the scraping cell
content = scrapped_data["content"]


_STOP_WORDS = None  # lazily-built cache of NLTK's English stop words


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOP_WORDS


def normalize_text(text):
    """Lowercase `text`, strip punctuation, and drop English stop words.

    Args:
        text: The string to normalize.

    Returns:
        The remaining tokens joined by single spaces.

    Note:
        Punctuation is stripped before the stop-word filter runs, so
        contractions lose their apostrophe first (e.g. "i'm" becomes "im")
        and are only filtered out if that stripped form is itself in the
        stop-word list.
    """
    # 1. Lowercasing
    text = text.lower()

    # 2. Punctuation removal: delete everything that is neither a word
    #    character nor whitespace
    text = re.sub(r"[^\w\s]", "", text)

    # 3. Tokenization
    tokens = nltk.word_tokenize(text)

    # 4. Stop-word removal (the stop-word set is cached, so the corpus is not
    #    re-read on every call)
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Recombine the surviving tokens into a single string
    return " ".join(tokens)

# Normalize every scraped paragraph, keeping the original paragraph order
normalized_content = [normalize_text(para) for para in content]

print(normalized_content)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jerricchan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jerricchan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jerricchan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jerricchan/nltk_data...


['fbi investigation former secretary state hillary clintons mishandling classified information resulted criminal charges revealed clinton campaign made statements past email use turned false misleading', 'may state departments inspector general issued report clintons unusual email arrangement take look past statements clinton made personal emails private server square results fbi investigation announced july 5', 'clinton presumptive democratic presidential nominee denied mishandling classified information ever since new york times march 2 2015 disclosed clinton exclusively used personal email account conduct government business secretary state', 'march 10 press conference clinton addressed unusual email arrangement office time said dec 5 2014 gave state department 30490 printed copies workrelated emails clinton said none contained classified information', 'clinton march 10 2015 email classified material anyone email classified material im certainly wellaware classification requirements

In [43]:
# Tally how often each word occurs across all normalized paragraphs
word_counts = Counter(
    word
    for paragraph in normalized_content
    for word in paragraph.split()
)

# One row per word (the word is the index), sorted by count in descending order
keywords_df = pd.DataFrame.from_dict(
    dict(word_counts), orient='index', columns=['Count']
).sort_values(by='Count', ascending=False)

In [None]:
# with open('keywords.csv', 'w', newline='') as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerow(['Keyword'])  # Header row
#     for keyword in keyword_list:
#         writer.writerow([keyword])


In [85]:
from nltk.stem import WordNetLemmatizer
import csv

lemmatizer = WordNetLemmatizer()  # NOTE(review): not used anywhere in this cell


keyword_list = keywords_df.index.tolist()
# keyword_list = [word for word in keyword_list if word.lower() not in words_to_remove]
related_tweets = []

iterations = 1000  # stop once this many related tweets have been collected
passed = 0         # number of dataset rows examined
count = 0          # number of related tweets collected


def _first_matching_keyword(message, keywords):
    """Return the first keyword (in list order) that equals a lowercased,
    whitespace-delimited token of `message`, or None if there is no match.

    Non-string messages (e.g. NaN for a missing text field) never match.
    """
    if not isinstance(message, str):
        return None
    tokens = {word.lower() for word in message.split()}
    for keyword in keywords:
        if keyword in tokens:
            return keyword
    return None


# NOTE: elections_dataset is a chunked reader, so it can be iterated only
# once; re-running this cell without re-creating the reader finds nothing.
for chunk in elections_dataset:
    for tweet_type, message in zip(chunk['tweet_type'], chunk['text']):
        passed += 1
        if tweet_type != "retweet":
            keyword = _first_matching_keyword(message, keyword_list)
            if keyword is not None:
                # Each tweet is recorded at most once, even when the matching
                # keyword occurs several times in it.
                print(keyword)
                related_tweets.append(message)
                count += 1

        # ">=" rather than "==" so the limit cannot be stepped over.
        if count >= iterations:
            break

    if count >= iterations:
        break

print(passed)
print(len(related_tweets))

went
handling
may
clinton
names
debate
case
used
may
going
take
march
us
made
hillary
say
public
nothing
first
dumped
say
us
clinton
never
never
hillary
could
hillary
time
going
one
clinton
us
two
clinton
hand
going
least
us
hillary
matter
one
people
clinton
say
least
private
10
case
take
know
hand
released
told
time
time
instead
going
debate
debate
im
hand
could
fact
going
go
clinton
could
time
hillary
hillary
even
4
clinton
people
back
us
debate
never
people
debate
10
people
take
debate
time
top
news
even
hillary
hillary
one
look
clinton
may
us
never
clinton
hand
hand
last
case
anything
r
presidential
debate
us
hillary
hand
business
one
top
2
one
news
hillary
benghazi
presidential
earlier
emails
campaign
hand
us
time
hillary
still
two
well
hillary
clinton
millions
emails
personal
hillary
way
new
time
state
people
nation
go
people
case
4
going
thought
debate
reasonable
4
end
campaign
done
personal
7
well
happened
one
time
back
way
even
two
brought
clinton
closed
first
one
made
thats
e

In [86]:
# Collect every whitespace-delimited token that starts with "#" from the
# related tweets, lowercased, in order of appearance (duplicates are kept).
hashtags = [
    token.lower()
    for tweet in related_tweets
    for token in tweet.split()
    if token.startswith("#")
]

print(hashtags)

['#iamvoting4jillbecause', '#gogreen', '#steinbaraka', '#debatenight', '#jillstein', '#washudebate2016', '#debate', '#msnbc', '#cnn', '#foxnews', '#debates2016', '#girlonthetrain', '#debate', '#debatenight', '#fightfor15', '#debates2016', '#sexistpig', '#debates2016', '#anonymous', '#justiceforjuanita', '#debates', '#debate', '#debates2016', '#debate', '#hillary', '#makeamericagreatagain', '#politician', '#debates', '#putin', '#wikileaks', '#emails', '#debate', '#trumpforever', '#debates', '#maga', '#trumptrain', '#politician', '#debates', '#debate', '#debatenight', '#debates', '#debatenight', '#maga', '#debates2016', '#podestraemails', '#wikileaksrelease', '#debates2016', '#debates', '#vpdebate', '#nbc4dc', '#debate', '#debatenight', '#debate', '#debates2016', '#debates2016', '#debate', '#debatenight', '#debate', '#debates2016', '#debates2016', '#wolfblitzer', '#debatenight', '#debate', '#debate', '#debate', '#debates', '#pussygrab', '#debates2016', '#debates2016', '#debat…', '#debate