In [20]:
import csv
from collections import Counter
import requests

# Download the list of stopwords
stopwords_url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt"
response = requests.get(stopwords_url)
stopwords = set(response.text.splitlines())

# Load your existing all_words.csv
word_counts = Counter()
with open("all_words.csv", "r", newline="", encoding="utf-8-sig") as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)  # Skip the header row
    for word, count in csvreader:
        word_counts[word] = int(count)

# Filter out stopwords and calculate total non-stopword frequency
total_non_stopword_frequency = 0
non_stopword_counts = Counter()

for word, count in word_counts.items():
    if word.lower() not in stopwords:  # Convert word to lowercase for comparison
        non_stopword_counts[word] = count
        total_non_stopword_frequency += count

# Create all_ns_words.csv with non-stopwords and their frequencies and probabilities
with open("all_ns_words.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["Word", "Frequency", "Probability"])
    for word, count in sorted(non_stopword_counts.items()):
        probability = int(count) / total_non_stopword_frequency
        csvwriter.writerow([word, count, probability])


In [21]:
# Load the all_ns_words.csv file
non_stopwords = set()

with open("all_ns_words.csv", "r", newline="", encoding="utf-8-sig") as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)  # Skip the header row
    for row in csvreader:
        word = row[0]
        non_stopwords.add(word)

# Count the number of unique non-stop words
unique_non_stopword_count = len(non_stopwords)

# Print the result
print(f"Number of unique non-stop words: {unique_non_stopword_count}")


Number of unique non-stop words: 3123


In [23]:
# Load the data from all_ns_words.csv
least_probable_words = []
least_probability = float('inf')  # Initialize with a large value

with open("all_ns_words.csv", "r", newline="", encoding="utf-8-sig") as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)  # Skip the header row

    for row in csvreader:
        word, _, probability = row
        probability = float(probability)

        if probability < least_probability:
            least_probable_words = [word]
            least_probability = probability
        elif probability == least_probability:
            least_probable_words.append(word)

#Print the least Probablility
print("Least Probability:", least_probability)

# Print the least probable words 
print("Least Probable Word(s):")
for word in least_probable_words:
    print(word)



Least Probability: 7.450454477723141e-05
Least Probable Word(s):
1500
15961650
16461716
16851753
171176
17241804
17701831
1912
1A
1B
1D
1E2
1E3
1E4
1E5
1E6
1F
1F1
1F2
1F4
1F5
1F6
20
2001
2004
2019
30
50
5000
501c3
5827
5961887
60
646221541
801
809
84116
ACQUAINTANCE
ACTUAL
AGREEMENT
APPEARANCE
Acquaintance
Additional
Atheists
Author
Awareness
BIBLIOGRAPHICAL
Bannerman
Bismarcks
Books
Bradley
British
Broadly
CONSEQUENTIAL
CONTRACT
Campbell
Cantor
City
Cogito
Compliance
Considered
Contact
Continental
Contributions
Copyright
Creating
DAMAGE
DESCRIPTION
DIRECT
DISCLAIMER
DISTRIBUTE
DISTRIBUTOR
DONATIONS
December
Defect
Degrees
Derivative
Domestic
EIN
ERROR
EXISTENCE
EXPRESS
Earth
East
Email
England
English
Enquiry
Ethics
Euclids
Europe
FALSEHOOD
FITNESS
FOUNDATION
Food
Future
GUTENBERG™
Georg
George
Gilbert
Gordon
Granted
Greeks
Gutenbergs
Gutenberg™’s
Hart
Hegels
Henry
Human
Humemaintained
Humes
IDEALISM
III
IMPLIED
INCIDENTAL
INCLUDING
INDEMNITY
INDIRECT
INDUCTION
INTUITIVE
IRS
Idealists

In [24]:
# Define the sentences
sentence_a = "If a belief is true, it can be deduced it is universal."
sentence_b = "Criticism of knowledge is counter to scientific results."

# Create a dictionary to store word probabilities
word_probabilities = {}

# Open the all_ns_words.csv file and read word probabilities
with open("all_ns_words.csv", "r", newline="", encoding="utf-8-sig") as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)  # Skip the header row
    for row in csvreader:
        word, _, probability = row
        word_probabilities[word] = float(probability)

# Function to calculate the total probability of a sentence
def calculate_sentence_probability(sentence):
    # Tokenize the sentence and calculate total probability
    words = sentence.split()
    total_probability = sum(word_probabilities.get(word.lower(), 0.0) for word in words)
    return total_probability

# Calculate the total probabilities for both sentences
probability_sentence_a = calculate_sentence_probability(sentence_a)
probability_sentence_b = calculate_sentence_probability(sentence_b)

# Compare and print the results
print("Total Probability for Sentence A:", probability_sentence_a)
print("Total Probability for Sentence B:", probability_sentence_b)

# Determine which sentence is more likely
if probability_sentence_a > probability_sentence_b:
    print("Sentence A is more likely.")
elif probability_sentence_b > probability_sentence_a:
    print("Sentence B is more likely.")
else:
    print("Both sentences have the same total probability.")


Total Probability for Sentence A: 0.008791536283713307
Total Probability for Sentence B: 0.02309640888094174
Sentence B is more likely.
