<a href="https://colab.research.google.com/github/Heather-Marsh/move/blob/main/Heather_Marsh_HW_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Document Filtering

This notebook follows Chapter 6 of *Programming Collective Intelligence*; the code is adapted from:
* https://github.com/arthur-e/Programming-Collective-Intelligence/tree/master/chapter6
* https://go.oreilly.com/old-dominion-university/library/view/programming-collective-intelligence/9780596529321/

**Goal:** Classify email as spam or not spam.

**Implemented Example:** Classify a given document as "bad" (spam) or "good" (not spam).

## General Functions

In [None]:
import sqlite3 as sqlite   # replaces import stmt from book
import re
import math

In [None]:
def getwords(doc):
  """Extract the distinct lowercase words of a document as a feature dict.

  The text is split on runs of non-word characters (anything other than
  letters, digits and underscore). Only tokens whose length is between
  3 and 19 characters inclusive are kept.

  Args:
    doc: The document text (a string).

  Returns:
    A dict mapping each distinct lowercased token to 1. Iterating over it
    yields every feature exactly once, however often it occurs in `doc`.
  """
  # Raw string: '\W' inside a plain literal is an invalid escape sequence
  # (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
  # re.split uses the module's pattern cache, so the pattern is not
  # recompiled on every call.
  tokens = re.split(r'\W+', doc)

  # Length is tested on the raw token; lowercasing happens afterwards.
  return {tok.lower(): 1 for tok in tokens if 2 < len(tok) < 20}

## Basic Classifier

In [None]:
class classifier:
  """Base classifier that tallies feature and category counts in memory.

  Training data is kept in two dicts:
    fc: feature -> {category -> number of trained items in that category
        that contained the feature}
    cc: category -> number of trained items in that category
  """

  def __init__(self,getfeatures,filename=None):
    """Create an untrained classifier.

    Args:
      getfeatures: Callable that takes an item and returns an iterable of
        its features (for example `getwords`).
      filename: Accepted for signature compatibility; this in-memory
        implementation does not use it.
    """
    # Counts of feature/category combinations
    self.fc={}
    # Counts of documents in each category
    self.cc={}
    self.getfeatures=getfeatures

  def incf(self,f,cat):
    """Increase the count of the feature/category pair (f, cat) by one."""
    per_cat=self.fc.setdefault(f, {})
    per_cat[cat]=per_cat.get(cat, 0)+1

  def incc(self,cat):
    """Increase the item count of category `cat` by one."""
    self.cc[cat]=self.cc.get(cat, 0)+1

  def fcount(self,f,cat):
    """Return, as a float, how many times feature `f` appeared in `cat`.

    Unknown features or categories yield 0.0.
    """
    return float(self.fc.get(f, {}).get(cat, 0))

  def catcount(self,cat):
    """Return, as a float, the number of items trained in category `cat`.

    Unknown categories yield 0.0, so the return type is always float,
    the same as `fcount`.
    """
    return float(self.cc.get(cat, 0))

  def totalcount(self):
    """Return the total number of trained items across all categories."""
    return sum(self.cc.values())

  def categories(self):
    """Return a view of all categories seen during training."""
    return self.cc.keys()

  def train(self,item,cat):
    """Record `item` as one example of category `cat`.

    Every feature returned by `getfeatures(item)` has its count for `cat`
    incremented once, then the category's item count is incremented.
    """
    features=self.getfeatures(item)
    # Increment the count for every feature with this category
    for f in features:
      self.incf(f,cat)

    # Increment the count for this category
    self.incc(cat)

  def fprob(self,f,cat):
    """Return the fraction of items in `cat` that contained feature `f`.

    Returns 0.0 when nothing has been trained for `cat`, which avoids
    dividing by zero.
    """
    items_in_cat=self.catcount(cat)
    if items_in_cat==0: return 0.0

    # The total number of times this feature appeared in this
    # category divided by the total number of items in this category
    return self.fcount(f,cat)/items_in_cat

  def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
    """Blend `prf(f, cat)` with an assumed prior probability.

    Args:
      f: The feature.
      cat: The category.
      prf: Callable `(f, cat) -> probability`, e.g. `self.fprob`.
      weight: How many observations the assumed probability counts for.
      ap: Assumed probability used when the feature has little or no data.

    Returns:
      (weight*ap + totals*prf(f, cat)) / (weight + totals), where `totals`
      is the feature's count summed over every category. A feature that
      was never seen therefore gets exactly `ap`.
    """
    # Calculate current probability
    basicprob=prf(f,cat)

    # Count the number of times this feature has appeared in
    # all categories
    totals=sum(self.fcount(f,c) for c in self.categories())

    # Calculate the weighted average
    return ((weight*ap)+(totals*basicprob))/(weight+totals)

## Naive Bayes Classifier

In [None]:
class naivebayes(classifier):
  """Naive Bayes classifier built on the counts kept by `classifier`.

  A document's score for a category is the product of the weighted
  probabilities of its features in that category, multiplied by the
  fraction of trained items that belong to the category.
  """

  def __init__(self,getfeatures):
    """Create an untrained classifier with no per-category thresholds."""
    classifier.__init__(self,getfeatures)
    # category -> threshold used by classify(); see getthreshold()
    self.thresholds={}

  def docprob(self,item,cat):
    """Return the product of weightedprob(f, cat) over the item's features.

    An item with no features yields 1.
    """
    features=self.getfeatures(item)

    # Multiply the probabilities of all the features together
    p=1
    for f in features: p*=self.weightedprob(f,cat,self.fprob)
    return p

  def prob(self,item,cat):
    """Return docprob(item, cat) multiplied by the category's share of items.

    Returns 0.0 for an untrained classifier instead of dividing by zero.
    """
    total=self.totalcount()
    if total==0: return 0.0
    catprob=self.catcount(cat)/total
    docprob=self.docprob(item,cat)
    return docprob*catprob

  def setthreshold(self,cat,t):
    """Set the threshold for category `cat` to `t`."""
    self.thresholds[cat]=t

  def getthreshold(self,cat):
    """Return the threshold for `cat`, or 1.0 if none was set."""
    if cat not in self.thresholds: return 1.0
    return self.thresholds[cat]

  def classify(self,item,default=None):
    """Return the best category for `item`, or `default` if none qualifies.

    The best category is the one with the highest `prob`. It is returned
    only if its probability is at least getthreshold(best) times the
    probability of every other category.

    `default` is also returned when no category scores above zero. That
    happens when nothing has been trained, and when the product of many
    small feature probabilities underflows to 0.0 for every category.
    """
    probs={}
    # Find the category with the highest probability. `best` starts as
    # None so the no-winner case is detected instead of raising
    # UnboundLocalError further down.
    best=None
    best_prob=0.0
    for cat in self.categories():
      probs[cat]=self.prob(item,cat)
      if probs[cat]>best_prob:
        best_prob=probs[cat]
        best=cat

    if best is None: return default

    # Make sure the probability exceeds threshold*next best
    for cat in probs:
      if cat==best: continue
      if probs[cat]*self.getthreshold(best)>probs[best]: return default
    return best

# Homework 7 - Email Classification

In [None]:
def _read_text(path):
  """Return the full contents of the text file at `path`.

  The file is opened with errors='ignore', so bytes that cannot be
  decoded are dropped instead of raising. The `with` block closes the
  handle as soon as the text has been read.
  """
  with open(path, 'r', errors='ignore') as handle:
    return handle.read()


def _read_numbered(prefix, count):
  """Read `<prefix>1.txt` .. `<prefix><count>.txt`, in order, into a list."""
  return [_read_text('{}{}.txt'.format(prefix, i)) for i in range(1, count + 1)]


# The individual names (btr1, gtr1, bte1, gte1, ...) are kept because other
# cells refer to them directly.

#bad training
(btr1, btr2, btr3, btr4, btr5,
 btr6, btr7, btr8, btr9, btr10,
 btr11, btr12, btr13, btr14, btr15,
 btr16, btr17, btr18, btr19, btr20) = _read_numbered('bTrain', 20)

#good training
(gtr1, gtr2, gtr3, gtr4, gtr5,
 gtr6, gtr7, gtr8, gtr9, gtr10,
 gtr11, gtr12, gtr13, gtr14, gtr15,
 gtr16, gtr17, gtr18, gtr19, gtr20) = _read_numbered('gTrain', 20)

#bad test
bte1, bte2, bte3, bte4, bte5 = _read_numbered('bTest', 5)

#good test
gte1, gte2, gte3, gte4, gte5 = _read_numbered('gTest', 5)


In [None]:
def spamTrain(cl):
  """Train `cl` on the 40 preloaded training emails.

  The 20 good emails (gtr1..gtr20) are fed first with the label 'good',
  followed by the 20 bad emails (btr1..btr20) with the label 'bad'.

  Args:
    cl: Any object exposing `train(item, cat)`.
  """
  good_docs = (gtr1, gtr2, gtr3, gtr4, gtr5,
               gtr6, gtr7, gtr8, gtr9, gtr10,
               gtr11, gtr12, gtr13, gtr14, gtr15,
               gtr16, gtr17, gtr18, gtr19, gtr20)
  for doc in good_docs:
    cl.train(doc, 'good')

  bad_docs = (btr1, btr2, btr3, btr4, btr5,
              btr6, btr7, btr8, btr9, btr10,
              btr11, btr12, btr13, btr14, btr15,
              btr16, btr17, btr18, btr19, btr20)
  for doc in bad_docs:
    cl.train(doc, 'bad')

In [None]:
# Sanity check on the basic classifier: train it, then report how many
# documents were counted in total and per category.
cl = classifier(getwords)
spamTrain(cl)
print("")
print("Total items:", cl.totalcount())
print("Categories:", cl.categories())
# catcount() returns a float for known categories, so counts print as e.g. 20.0
for cat in cl.categories():
  print(cat, cl.catcount(cat))


Total items: 40
Categories: dict_keys(['good', 'bad'])
good 20.0
bad 20.0


In [None]:
# Rebind `cl` to a naive Bayes classifier and train it on the same training
# emails; the classify() cells below use this instance.
cl = naivebayes(getwords)
spamTrain(cl)

In [None]:
# Good test email 1 (read from gTest1.txt). classify() returns the
# `default` value when the best category fails its threshold check.
cl.classify(gte1, default='unknown')

'good'

In [None]:
# Good test email 2 (read from gTest2.txt).
cl.classify(gte2, default='unknown')

'good'

In [None]:
# Good test email 3 (read from gTest3.txt).
cl.classify(gte3, default='unknown')

'good'

In [None]:
# Good test email 4 (read from gTest4.txt).
cl.classify(gte4, default='unknown')

'good'

In [None]:
# Good test email 5 (read from gTest5.txt).
cl.classify(gte5, default='unknown')

'good'

In [None]:
# Bad test email 1 (read from bTest1.txt).
cl.classify(bte1, default='unknown')

'bad'

In [None]:
# Bad test email 2 (read from bTest2.txt).
cl.classify(bte2, default='unknown')

'bad'

In [None]:
# Bad test email 3 (read from bTest3.txt).
cl.classify(bte3, default='unknown')

'bad'

In [None]:
# Bad test email 4 (read from bTest4.txt).
cl.classify(bte4, default='unknown')

'bad'

In [None]:
# Bad test email 5 (read from bTest5.txt).
cl.classify(bte5, default='unknown')

'bad'