In [7]:
# This file contains the abstract class ClassificationMethod

class ClassificationMethod:
  """
  ClassificationMethod is the abstract superclass of 
   - MostFrequentClassifier
   - NaiveBayesClassifier
 
  As such, you need not add any code to this file.  You can write
  all of your implementation code in the files for the individual
  classification methods listed above.
  """
  def __init__(self, legalLabels):
    """
    For digits dataset, the set of legal labels will be 0,1,..,9
    For faces dataset, the set of legal labels will be 0 (non-face) or 1 (face)
    """
    self.legalLabels = legalLabels
    
    
  def train(self, trainingData, trainingLabels, validationData, validationLabels):
    """
    This is the supervised training function for the classifier.  Two sets of 
    labeled data are passed in: a large training set and a small validation set.
    
    Many types of classifiers have a common training structure in practice: using
    training data for the main supervised training loop but tuning certain parameters
    with a small held-out validation set.

    For some classifiers (naive Bayes), you will need to return the parameters' 
    values after traning and tuning step.
    
    To make the classifier generic to multiple problems, the data should be represented
    as lists of Counters containing feature descriptions and their counts.
    """
    abstract
    
  def classify(self, data):
    """
    This function returns a list of labels, each drawn from the set of legal labels
    provided to the classifier upon construction.

    To make the classifier generic to multiple problems, the data should be represented
    as lists of Counters containing feature descriptions and their counts.
    """
    abstract


In [8]:
# This file contains feature extraction methods and harness 
# code for data classification

import mostFrequent
import naiveBayes
import samples
import sys
import util

TEST_SET_SIZE = 100
DIGIT_DATUM_WIDTH=28
DIGIT_DATUM_HEIGHT=28
FACE_DATUM_WIDTH=60
FACE_DATUM_HEIGHT=70


def basicFeatureExtractorDigit(datum):
  """
  Returns a set of pixel features indicating whether
  each pixel in the provided datum is white (0) or gray/black (1)
  """
  a = datum.getPixels()

  features = util.Counter()
  for x in range(DIGIT_DATUM_WIDTH):
    for y in range(DIGIT_DATUM_HEIGHT):
      if datum.getPixel(x, y) > 0:
        features[(x,y)] = 1
      else:
        features[(x,y)] = 0
  return features


def analysis(classifier, guesses, testLabels, testData, rawTestData, printImage):
  """
  This function is called after learning.
  Include any code that you want here to help you analyze your results.
  
  Use the printImage(<list of pixels>) function to visualize features.
  
  An example of use has been given to you.
  
  - classifier is the trained classifier
  - guesses is the list of labels predicted by your classifier on the test set
  - testLabels is the list of true labels
  - testData is the list of training datapoints (as util.Counter of features)
  - rawTestData is the list of training datapoints (as samples.Datum)
  - printImage is a method to visualize the features 
  (see its use in the odds ratio part in runClassifier method)
  
  This code won't be evaluated. It is for your own optional use
  (and you can modify the signature if you want).
  """
  
  # Put any code here...
  # Example of use:
  for i in range(len(guesses)):
      prediction = guesses[i]
      truth = testLabels[i]
      if (prediction != truth):
          print("===================================")
          print("Mistake on example %d" % i) 
          print("Predicted %d; truth is %d" % (prediction, truth))
          print("Image: ")
          print(rawTestData[i])
          break


## =====================
## You don't have to modify any code below.
## =====================


class ImagePrinter:
    def __init__(self, width, height):
      self.width = width
      self.height = height

def default(str):
  return str + ' [Default: %default]'

def readCommand( argv ):
  "Processes the command used to run from the command line."
  from optparse import OptionParser  
  parser = OptionParser(USAGE_STRING)
  
  parser.add_option('-c', '--classifier', help=default('The type of classifier'), choices=['mostFrequent', 'nb', 'naiveBayes', 'perceptron', 'mira', 'minicontest'], default='mostFrequent')
  parser.add_option('-d', '--data', help=default('Dataset to use'), choices=['digits', 'faces'], default='digits')
  parser.add_option('-t', '--training', help=default('The size of the training set'), default=100, type="int")
  parser.add_option('-a', '--autotune', help=default("Whether to automatically tune hyperparameters"), default=False, action="store_true")
  parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"), default=3, type="int")

  options, otherjunk = parser.parse_args(argv)
  if len(otherjunk) != 0: raise Exception('Command line input not understood: ' + str(otherjunk))
  args = {}
  
  # Set up variables according to the command line input.
  print("Doing classification")
  print("--------------------")
  print("data:\t\t" + options.data)
  print("classifier:\t\t" + options.classifier)
  print("training set size:\t" + str(options.training))
  if(options.data=="digits"):
    printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    featureFunction = basicFeatureExtractorDigit    
  else:
    print("Unknown dataset", options.data)
    print(USAGE_STRING)
    sys.exit(2)
    
  if(options.data=="digits"):
    legalLabels = list(range(10))
  else:
    legalLabels = list(range(2))
    
  if options.training <= 0:
    print("Training set size should be a positive integer (you provided: %d)" % options.training)
    print(USAGE_STRING)
    sys.exit(2)

  if(options.classifier == "mostFrequent"):
    classifier = mostFrequent.MostFrequentClassifier(legalLabels)
  elif(options.classifier == "naiveBayes" or options.classifier == "nb"):
    classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    if (options.autotune):
        print("using automatic tuning for naivebayes")
        classifier.automaticTuning = True
  else:
    print("Unknown classifier:", options.classifier)
    print(USAGE_STRING)
    
    sys.exit(2)

  args['classifier'] = classifier
  args['featureFunction'] = featureFunction
  args['printImage'] = printImage
  
  return args, options

USAGE_STRING = """
  USAGE:      python dataClassifier.py <options>
  EXAMPLES:   (1) python dataClassifier.py
                  - trains the default mostFrequent classifier on the digit dataset
                  using the default 100 training examples and
                  then test the classifier on test data
                 """

# Main harness code

def runClassifier(args, options):

  featureFunction = args['featureFunction']
  classifier = args['classifier']
  printImage = args['printImage']
      
  # Load data  
  numTraining = options.training

  rawTrainingData = samples.loadDataFile("trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
  trainingLabels = samples.loadLabelsFile("traininglabels", numTraining)
  rawValidationData = samples.loadDataFile("validationimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
  validationLabels = samples.loadLabelsFile("validationlabels", TEST_SET_SIZE)
  rawTestData = samples.loadDataFile("testimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
  testLabels = samples.loadLabelsFile("testlabels", TEST_SET_SIZE)
    
  
  # Extract features
  print("Extracting features...")
  trainingData = list(map(featureFunction, rawTrainingData))
  validationData = list(map(featureFunction, rawValidationData))
  testData = list(map(featureFunction, rawTestData))
  
  # Conduct training and testing
  print("Training...")
  classifier.train(trainingData, trainingLabels, validationData, validationLabels)
  print("Validating...")
  guesses = classifier.classify(validationData)
  correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
  print(str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
  print("Testing...")
  guesses = classifier.classify(testData)
  correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
  print(str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)))
  analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

if __name__ == '__main__':
  # Read input
  args, options = readCommand( sys.argv[1:] ) 
  # Run classifier
  runClassifier(args, options)



Usage: 
  USAGE:      python dataClassifier.py <options>
  EXAMPLES:   (1) python dataClassifier.py
                  - trains the default mostFrequent classifier on the digit dataset
                  using the default 100 training examples and
                  then test the classifier on test data
                 

ipykernel_launcher.py: error: no such option: -f
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Ilan\anaconda3\lib\optparse.py", line 1387, in parse_args
    stop = self._process_args(largs, rargs, values)
  File "C:\Users\Ilan\anaconda3\lib\optparse.py", line 1431, in _process_args
    self._process_short_opts(rargs, values)
  File "C:\Users\Ilan\anaconda3\lib\optparse.py", line 1513, in _process_short_opts
    raise BadOptionError(opt)
optparse.BadOptionError: no such option: -f

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Ilan\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Ilan\AppData\Local\Temp\ipykernel_30496\2738267758.py", line 188, in <module>
    args, options = readCommand( sys.argv[1:] )
  File "C:\Users\Ilan\AppData\Local\Temp\ipykernel_30496\2738267758.py", line 93, in readCommand
    options, otherjunk = parser.parse_args(argv)
  

TypeError: object of type 'NoneType' has no len()

In [3]:
import util
import classificationMethod

class MostFrequentClassifier(classificationMethod.ClassificationMethod):
  """
  The MostFrequentClassifier is a very simple classifier: for
  every test instance presented to it, the classifier returns
  the label that was seen most often in the training data.
  """
  def __init__(self, legalLabels):
    self.guess = None
    self.type = "mostfrequent"
  
  def train(self, data, labels, validationData, validationLabels):
    """
    Find the most common label in the training data.
    """
    counter = util.Counter()
    counter.incrementAll(labels, 1)
    self.guess = counter.argMax()
  
  def classify(self, testData):
    """
    Classify all test data as the most common label.
    """
    return [self.guess for i in testData]

In [4]:
import util
import classificationMethod
import math
# imoprted to accsse the result of the computation
import dataClassifier


class NaiveBayesClassifier(classificationMethod.ClassificationMethod):
    """
  See the project description for the specifications of the Naive Bayes classifier.

  Note that the variable 'datum' in this code refers to a counter of features
  (not to a raw samples.Datum).
  """

    def __init__(self, legalLabels):
        self.legalLabels = legalLabels
        self.type = "naivebayes"
        self.k = 1  # this is the smoothing parameter, ** use it in your train method **
        self.automaticTuning = False  # Look at this flag to decide whether to choose k automatically ** use this in your train method **

    def setSmoothing(self, k):
        """
    This is used by the main method to change the smoothing parameter before training.
    Do not modify this method.
    """
        # 0 seems do give the best results
        self.k = 0

    def train(self, trainingData, trainingLabels, validationData, validationLabels):
        """
    Outside shell to call your method. Do not modify this method.
    """

        self.features = list(trainingData[0].keys())  # this could be useful for your code later...
        if (self.automaticTuning):
            kgrid = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50]
        else:
            kgrid = [self.k]

        self.trainAndTune(trainingData, trainingLabels, validationData, validationLabels, kgrid)


    def trainAndTune(self, trainingData, trainingLabels, validationData, validationLabels, kgrid):
        # A list that will save the results of the odds calculations
        odds = []
        # "Zip" the training label with the validation label
        for i, j in zip(trainingLabels, validationLabels):
            # If the validation label is not 0, append to the list the calculated odds
            if j != 0:
                odds.append(f'the odds for sample in position {trainingLabels[i]}, "{i}" is: {i / j:.2f}')
            # If the validation label is 0, "inf" will be appended to prevent devision by 0
            # which will result in an error.
            else:
                odds.append(float('inf'))
        print(odds)


        """
    Trains the classifier by collecting counts over the training data, and
    stores the Laplace smoothed estimates so that they can be used to classify.
    Evaluate each value of k in kgrid to choose the smoothing parameter
    that gives the best accuracy on the held-out validationData.

    trainingData and validationData are lists of feature Counters.  The corresponding
    label lists contain the correct label for each datum.

    To get the list of all possible features or labels, use self.features and
    self.legalLabels.
    """
        # Train the data with the every value of "k" from the list "kgrid".
        for k in kgrid:
            self.setSmoothing(k)
            # Train the data with the chosen value of k
            self.trainSingle(trainingData, trainingLabels)



    def trainSingle(self, trainingData, trainingLabels):
      self.priorProbabilities = util.Counter()
      self.featureProbabilities = {label: {feature: util.Counter() for feature in self.features} for label in
                                   self.legalLabels}

      # Compute prior probabilities
      self.priorProbabilities.incrementAll(trainingLabels, 1)
      self.priorProbabilities.normalize()

      # Compute feature probabilities
      for i in range(len(trainingData)):
        label = trainingLabels[i]
        datum = trainingData[i]

        for feature in self.features:
          if self.featureProbabilities[label][feature] is None:
            self.featureProbabilities[label][feature] = util.Counter()

          self.featureProbabilities[label][feature][datum[feature]] += 1

      # Apply smoothing to feature probabilities
      for label in self.legalLabels:
        for feature in self.features:
          if self.featureProbabilities[label][feature] is None:
            self.featureProbabilities[label][feature] = util.Counter()

          # Use incrementAll to add self.k to each key in the Counter
          self.featureProbabilities[label][feature].incrementAll(
            [self.k] * len(self.featureProbabilities[label][feature]), 1)

          # Normalize the Counter after applying smoothing
          self.featureProbabilities[label][feature].normalize()

    def classify(self, testData):
        """
    Classify the data based on the posterior distribution over labels.

    You shouldn't modify this method.
    """
        guesses = []
        self.posteriors = []  # Log posteriors are stored for later data analysis (autograder).
        for datum in testData:
            posterior = self.calculateLogJointProbabilities(datum)
            guesses.append(posterior.argMax())
            self.posteriors.append(posterior)
        print(guesses)
        return guesses

    def calculateLogJointProbabilities(self, datum):
        """
    Returns the log-joint distribution over legal labels and the datum.
    Each log-probability should be stored in the log-joint counter, e.g.
    logJoint[3] = <Estimate of log( P(Label = 3, datum) )>
    """

        logJoint = util.Counter()

        for label in self.legalLabels:
          logJoint[label] = math.log(self.priorProbabilities[label])

          for feature in self.features:
            # Add a small constant to avoid taking the logarithm of zero
            epsilon = 1e-9
            probability = self.featureProbabilities[label][feature][datum[feature]] + epsilon
            logJoint[label] += math.log(probability)

        return logJoint

    def findHighOddsFeatures(self, label1, label2):

        """
    Returns the 100 best features for the odds ratio:
            P(feature=1 | label1)/P(feature=1 | label2)
    """
        featuresOdds = []


        "*** YOUR CODE HERE ***"
        # Couldnt get it to work
        # for i in self.features:
        #     label1 = self.trainAndTune(trainingLabels=True)
        #     label2 = self.trainAndTune(validationLabels=True)
        #
        #     if label2 != 0:
        #         odds = label1 / label2
        #     else:
        #         odds = float('inf')
        #     featuresOdds.append((i, odds))


        return featuresOdds


In [5]:
import util

## Constants
DATUM_WIDTH = 0 # in pixels
DATUM_HEIGHT = 0 # in pixels

## Module Classes

class Datum:
  """
  A datum is a pixel-level encoding of digits or face/non-face edge maps.

  Digits are from the MNIST dataset and face images are from the 
  easy-faces and background categories of the Caltech 101 dataset.
  
  
  Each digit is 28x28 pixels, and each face/non-face image is 60x74 
  pixels, each pixel can take the following values:
    0: no edge (blank)
    1: gray pixel (+) [used for digits only]
    2: edge [for face] or black pixel [for digit] (#)
    
  Pixel data is stored in the 2-dimensional array pixels, which
  maps to pixels on a plane according to standard euclidean axes
  with the first dimension denoting the horizontal and the second
  the vertical coordinate:
    
    28 # # # #      #  #
    27 # # # #      #  #
     .
     .
     .
     3 # # + #      #  #
     2 # # # #      #  #
     1 # # # #      #  #
     0 # # # #      #  #
       0 1 2 3 ... 27 28
   
  For example, the + in the above diagram is stored in pixels[2][3], or
  more generally pixels[column][row].
       
  The contents of the representation can be accessed directly
  via the getPixel and getPixels methods.
  """
  def __init__(self, data,width,height):
    """
    Create a new datum from file input (standard MNIST encoding).
    """
    DATUM_HEIGHT = height
    DATUM_WIDTH=width
    self.height = DATUM_HEIGHT
    self.width = DATUM_WIDTH
    if data == None:
      data = [[' ' for i in range(DATUM_WIDTH)] for j in range(DATUM_HEIGHT)] 
    self.pixels = util.arrayInvert(convertToInteger(data)) 
    
  def getPixel(self, column, row):
    """
    Returns the value of the pixel at column, row as 0, or 1.
    """
    return self.pixels[column][row]
      
  def getPixels(self):
    """
    Returns all pixels as a list of lists.
    """
    return self.pixels    
      
  def getAsciiString(self):
    """
    Renders the data item as an ascii image.
    """
    rows = []
    data = util.arrayInvert(self.pixels)
    for row in data:
      ascii = list(map(asciiGrayscaleConversionFunction, row))
      rows.append( "".join(ascii) )
    return "\n".join(rows)
    
  def __str__(self):
    return self.getAsciiString()
    


# Data processing, cleanup and display functions
    
def loadDataFile(filename, n,width,height):
  """
  Reads n data images from a file and returns a list of Datum objects.
  
  (Return less then n items if the end of file is encountered).
  """
  DATUM_WIDTH=width
  DATUM_HEIGHT=height
  fin = readlines(filename)
  fin.reverse()
  items = []
  for i in range(n):
    data = []
    for j in range(height):
      data.append(list(fin.pop()))
    if len(data[0]) < DATUM_WIDTH-1:
      # we encountered end of file...
      print("Truncating at %d examples (maximum)" % i)
      break
    items.append(Datum(data,DATUM_WIDTH,DATUM_HEIGHT))
  return items

import zipfile
import os
def readlines(filename):
  "Opens a file or reads it from the zip archive data.zip"
  if(os.path.exists(filename)): 
    return [l[:-1] for l in open(filename).readlines()]
  else: 
    z = zipfile.ZipFile('data.zip')
    return z.read(filename).split('\n')
    
def loadLabelsFile(filename, n):
  """
  Reads n labels from a file and returns a list of integers.
  """
  fin = readlines(filename)
  labels = []
  for line in fin[:min(n, len(fin))]:
    if line == '':
        break
    labels.append(int(line))
  return labels
  
def asciiGrayscaleConversionFunction(value):
  """
  Helper function for display purposes.
  """
  if(value == 0):
    return ' '
  elif(value == 1):
    return '+'
  elif(value == 2):
    return '#'    
    
def IntegerConversionFunction(character):
  """
  Helper function for file reading.
  """
  if(character == ' '):
    return 0
  elif(character == '+'):
    return 1
  elif(character == '#'):
    return 2    

def convertToInteger(data):
  """
  Helper function for file reading.
  """
  if type(data) != type([]):
    return IntegerConversionFunction(data)
  else:
    return list(map(convertToInteger, data))

# Testing

def _test():
  import doctest
  doctest.testmod() # Test the interactive sessions in function comments
  n = 1
#  items = loadDataFile("facedata/facedatatrain", n,60,70)
#  labels = loadLabelsFile("facedata/facedatatrainlabels", n)
  items = loadDataFile("digitdata/trainingimages", n,28,28)
  labels = loadLabelsFile("digitdata/traininglabels", n)
  for i in range(1):
    print(items[i])
    print(items[i])
    print((items[i].height))
    print((items[i].width))
    print(dir(items[i]))
    print(items[i].getPixels())

if __name__ == "__main__":
  _test()  


FileNotFoundError: [Errno 2] No such file or directory: 'data.zip'

In [11]:
!python dataClassifier.py -c naiveBayes


Doing classification
--------------------
data:		digits
classifier:		naiveBayes
training set size:	100
Extracting features...
Training...
['the odds for sample in position 2, "5" is: 0.71', 'the odds for sample in position 5, "0" is: 0.00', 'the odds for sample in position 9, "4" is: 4.00', inf, 'the odds for sample in position 4, "9" is: 2.25', 'the odds for sample in position 4, "2" is: 2.00', 'the odds for sample in position 0, "1" is: 0.25', 'the odds for sample in position 1, "3" is: 0.33', 'the odds for sample in position 0, "1" is: 0.20', 'the odds for sample in position 9, "4" is: 0.44', inf, 'the odds for sample in position 2, "5" is: 0.83', 'the odds for sample in position 1, "3" is: 0.33', inf, 'the odds for sample in position 0, "1" is: 1.00', 'the odds for sample in position 3, "7" is: 1.40', 'the odds for sample in position 4, "2" is: 0.22', 'the odds for sample in position 1, "8" is: 1.14', 'the odds for sample in position 1, "6" is: 2.00', 'the odds for sample in positi