In [20]:
import sys 

class RefComparer:
    ''' Take as input three lists containing information 
    regarding CRISPR guide counts in old reference genome, 
    new reference genome, and novel regions. Find the CRISPR 
    guide sequences that are used in both the old and new 
    references and flag the guides that have different counts 
    between the two references.'''

    def __init__(self, file1Lines, file2Lines, file3Lines):
        '''Initialize a RefComparer object with all attributes
        given at instantiation time.'''
        
        self.dict1 = {}  #hg38
        self.dict2 = {}  #chm13
        self.dict3 = {}  #chm13*
        self.set1 = set()
        self.set2 = set()
        self.set3 = set()
        self.guidesInBoth = set()
        self.flaggedGuides = []
        self.file1Lines = file1Lines
        self.file2Lines = file2Lines
        self.file3Lines = file3Lines
        
    def buildDictList(self):
        '''Read two lists containing CRISPR guide counts from 
        reference genomes and add each element's information to a 
        dictionary and set.'''
      
        for line in self.file1Lines:          # old reference
            x = line.strip()                  # strip new newline characters
            y = x.split('\t')                 # split around white space
            self.dict1[y[0]] = y[1]           # assign guide seq and count as key-value pair
            self.set1.add(y[0])               # add guide sequence to set
        
        for line in self.file2Lines:          # new reference 
            x = line.strip()                  # same procedure
            y = x.split('\t')                           
            self.dict2[y[0]] = y[1]
            self.set2.add(y[0])
         
        for line in self.file3Lines:          # novel regions 
            x = line.strip()                  # same procedure
            y = x.split('\t')                            
            self.dict3[y[0]] = y[1]
            self.set3.add(y[0])
            if y[0] not in self.dict1:        # check if guides in novel regions are in old ref dictionary
                self.dict1[y[0]] = 0          # add key-value pair ith value of zero if not
        
        for line in self.file1Lines:          # re-iterate over old reference
            x = line.strip()                            
            y = x.split('\t') 
            if y[0] not in self.dict3:        # check if guides in old ref are in novel regions dict
                self.dict3[y[0]] = 0          # add key-value pair ith value of zero if not
    
        return (self.dict1, self.dict2, self.dict3, self.set1, self.set2, self.set3)
        
    def setIntersection(self):
        '''Calculate and return the intersection of two sets.'''
        
        self.guidesInBoth = self.set1.intersection(self.set2)   # intersection of old and new ref dicts = guides in both
        
        return self.guidesInBoth
                
    def dictCompare(self):
        '''Compare two dictionaries and store keys that have 
        different values in a list.'''

        for guide in self.guidesInBoth:                         # compare guides cunt in old and new ref   
            if self.dict1[guide] != self.dict2[guide]:          # flag guides that have dif counts 
                self.flaggedGuides.append(guide)                
            else:
                pass
            
        return self.flaggedGuides

    
    
def main():
    '''Use the RefComparer class to find the CRISPR guides that have different counts between old and new reference genomes. 
    Output these guide sequences and their counts in each reference and the novel regions.'''
    
    file1Name = input("Enter the name of your old reference text file: ")     # take old ref file name from user input
    with open(file1Name) as file1:                                           # open file 
        myFile1 = file1.readlines()                                          # create a list which file lines as elements
    file2Name = input("Enter the name of your new reference text file: ")     # same procedure
    with open(file2Name) as file2:
        myFile2 = file2.readlines()
    file3Name = input("Enter the name of your novel regions text file: ")     # same procedure
    with open(file3Name) as file3:
        myFile3 = file3.readlines()
    
    
    classObject = RefComparer(myFile1, myFile2, myFile3)                     # instantiate class object
        
    (dict1, dict2, dict3, list1, list2, list3) = classObject.buildDictList() # build dictonaries for each ref
    
    guidesInBoth = classObject.setIntersection()                             # generate set of guides used in old and new refs
    
    flaggedGuides = classObject.dictCompare()                                # generate list of guides that have diff counts 
    
    
    output = set()    
    
    for flagged in flaggedGuides:                                            # compare counts of flagged guides
        if dict2[flagged] > dict1[flagged]:                                  # only output guides w/ new count > old count 
            output.add((flagged,dict1[flagged],dict2[flagged], abs(int(dict1[flagged])-int(dict2[flagged]))))
        else:
            pass
        
    
    sortedOutput = sorted(output, key = lambda x: (x[3], x[1]), reverse = True)  
    
    print()
    print('There are '+str(len(guidesInBoth))+' CRISPR guides used in both the old and new references.')
    print('Of those, there are '+str(len(guidesInBoth)-len(flaggedGuides))+' CRISPR guides with the same counts and '+str(len(flaggedGuides))+' with different counts.')
    print()
    print('Here are the '+str(len(flaggedGuides))+' guides with different counts, including how often they occur in the novel regions:') 
    print()
    print('{:^24}{:^24}{:^24}{:^24}'.format('Guide',file1Name,file2Name,file3Name))
    print('-'*100)
    
    for x in sortedOutput:
        print('{:^24}{:^24}{:^24}{:^24}'.format(x[0],dict1[x[0]],dict2[x[0]], dict3[x[0]]))
       
    
if __name__ == "__main__":
    main()


Enter the name of your old reference text file: hg38test.txt
Enter the name of your new reference text file: chm13test.txt
Enter the name of your novel regions text file: chm13star-1000.txt

There are 985 CRISPR guides used in both the old and new references.
Of those, there are 302 CRISPR guides with the same counts and 683 with different counts.

Here are the 683 guides with different counts, including how often they occur in the novel regions:

         Guide                hg38test.txt           chm13test.txt         chm13star-1000.txt   
----------------------------------------------------------------------------------------------------
AAAAAAAAAAAAAAAAATGC.TGG          163                     178                      0            
AAAAAAAAAAAAAAAAACTC.AGG           43                      57                      0            
AAAAAAAAAAAAAAAAAATG.TGG          100                     112                      0            
AAAAAAAAAAAAAAAACCTA.TGG           10                      