In [None]:
import spacy
import pandas as pd
import re
import csv
import os
import time

In [None]:
# Input file: a tab-separated file expected in the current working directory.
copyrightpath = os.getcwd() + "/YourCSVExample.csv"  ##Assuming that the File is hard-codedly provided

# Malformed lines are skipped instead of aborting the load. `error_bad_lines` was
# deprecated in pandas 1.3 and removed in pandas 2.0 in favour of `on_bad_lines`;
# older pandas rejects the new keyword with a TypeError, so fall back to the old one.
try:
    df = pd.read_csv(copyrightpath, on_bad_lines="skip", delimiter="\t", quoting=csv.QUOTE_NONE)
except TypeError:
    df = pd.read_csv(copyrightpath, error_bad_lines=False, delimiter="\t", quoting=csv.QUOTE_NONE)

# Work on at most the first 100000 rows. The explicit copy makes the frame independent
# of the one that was read, so the column assignments below do not write into a slice.
df = df.iloc[:100000, :].copy()

# The text column is addressed by its header string, copied into a short 'copyright' alias.
# NOTE(review): this header looks like a data line that pandas used as the header row
# (presumably the file has no real header) - confirm against the input file.
df['copyright'] = df['Notice, this list of conditions, the following disclaimer, and the original OpenSSL and SSLeay Licences below.",f']
df['copyright'] = df['copyright'].str.lower()

# spaCy pipeline used by the cells below for lemmas, POS tags and named entities.
nlp = spacy.load("en_core_web_sm")


In [None]:
def entityCheck(listA, listB, index, hit_index, clutter_flag):
    """Decide from POS tags and NER labels whether a row looks like a copyright notice.

    A row is a hit when
      * at least one POS tag is NOUN, NUM or PROPN, and
      * at least one NER label is DATE, PERSON or ORG, or - when the only
        relevant label is CARDINAL - some entity text contains a year-like number.

    On a hit the row index is appended to ``hit_index`` exactly once, even when several
    labels or entities match, and ``clutterRemoval`` is called once if ``clutter_flag``
    is truthy.

    Parameters
    ----------
    listA : dict
        Must contain the key ``"POS_TAG"`` with an iterable of POS tag strings.
    listB : dict
        Must contain ``"Entity"`` (entity texts) and ``"Values"`` (their NER labels).
    index : hashable
        Row label to record when the row is a hit.
    hit_index : list
        Accumulator of hit row labels; mutated in place.
    clutter_flag : truthy/falsy
        When truthy, ``clutterRemoval(index, listB)`` is run for a hit.

    Returns
    -------
    list
        The same ``hit_index`` object that was passed in.
    """
    pos_whitelist = {'NOUN', 'NUM', 'PROPN'}
    decisive_labels = {'DATE', 'PERSON', 'ORG'}
    # Four-digit years starting with 19/20, or a two-digit number, optionally followed by a second one.
    pattern_regex = r'''((?:19|20)\d{2}|\d{2})(?!\d)(?:[, \t-]{1,3}((?:19|20)\d{2}|\d{1,2}))?'''

    # Without a noun/number/proper noun there is nothing to judge; no verdict is printed.
    if not pos_whitelist.intersection(listA["POS_TAG"]):
        return hit_index

    labels = set(listB["Values"])
    if not (decisive_labels | {'CARDINAL'}).intersection(labels):
        print("It is a not a copyright!!")
        return hit_index

    # DATE / PERSON / ORG decide on their own. Otherwise CARDINAL is present and the row
    # only counts when some entity text contains a year-like number.
    # NOTE(review): every entity text is scanned, not only the CARDINAL ones, as before.
    is_hit = bool(decisive_labels & labels) or any(
        re.search(pattern_regex, entity_text) for entity_text in listB["Entity"]
    )

    if is_hit:
        print("It is a copyright!!")
        hit_index.append(index)
        if clutter_flag:
            clutterRemoval(index, listB)

    return hit_index

In [None]:
def accuracyScore_TP():
    """Share of manually positive rows that the algorithm also marked positive.

    Reads the global ``df``: among the rows whose ``new_tag`` is ``"t"``, counts those
    whose ``Hit&Miss`` is ``"t"`` as well.

    Returns
    -------
    tuple (float, str)
        The ratio (0..1) and the same value as a percentage string, e.g. ``"75.0 %"``.
        When no row is tagged ``"t"`` the ratio is ``0.0`` rather than a
        ``ZeroDivisionError``.
    """
    truth_positive = df['new_tag'] == "t"
    predicted_positive = df['Hit&Miss'] == "t"

    # Plain ints keep the result a built-in float, so the percentage string is formatted
    # the same way regardless of the numpy version.
    total_counter = int(truth_positive.sum())
    counter = int((truth_positive & predicted_positive).sum())

    accuracy_score_tp_precision = counter / total_counter if total_counter else 0.0
    accuracy_score_tp = str((accuracy_score_tp_precision) * 100) + " %"

    return accuracy_score_tp_precision, accuracy_score_tp

In [None]:
def accuracyScore_FN():
    """Share of manually negative rows that the algorithm marked positive.

    Reads the global ``df``: among the rows whose ``new_tag`` is ``"f"``, counts those
    whose ``Hit&Miss`` is ``"t"``.

    NOTE: truth ``"f"`` with prediction ``"t"`` is what is usually called a false
    *positive*; the function name and its behaviour are kept unchanged for existing callers.

    Returns
    -------
    tuple (float, str)
        The ratio (0..1) and the same value as a percentage string, e.g. ``"25.0 %"``.
        When no row is tagged ``"f"`` the ratio is ``0.0`` rather than a
        ``ZeroDivisionError``.
    """
    truth_negative = df['new_tag'] == "f"
    predicted_positive = df['Hit&Miss'] == "t"

    # Plain ints keep the result a built-in float for stable string formatting.
    total_counter = int(truth_negative.sum())
    counter = int((truth_negative & predicted_positive).sum())

    accuracy_score_fn_precision = counter / total_counter if total_counter else 0.0
    accuracy_score_fn = str((accuracy_score_fn_precision) * 100) + " %"

    return accuracy_score_fn_precision, accuracy_score_fn

In [None]:
def accuracyScore_TN():
    """Share of manually negative rows that the algorithm also marked negative.

    Reads the global ``df``: among the rows whose ``new_tag`` is ``"f"``, counts those
    whose ``Hit&Miss`` is ``"f"`` as well.

    Returns
    -------
    tuple (float, str)
        The ratio (0..1) and the same value as a percentage string, e.g. ``"75.0 %"``.
        When no row is tagged ``"f"`` the ratio is ``0.0`` rather than a
        ``ZeroDivisionError``.
    """
    truth_negative = df['new_tag'] == "f"
    predicted_negative = df['Hit&Miss'] == "f"

    # Plain ints keep the result a built-in float for stable string formatting.
    total_counter = int(truth_negative.sum())
    counter = int((truth_negative & predicted_negative).sum())

    accuracy_score_tn_precision = counter / total_counter if total_counter else 0.0
    accuracy_score_tn = str((accuracy_score_tn_precision) * 100) + " %"

    return accuracy_score_tn_precision, accuracy_score_tn

In [None]:
def accuracyScore_FP():
    """Share of manually positive rows that the algorithm marked negative.

    Reads the global ``df``: among the rows whose ``new_tag`` is ``"t"``, counts those
    whose ``Hit&Miss`` is ``"f"``.

    NOTE: truth ``"t"`` with prediction ``"f"`` is what is usually called a false
    *negative*; the function name and its behaviour are kept unchanged for existing callers.

    Returns
    -------
    tuple (float, str)
        The ratio (0..1) and the same value as a percentage string, e.g. ``"25.0 %"``.
        When no row is tagged ``"t"`` the ratio is ``0.0`` rather than a
        ``ZeroDivisionError``.
    """
    truth_positive = df['new_tag'] == "t"
    predicted_negative = df['Hit&Miss'] == "f"

    # Plain ints keep the result a built-in float for stable string formatting.
    total_counter = int(truth_positive.sum())
    counter = int((truth_positive & predicted_negative).sum())

    accuracy_score_fp_precision = counter / total_counter if total_counter else 0.0
    accuracy_score_fp = str((accuracy_score_fp_precision) * 100) + " %"

    return accuracy_score_fp_precision, accuracy_score_fp

In [None]:
def preProcessing(clutter_flag=0):
    """Classify every row of the global ``df`` and print the resulting scores.

    For each row, the text in ``df['copyright']`` is processed as follows:
      1. It is split on "," and the last field is stored in ``df['new_tag']``.
      2. It is lemmatised with the global spaCy pipeline ``nlp``; stop words and tokens
         that are one of the punctuation marks ``?:!.,;`` are dropped.
      3. It is normalised so that "( c )" and the copyright sign both read "copyright".
      4. It is parsed again to collect the named entities of the normalised text.
      5. It is passed to ``entityCheck`` together with the POS tags of the original text.

    Rows reported by ``entityCheck`` get ``"t"`` in ``df['Hit&Miss']``, all others ``"f"``.
    Finally the four ``accuracyScore_*`` helpers are evaluated and printed, followed by
    the ratio (TP rate + TN rate) / (sum of all four rates).

    Parameters
    ----------
    clutter_flag : truthy/falsy, default 0
        Forwarded to ``entityCheck``; when truthy, hits are also passed to
        ``clutterRemoval``. The default reproduces the previously hard-coded value.

    Returns
    -------
    None
    """
    hit_index = []
    punctuations = "?:!.,;"
    substring = "( c )"
    cp_symbol = '\xa9'  ##Unicode for copyright Symbol

    ## Iterating through each row and doing preprocessing over it.
    for index, text in df['copyright'].items():
        # Missing cells are float NaN, not strings. Skip them before any string or spaCy
        # call - the guard used to come after those calls and so could never be reached.
        if not isinstance(text, str):
            continue

        ## Picking out the manual tag (last comma-separated field) into the "new_tag" column
        df.loc[index, 'new_tag'] = str(text.split(",")[-1])

        doc = nlp(text)

        ## Lemmatization, stop-word filtering and punctuation removal in one pass.
        # Building a new list avoids deleting from a list while iterating over it, which
        # silently skipped the token following each removed punctuation mark.
        filtered_sentence = []
        for token in doc:
            word = token.lemma_
            if nlp.vocab[word].is_stop:
                continue
            if word in punctuations:
                continue
            filtered_sentence.append(word)

        ## List joining and normalising "( c )" and the copyright unicode symbol.
        # Both spellings are always replaced; previously which of the two got replaced
        # depended on whether the word "copyright" already occurred in the text.
        list_of_copyrights = " ".join(map(str, filtered_sentence))
        list_of_copyrights = list_of_copyrights.replace(substring, "copyright")
        list_of_copyrights = list_of_copyrights.replace(cp_symbol, "copyright")

        ## NER tags of the normalised text: parallel "Entity" / "Values" lists.
        # Going through a dict keeps one label per distinct entity text (the last one seen).
        doc2 = nlp(list_of_copyrights)
        ent_dict = {ent.text: ent.label_ for ent in doc2.ents}
        full_table_ner = {"Entity": list(ent_dict), "Values": list(ent_dict.values())}

        ## POS tags: parallel "Entity" / "POS_TAG" lists, punctuation and whitespace excluded.
        # NOTE(review): the tags come from the original `doc`, not the normalised `doc2` -
        # unchanged from before; confirm which document is intended.
        pos_dict = {
            token.text: token.pos_
            for token in doc
            if not (token.is_punct or token.is_space)
        }
        full_table_pos = {"Entity": list(pos_dict), "POS_TAG": list(pos_dict.values())}

        ## The checking function call happening with each iteration
        entityCheck(full_table_pos, full_table_ner, index, hit_index, clutter_flag)

    ## Writing the predictions into the "Hit&Miss" column.
    # Starting from an all-"f" column guarantees the column exists even without any hit
    # and discards stale values from an earlier run.
    df['Hit&Miss'] = "f"
    if hit_index:
        df.loc[list(dict.fromkeys(hit_index)), 'Hit&Miss'] = "t"

    tp_precision, percentScore_tp = accuracyScore_TP()
    print(str(percentScore_tp) + " for true positives")

    # accuracyScore_FN counts rows tagged "f" that were predicted "t", i.e. false positives,
    # and accuracyScore_FP counts rows tagged "t" that were predicted "f", i.e. false
    # negatives. Each value is printed under the label matching what was measured.
    fp_precision, percentScore_fp = accuracyScore_FN()
    print(str(percentScore_fp) + " for false positives")

    fn_precision, percentScore_fn = accuracyScore_FP()
    print(str(percentScore_fn) + " for false negatives")

    tn_precision, percentScore_tn = accuracyScore_TN()
    print(str(percentScore_tn) + " for true negatives")

    # The ratio is built from the four rates, not from raw counts; guard the case where
    # all of them are zero.
    rate_sum = tp_precision + tn_precision + fp_precision + fn_precision
    final_accuracy = (tp_precision + tn_precision) / rate_sum if rate_sum else 0.0
    print("Final accuracy seems like: " + str(final_accuracy))

    return

In [None]:
def clutterRemoval(hit_index, ner_list):
    """Trim trailing boilerplate from one row's text and store it in ``df['edited_text']``.

    The text is read from ``df.loc[hit_index, 'copyright']`` of the global ``df`` and cut
    by the first rule that applies:
      1. everything from "all rights reserved" onwards is dropped;
      2. everything from "distributed under the mit software license" onwards is dropped;
      3. the text is kept up to and including the first ORG and/or PERSON entity found in
         it (when both are found, the one that starts later; a tie goes to PERSON);
      4. otherwise the first match of the clutter regex is stored, or the unchanged text
         when the regex does not match.

    Parameters
    ----------
    hit_index : hashable
        Row label in ``df``.
    ner_list : dict
        Must contain the parallel lists ``"Entity"`` (entity texts) and ``"Values"``
        (their NER labels).

    Returns
    -------
    None
        The result is written to ``df.loc[hit_index, 'edited_text']``.
    """
    string1 = "all rights reserved"
    string2 = "distributed under the mit software license"
    string3 = df.loc[hit_index, 'copyright']

    clutter_removed = None

    if string1 in string3:
        clutter_removed = string3[:string3.index(string1)]
    elif string2 in string3:
        # Cut at the phrase that was actually found; using string1 here raised ValueError.
        clutter_removed = string3[:string3.index(string2)]
    else:
        labels = ner_list['Values']
        entities = ner_list['Entity']

        # Locate the first ORG / PERSON entity in the raw text. The entity texts may not
        # occur verbatim in it, so a missing name is skipped instead of raising.
        located = {}
        for label in ('ORG', 'PERSON'):
            if label in labels:
                name = entities[labels.index(label)]
                start = string3.find(name)
                if start != -1:
                    located[label] = (start, name)

        if 'ORG' in located and 'PERSON' in located:
            if located['ORG'][0] > located['PERSON'][0]:
                start, name = located['ORG']
            else:
                start, name = located['PERSON']
            clutter_removed = string3[:start] + name
        elif located:
            start, name = next(iter(located.values()))
            clutter_removed = string3[:start] + name

    if clutter_removed is None:
        clutter_regex = r'''"(Copyright\s*(©)?([\w \-\,\[\]]+)?(\.com)?\.?|Copyright\s*(©)?|©([\w \-\,\.\[\]]+)?(Copyright)?)|(Copyright\s*(\(c\))?|\(c\)\s*(Copyright)?(?:[\w \,\-\.\"\[\]]{2,53}\s*)?\.?|\(c\)\s*(Copyright)?)|Copyright|([\w,]|(\s*\d+(\s(?:\,|-)\s*\d+)?\s*))(\s*\d+(\s*(?:\,|-)\s*\d+)?\,?\s*)\s*[a-zA-Z\&\| ,\s0-9]{3,50}(\.com)?\.?|(\.|\,)?\s*(\@[^>]*?\.com)|Inc+(\.)?| Company+|Corporation+|& Co+|GmbH+|All rights reserved(?:\.)?|Ltd|\<|[\w.]+@[a-zA-Z0-9-_.]+|\>|([A-Za-z0-9]+\.com)|rights\s*reserved|(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-\w+&@#/%=~|$?!:,.]*\)|[-\w+&@#/%=~|$?!:,.])*(?:\([\w+&@#/%=~|$?!:,.]*\)|[\w+&@#/%=~|$])|\<|\>|\(|\)'''
        # Store text, not the re.Match object (or None) that re.search returns.
        # NOTE(review): the pattern is case-sensitive and contains capitalised words; if the
        # column holds lower-cased text those alternatives cannot match - confirm intent.
        match = re.search(clutter_regex, string3)
        clutter_removed = match.group(0) if match else string3

    # Label-based assignment: .iloc does not accept a column name and cannot add a column.
    df.loc[hit_index, 'edited_text'] = clutter_removed

  