In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from tqdm import tqdm
import string
import scipy
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Module-level NLTK helpers created once for the whole notebook:
# the English stop-word list, a Porter stemmer and a WordNet lemmatizer.
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/jimbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jimbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jimbo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [61]:
def _load_unique_copyrights(csv_path):
    """Read one labelled CSV and drop repeated copyright strings.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file with a "copyright" text column and a
        "falsePositive" label column.

    Returns
    -------
    tuple
        ``(frame, texts, labels)``: the full DataFrame, the de-duplicated
        "copyright" Series, and the "falsePositive" values for the rows kept.
    """
    frame = pd.read_csv(csv_path)
    texts = frame["copyright"].drop_duplicates()
    # Align labels by index label so each kept text carries the label of its own row.
    labels = frame["falsePositive"].loc[texts.index]
    return frame, texts, labels


data_0, X_0, y_0 = _load_unique_copyrights('../datasets/fossology-master-corrected.csv')
data_1, X_1, y_1 = _load_unique_copyrights('../datasets/kubernetes-master-corrected.csv')
data_2, X_2, y_2 = _load_unique_copyrights('../datasets/tensorflow-master-corrected.csv')
data_3, X_3, y_3 = _load_unique_copyrights('../datasets/fossology-provided-1-corrected.csv')
# Datasets 4 and 5 are loaded here but are not part of the combined X / y built below.
data_4, X_4, y_4 = _load_unique_copyrights('../datasets/fossology-provided-2.csv')
data_5, X_5, y_5 = _load_unique_copyrights('../datasets/feature-extraction-paper.csv')

# De-duplication above is per file; the concatenation itself is not de-duplicated again.
X = pd.concat([X_0, X_1, X_2, X_3]).reset_index(drop=True)
y = pd.concat([y_0, y_1, y_2, y_3]).reset_index(drop=True)

print('Class 0 Percentage: ', len(y[y == 0]) / len(y))
print('Class 1 Percentage: ', len(y[y == 1]) / len(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

Class 0 Percentage:  0.7522737712448323
Class 1 Percentage:  0.24772622875516767


In [4]:
def aggregate_reports(reports, print_aggregates=True):
    """Tabulate per-class precision, recall and F1 over several reports.

    Parameters
    ----------
    reports : list of dict
        Dict-style classification reports; each must expose
        ``report['0'][metric]`` and ``report['1'][metric]``.
    print_aggregates : bool
        When True the three tables are printed as Markdown and nothing is
        returned; when False the tables are returned instead.

    Returns
    -------
    tuple of pandas.DataFrame or None
        ``(precision, recall, f1)`` tables when ``print_aggregates`` is False.
        Each table has one row per report, a final 'Mean' row (values
        formatted as 6-decimal strings) and a 'Metric' column.
    """
    import pandas as pd
    import numpy as np

    class_keys = ['0', '1']
    tables = {}
    for metric_name in ('precision', 'recall', 'f1-score'):
        per_report = np.array(
            [[single[key][metric_name] for key in class_keys] for single in reports]
        )
        per_report = per_report[:, :2]
        column_means = np.mean(per_report, axis=0)
        table = pd.DataFrame(per_report, columns=class_keys)
        table.loc['Mean'] = [f"{value:.6f}" for value in column_means]
        table['Metric'] = metric_name
        tables[metric_name] = table

    if not print_aggregates:
        return tables['precision'], tables['recall'], tables['f1-score']

    headings = (
        ("## Precision", 'precision'),
        ("## Recall", 'recall'),
        ("## F1-score", 'f1-score'),
    )
    for heading, metric_name in headings:
        print(heading)
        print(tables[metric_name].to_markdown())

In [5]:
def get_missclassified_rows(X, y_true, y_pred, only_this_class=(0, 1), return_index=False):
    """Collect the samples whose prediction differs from the true label.

    Parameters
    ----------
    X : sequence
        The samples (for example the raw texts), one per label.
    y_true, y_pred : sequence
        True and predicted labels, positionally aligned with ``X``.
        Lists, tuples, NumPy arrays and pandas Series are all accepted.
    only_this_class : container
        Only errors whose *true* label is in this container are reported.
    return_index : bool
        When True each result also carries the sample position.

    Returns
    -------
    list of tuple
        ``(predicted, sample)`` or, with ``return_index``,
        ``(predicted, position, sample)`` for every reported error.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    def _as_list(values):
        # Arrays / Series convert through tolist(); any other iterable through list().
        if isinstance(values, list):
            return values
        if hasattr(values, "tolist"):
            return values.tolist()
        return list(values)

    X = _as_list(X)
    y_true = _as_list(y_true)
    y_pred = _as_list(y_pred)
    if not (len(X) == len(y_true) == len(y_pred)):
        raise ValueError(
            "X, y_true and y_pred must have the same length, got "
            f"{len(X)}, {len(y_true)} and {len(y_pred)}"
        )

    missclassified_rows = [
        position
        for position, (expected, predicted) in enumerate(zip(y_true, y_pred))
        if expected != predicted and expected in only_this_class
    ]
    if return_index:
        return [(y_pred[i], i, X[i]) for i in missclassified_rows]
    return [(y_pred[i], X[i]) for i in missclassified_rows]

In [6]:
from nltk.tokenize.treebank import TreebankWordTokenizer
# Module-level Treebank word tokenizer instance.
# NOTE(review): the name `tokenizer` is assigned again further down in this notebook
# (to the result of AutoTokenizer.from_pretrained), so which object this global
# refers to depends on the order in which cells were executed.
tokenizer = TreebankWordTokenizer()

In [9]:
# E-mail matcher (appears to be the widely circulated RFC 5322-style expression).
# It is a raw string so every backslash reaches the regex engine unchanged, and it is
# compiled case-insensitively because the character classes only list a-z while this
# step runs before lower-casing.
_EMAIL_PATTERN = re.compile(
    r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
    re.IGNORECASE,
)


def preprocess_function(sentences, lower=False, replace_copyright_symbols=False,
                        replace_dates=False, remove_numbers=False, remove_punctuation=False,
                        remove_special_characters=False, remove_whitespaces=False, remove_specials_weird=False,
                        remove_stopwords=False, replace_emails=False, replace_names=False, replace_orgs=False,
                        tokenize=False, lemmatize=False, glove=False):
    """Apply the selected text-cleaning steps to a batch of sentences.

    The steps always run in the fixed order listed below, whatever order the
    keyword arguments are given in.

    Parameters
    ----------
    sentences : list or pandas.Series or iterable of str
        Input texts. Non-list inputs are converted to a list first.
    replace_dates : bool
        Replace every run of four digits with a DATE placeholder.
    remove_numbers : bool
        Replace every remaining digit run with a space.
    replace_copyright_symbols : bool
        Replace "©", "(c)" and "(C)" with a COPYRIGHTSYMBOL placeholder.
    replace_emails : bool
        Replace e-mail addresses with an EMAIL placeholder.
    replace_names, replace_orgs : bool
        Accepted but not implemented in this version (no effect).
    tokenize : bool
        Tokenize with NLTK's Treebank tokenizer. Without ``glove`` the tokens
        are re-joined with single spaces; with ``glove`` token lists are kept.
    lemmatize : bool
        Lemmatize each word with WordNet. String inputs are split on
        whitespace first; the output shape follows ``glove`` as for ``tokenize``.
    remove_punctuation : bool
        Replace characters that are neither word characters nor whitespace with a space.
    remove_special_characters : bool
        Replace every character outside ``[a-zA-Z0-9]`` with a space.
    remove_specials_weird : bool
        Delete every character outside ``[a-zA-Z0-9]`` (spaces included).
    lower : bool
        Lower-case the text.
    remove_stopwords : bool
        Replace NLTK English stop words with a space.
    remove_whitespaces : bool
        Collapse runs of two or more spaces into one.
    glove : bool
        Use angle-bracket placeholders (`` <DATE> ``, `` <EMAIL> ``, ...) and keep
        token lists after ``tokenize`` / ``lemmatize``. The regex-based steps that
        follow only accept strings, so they cannot be combined with token lists.

    Returns
    -------
    list
        The processed sentences (strings, or token lists in glove mode).
    """
    if not isinstance(sentences, list):
        sentences = sentences.to_list() if hasattr(sentences, 'to_list') else list(sentences)

    if replace_dates:
        date_text = ' <DATE> ' if glove else ' DATE '
        sentences = [re.sub(r'\d{4}', date_text, sentence) for sentence in sentences]
    if remove_numbers:
        sentences = [re.sub(r'\d+', ' ', sentence) for sentence in sentences]
    if replace_copyright_symbols:
        symbol_text = ' <COPYRIGHT SYMBOL> ' if glove else ' COPYRIGHTSYMBOL '
        sentences = [re.sub(r'©|\([cC]\)', symbol_text, sentence) for sentence in sentences]
    if replace_emails:
        email_text = ' <EMAIL> ' if glove else ' EMAIL '
        sentences = [_EMAIL_PATTERN.sub(email_text, sentence) for sentence in sentences]
    if replace_names:
        pass  # TODO: Implement this using NER if needed
    if replace_orgs:
        pass  # TODO: Implement this using NER if needed
    if tokenize:
        # Use a private tokenizer instead of a module-level name: notebook cells may
        # rebind a global called `tokenizer` to an unrelated object.
        from nltk.tokenize.treebank import TreebankWordTokenizer
        word_tokenizer = TreebankWordTokenizer()
        token_lists = [word_tokenizer.tokenize(sentence) for sentence in sentences]
        # Join with a space so word boundaries survive.
        sentences = token_lists if glove else [' '.join(tokens) for tokens in token_lists]
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        # Lemmatize words, not characters: strings are split into words first.
        word_lists = [sentence.split() if isinstance(sentence, str) else sentence
                      for sentence in sentences]
        lemma_lists = [[lemmatizer.lemmatize(word) for word in words] for words in word_lists]
        sentences = lemma_lists if glove else [' '.join(lemmas) for lemmas in lemma_lists]
    if remove_punctuation:
        sentences = [re.sub(r'[^\w\s]', ' ', sentence) for sentence in sentences]
    if remove_special_characters:
        sentences = [re.sub(r'[^a-zA-Z0-9]', ' ', sentence) for sentence in sentences]
    if remove_specials_weird:
        sentences = [re.sub(r'[^a-zA-Z0-9]', '', sentence) for sentence in sentences]
    if lower:
        sentences = [sentence.lower() for sentence in sentences]
    if remove_stopwords:
        # Build the alternation once per call instead of once per sentence.
        stopword_pattern = re.compile(
            r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in stopwords.words('english')))
        )
        sentences = [stopword_pattern.sub(' ', sentence) for sentence in sentences]
    if remove_whitespaces:
        sentences = [re.sub(r' {2,}', ' ', sentence) for sentence in sentences]
    return sentences

In [81]:
# Train and evaluate in a single helper so the experiment cells below stay short
def train(svm, vectorizer, threshold, preprocess_function, **kwargs):
    """Fit a vectorizer + classifier on the training split and report errors on the full set.

    Relies on these notebook globals: ``X_train``/``y_train``, ``X_test``/``y_test``,
    ``X_1..X_3``/``y_1..y_3`` and ``X``/``y``.

    Parameters
    ----------
    svm : estimator
        Classifier with ``fit`` and either ``predict_proba`` or ``predict``.
    vectorizer : transformer
        Text vectorizer; fitted on the preprocessed training texts.
    threshold : float or None
        Only used when the classifier exposes ``predict_proba``. With ``None`` the
        most probable column wins. Otherwise a sample keeps that column only when
        its probability is strictly above ``threshold`` and falls back to 0.
        Column positions are used directly as class labels (labels are assumed
        to be 0 and 1).
    preprocess_function : callable
        Called as ``preprocess_function(texts, **kwargs)`` for every text collection.
    **kwargs
        Forwarded unchanged to ``preprocess_function``.

    Returns
    -------
    tuple
        ``(svm, vectorizer, miss_classified_rows_0, miss_classified_rows_1)`` where the
        last two are ``(predicted, position, text)`` tuples for errors on ``X`` whose
        true class is 0 and 1 respectively.

    Notes
    -----
    The printed error counts are computed on ``X``, which also contains the rows the
    model was trained on, so they are not held-out error estimates.
    """
    def featurize(texts):
        return vectorizer.transform(preprocess_function(texts, **kwargs))

    def classify(features):
        if not can_score:
            return svm.predict(features)
        probabilities = svm.predict_proba(features)
        predicted = np.argmax(probabilities, axis=1)
        if threshold is not None:
            confident = probabilities.max(axis=1) > threshold
            predicted = np.where(confident, predicted, 0)
        return predicted

    X_train_tfidf = vectorizer.fit_transform(preprocess_function(X_train, **kwargs))
    # Same evaluation sets, in the same order, as the original per-set variables;
    # the last entry is the full data set.
    evaluation_sets = [(X_test, y_test), (X_1, y_1), (X_2, y_2), (X_3, y_3), (X, y)]
    evaluation_features = [featurize(texts) for texts, _ in evaluation_sets]

    svm.fit(X_train_tfidf, y_train)
    # Classifiers without probability estimates use plain predict().
    can_score = hasattr(svm, "predict_proba")

    predictions = [classify(features) for features in evaluation_features]
    reports = [
        classification_report(labels, predicted, output_dict=True)
        for (_, labels), predicted in zip(evaluation_sets, predictions)
    ]
    # aggregate_reports(reports)

    full_predictions = predictions[-1]
    full_report = reports[-1]
    miss_classified_rows_0 = get_missclassified_rows(X, y, full_predictions, only_this_class=[0], return_index=True)
    miss_classified_rows_1 = get_missclassified_rows(X, y, full_predictions, only_this_class=[1], return_index=True)

    for class_key in ('0', '1'):
        support = full_report[class_key]['support']
        recall = full_report[class_key]['recall']
        print(f'Number of missclassifications in class {class_key}: ', support - round(recall * support), 'out of a total sample of: ', support, ' - about ', round((1 - recall) * 100, 2), '% of the class was missclassified')
    return svm, vectorizer, miss_classified_rows_0, miss_classified_rows_1

# NER: testing a fine-tuned tiny BERT model (no license is attached to it), pretrained on CoNLL-2003

In [8]:
# Import libraries
import torch
import transformers
import pandas as pd
from sklearn.metrics import f1_score
from IPython.display import HTML, display

In [68]:
def merge_split_symbols(tokens):
    """Re-attach "##" continuation pieces to the token that precedes them.

    Parameters
    ----------
    tokens : iterable of str
        Tokens in which a leading "##" marks a continuation of the previous token.

    Returns
    -------
    list of str
        Tokens with every continuation piece appended to its predecessor.
        A continuation piece with no predecessor (first token) starts a new
        token with the "##" marker removed instead of raising IndexError.
    """
    merged_tokens = []
    for token in tokens:
        if not token.startswith("##"):
            merged_tokens.append(token)
        elif merged_tokens:
            merged_tokens[-1] += token[2:]
        else:
            # Nothing to attach to: keep the fragment as its own token.
            merged_tokens.append(token[2:])
    return merged_tokens


In [108]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
# Load the token-classification checkpoint and wrap it in a Hugging Face "ner" pipeline.
# NOTE(review): this rebinds the module-level names `tokenizer` and `model`; an earlier
# cell assigned `tokenizer` to a TreebankWordTokenizer, so any code that reads that
# global after this cell runs gets the Hugging Face tokenizer instead.
tokenizer = AutoTokenizer.from_pretrained("gagan3012/bert-tiny-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("gagan3012/bert-tiny-finetuned-ner")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# Texts fed to the NER pipeline: four-digit runs become a DATE placeholder and the
# remaining digit runs are blanked out.
processed_X = preprocess_function(X, replace_dates=True, remove_numbers=True)

In [109]:
def merge_entities(entities):
    """Glue "##" word pieces back into whole words and reconcile their labels.

    Parameters
    ----------
    entities : iterable of dict
        Token-level predictions, each with a "word" and an "entity" key.

    Returns
    -------
    list of dict
        One ``{"word", "entity"}`` dict per re-assembled word. A word keeps the
        label of its first piece. Afterwards, every word whose label is not
        "LABEL_0" takes over the label of the first same-spelled word in the list
        that carries a different non-"LABEL_0" label, if there is one.
        A "##" piece with no preceding word starts a new word with the marker
        removed instead of raising TypeError.
    """
    merged_entities = []
    for entity in entities:
        word = entity["word"]
        if word.startswith("##"):
            if merged_entities:
                merged_entities[-1]["word"] += word[2:]
                continue
            # Leading fragment with nothing to attach to: treat it as a word start.
            word = word[2:]
        merged_entities.append({"word": word, "entity": entity["entity"]})

    # Label reconciliation; later iterations see the labels rewritten by earlier
    # ones, so the pass is order dependent.
    for entity in merged_entities:
        entity_label = entity["entity"]
        if entity_label == "LABEL_0":
            continue
        for other_entity in merged_entities:
            if (other_entity["word"] == entity["word"]
                    and other_entity["entity"] not in ("LABEL_0", entity_label)):
                entity["entity"] = other_entity["entity"]
                break
    return merged_entities

In [110]:
# Run the NER pipeline on every preprocessed sentence (one call per sentence),
# then merge the "##" word pieces of each result back into whole words.
entities = [nlp(sentence) for sentence in tqdm(processed_X)]
merged_entities = [merge_entities(entity) for entity in tqdm(entities)]

  0%|          | 0/21770 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 21770/21770 [01:00<00:00, 361.82it/s]
100%|██████████| 21770/21770 [00:00<00:00, 62356.12it/s]


In [111]:
# Display colour for each label id produced by the NER model; the trailing comment
# on each entry names the tag that id is taken to stand for.
# NOTE(review): 'white' text is likely invisible on a light notebook theme.
label_colors = {
    'LABEL_0': 'white', # Other
    'LABEL_1': 'red', # B-PER
    'LABEL_2': 'green', # I-PER
    'LABEL_3': 'blue', # B-ORG
    'LABEL_4': 'orange', # I-ORG
    'LABEL_5': 'purple', # B-LOC
    'LABEL_6': 'brown', # I-LOC
    'LABEL_7': 'pink', # B-MISC
    'LABEL_8': 'gray', # I-MISC
}
def get_colored_text_from_entity(entity_list, label_colors):
    """Render one sentence's entities as an HTML string, one coloured word per entity.

    Parameters
    ----------
    entity_list : iterable of dict
        Entities with a "word" and an "entity" (label) key.
    label_colors : dict
        Maps a label to a colour name; labels missing from it use "white".

    Returns
    -------
    str
        ``<font color="...">word</font>`` fragments, each followed by one space.
        Words are HTML-escaped so that "<", ">" and "&" coming from the data are
        shown literally instead of being interpreted as markup.
    """
    import html

    fragments = []
    for entity in entity_list:
        label_color = label_colors.get(entity["entity"], "white")
        safe_word = html.escape(entity["word"])
        fragments.append(f'<font color="{label_color}">{safe_word}</font> ')
    return ''.join(fragments)

def display_colored_text(colored_texts):
    """Show each HTML snippet as rich notebook output, one display call per snippet."""
    for snippet in colored_texts:
        rendered = HTML(snippet)
        display(rendered)

In [112]:
# Build one HTML string per sentence from its merged entities.
colored_texts = [get_colored_text_from_entity(entity, label_colors) for entity in tqdm(merged_entities)]

100%|██████████| 21770/21770 [00:00<00:00, 121337.51it/s]


In [120]:
# Spot-check: render sentences at positions 200-299.
display_colored_text(colored_texts[200:300])

In [122]:
# Spot-check: render sentences at positions 300-399.
display_colored_text(colored_texts[300:400])

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
# Load the token-classification checkpoint and wrap it in a Hugging Face "ner" pipeline.
# NOTE(review): the same three assignments also appear in an earlier cell of this
# notebook, and they rebind the module-level names `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained("gagan3012/bert-tiny-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("gagan3012/bert-tiny-finetuned-ner")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
def merge_entities(entities):
    """Glue "##" word pieces back into whole words and reconcile their labels.

    Parameters
    ----------
    entities : iterable of dict
        Token-level predictions, each with a "word" and an "entity" key.

    Returns
    -------
    list of dict
        One ``{"word", "entity"}`` dict per re-assembled word. A word keeps the
        label of its first piece. Afterwards, every word whose label is not
        "LABEL_0" takes over the label of the first same-spelled word in the list
        that carries a different non-"LABEL_0" label, if there is one.
        A "##" piece with no preceding word starts a new word with the marker
        removed instead of raising TypeError.
    """
    merged_entities = []
    for entity in entities:
        word = entity["word"]
        if word.startswith("##"):
            if merged_entities:
                merged_entities[-1]["word"] += word[2:]
                continue
            # Leading fragment with nothing to attach to: treat it as a word start.
            word = word[2:]
        merged_entities.append({"word": word, "entity": entity["entity"]})

    # Label reconciliation; later iterations see the labels rewritten by earlier
    # ones, so the pass is order dependent.
    for entity in merged_entities:
        entity_label = entity["entity"]
        if entity_label == "LABEL_0":
            continue
        for other_entity in merged_entities:
            if (other_entity["word"] == entity["word"]
                    and other_entity["entity"] not in ("LABEL_0", entity_label)):
                entity["entity"] = other_entity["entity"]
                break
    return merged_entities

In [155]:
# E-mail matcher (appears to be the widely circulated RFC 5322-style expression).
# It is a raw string so every backslash reaches the regex engine unchanged, and it is
# compiled case-insensitively because the character classes only list a-z while this
# step runs before lower-casing.
_EMAIL_PATTERN = re.compile(
    r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
    re.IGNORECASE,
)


def preprocess_function(sentences, lower=False, replace_copyright_symbols=False,
                        replace_dates=False, remove_numbers=False, remove_punctuation=False,
                        remove_special_characters=False, remove_whitespaces=False, remove_specials_weird=False,
                        remove_stopwords=False, replace_emails=False, replace_names=False, replace_orgs=False,
                        tokenize=False, lemmatize=False, glove=False, remove_endlines=False,
                        replace_copyright_symbol_v2=False):
    """Apply the selected text-cleaning steps to a batch of sentences.

    The steps always run in the fixed order listed below, whatever order the
    keyword arguments are given in.

    Parameters
    ----------
    sentences : list or pandas.Series or iterable of str
        Input texts. Non-list inputs are converted to a list first.
    replace_dates : bool
        Replace every run of four digits with a DATE placeholder.
    remove_numbers : bool
        Replace every remaining digit run with a space.
    replace_copyright_symbols : bool
        Replace "©", "(c)" and "(C)" with the one-word COPYRIGHTSYMBOL placeholder.
    replace_copyright_symbol_v2 : bool
        Same, but with the two-word placeholder "COPYRIGHT SYMBOL". Has no effect
        when ``replace_copyright_symbols`` is also set (that one is applied first).
    replace_emails : bool
        Replace e-mail addresses with an EMAIL placeholder.
    tokenize : bool
        Tokenize with NLTK's Treebank tokenizer. Without ``glove`` the tokens
        are re-joined with single spaces; with ``glove`` token lists are kept.
    lemmatize : bool
        Lemmatize each word with WordNet. String inputs are split on
        whitespace first; the output shape follows ``glove`` as for ``tokenize``.
    remove_punctuation : bool
        Replace characters that are neither word characters nor whitespace with a space.
    remove_special_characters : bool
        Replace every character outside ``[a-zA-Z0-9]`` with a space.
    remove_specials_weird : bool
        Delete every character outside ``[a-zA-Z0-9]`` (spaces included).
    lower : bool
        Lower-case the text.
    remove_stopwords : bool
        Replace NLTK English stop words with a space.
    replace_names, replace_orgs : bool
        Run the notebook-level ``nlp`` pipeline and ``merge_entities`` on each
        sentence, then rebuild it from the merged words joined by single spaces,
        writing "PERSON" for LABEL_1/LABEL_2 words and "ORG" for LABEL_3/LABEL_4
        words respectively. This runs after lower-casing, so the inserted
        placeholders stay upper-case.
    remove_endlines : bool
        Replace newline characters with a space.
    remove_whitespaces : bool
        Collapse runs of two or more spaces into one.
    glove : bool
        Use angle-bracket placeholders (`` <DATE> ``, `` <EMAIL> ``, ...) and keep
        token lists after ``tokenize`` / ``lemmatize``. The regex-based steps that
        follow only accept strings, so they cannot be combined with token lists.

    Returns
    -------
    list
        The processed sentences (strings, or token lists in glove mode).
    """
    if not isinstance(sentences, list):
        sentences = sentences.to_list() if hasattr(sentences, 'to_list') else list(sentences)

    if replace_dates:
        date_text = ' <DATE> ' if glove else ' DATE '
        sentences = [re.sub(r'\d{4}', date_text, sentence) for sentence in sentences]
    if remove_numbers:
        sentences = [re.sub(r'\d+', ' ', sentence) for sentence in sentences]
    if replace_copyright_symbols or replace_copyright_symbol_v2:
        if glove:
            symbol_text = ' <COPYRIGHT SYMBOL> '
        elif replace_copyright_symbols:
            symbol_text = ' COPYRIGHTSYMBOL '
        else:
            symbol_text = ' COPYRIGHT SYMBOL '
        sentences = [re.sub(r'©|\([cC]\)', symbol_text, sentence) for sentence in sentences]
    if replace_emails:
        email_text = ' <EMAIL> ' if glove else ' EMAIL '
        sentences = [_EMAIL_PATTERN.sub(email_text, sentence) for sentence in sentences]
    if tokenize:
        # Use a private tokenizer instead of a module-level name: notebook cells
        # rebind the global `tokenizer` to a Hugging Face tokenizer.
        from nltk.tokenize.treebank import TreebankWordTokenizer
        word_tokenizer = TreebankWordTokenizer()
        token_lists = [word_tokenizer.tokenize(sentence) for sentence in sentences]
        # Join with a space so word boundaries survive.
        sentences = token_lists if glove else [' '.join(tokens) for tokens in token_lists]
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        # Lemmatize words, not characters: strings are split into words first.
        word_lists = [sentence.split() if isinstance(sentence, str) else sentence
                      for sentence in sentences]
        lemma_lists = [[lemmatizer.lemmatize(word) for word in words] for words in word_lists]
        sentences = lemma_lists if glove else [' '.join(lemmas) for lemmas in lemma_lists]
    if remove_punctuation:
        sentences = [re.sub(r'[^\w\s]', ' ', sentence) for sentence in sentences]
    if remove_special_characters:
        sentences = [re.sub(r'[^a-zA-Z0-9]', ' ', sentence) for sentence in sentences]
    if remove_specials_weird:
        sentences = [re.sub(r'[^a-zA-Z0-9]', '', sentence) for sentence in sentences]
    if lower:
        sentences = [sentence.lower() for sentence in sentences]
    if remove_stopwords:
        # Build the alternation once per call instead of once per sentence.
        stopword_pattern = re.compile(
            r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in stopwords.words('english')))
        )
        sentences = [stopword_pattern.sub(' ', sentence) for sentence in sentences]
    if replace_names or replace_orgs:
        # One NER pass serves both flags; only the requested label groups are replaced.
        placeholder_by_label = {}
        if replace_names:
            placeholder_by_label.update({'LABEL_1': 'PERSON', 'LABEL_2': 'PERSON'})
        if replace_orgs:
            placeholder_by_label.update({'LABEL_3': 'ORG', 'LABEL_4': 'ORG'})
        rebuilt_sentences = []
        for sentence in sentences:
            merged_entity = merge_entities(nlp(sentence))
            rebuilt_sentences.append(' '.join(
                placeholder_by_label.get(entity['entity'], entity['word'])
                for entity in merged_entity
            ))
        sentences = rebuilt_sentences
    if remove_endlines:
        # NOTE(review): only real newline characters are handled here; some rows shown
        # in this notebook's outputs appear to contain a literal backslash followed
        # by "n", which this step does not touch - confirm whether that is intended.
        sentences = [re.sub(r'\n', ' ', sentence) for sentence in sentences]
    if remove_whitespaces:
        sentences = [re.sub(r' {2,}', ' ', sentence) for sentence in sentences]
    return sentences

In [25]:
# Baseline run: one-vs-rest SVC (C=25, probability estimates enabled) on binary TF-IDF
# features over unigrams and bigrams; threshold=None and no NER placeholder replacement.
clf = OneVsRestClassifier(SVC(probability=True, C=25))
test_1 = train(clf, TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True, remove_special_characters=True, 
                                            replace_emails=True)

Number of missclassifications in class 0:  26.0 out of a total sample of:  16377.0  - about  0.16 % of the class was missclassified
Number of missclassifications in class 1:  25.0 out of a total sample of:  5393.0  - about  0.46 % of the class was missclassified


In [63]:
# Same configuration as the baseline, plus replace_names=True (person words become "PERSON").
clf = OneVsRestClassifier(SVC(probability=True, C=25))
only_names = train(clf, TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True, remove_special_characters=True, 
                                            replace_emails=True, replace_names=True) ## 0.26

Number of missclassifications in class 0:  19.0 out of a total sample of:  16377.0  - about  0.12 % of the class was missclassified
Number of missclassifications in class 1:  28.0 out of a total sample of:  5393.0  - about  0.52 % of the class was missclassified


In [34]:
# Same configuration as the baseline, plus replace_orgs=True (organisation words become "ORG").
clf = OneVsRestClassifier(SVC(probability=True, C=25))
only_orgs = train(clf, TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True, remove_special_characters=True, 
                                            replace_emails=True, replace_orgs=True) ## 0.26

Number of missclassifications in class 0:  31.0 out of a total sample of:  16377.0  - about  0.19 % of the class was missclassified
Number of missclassifications in class 1:  44.0 out of a total sample of:  5393.0  - about  0.82 % of the class was missclassified


In [238]:
# Element 2 of train()'s return value: (predicted, position, text) tuples for rows of X
# whose true class is 0 but which were classified differently.
only_orgs[2]

[(1,
  10,
  'copyright (c) 2012: Scott Jehl, Paul Irish, Nicholas Zakas. Dual MIT/BSD license */ NOTE: If you\'re already including a window.matchMedia polyfill via Modernizr or otherwise, you don\'t need this part */ window.matchMedia=window.matchMedia||function(a){"use strict";var c,d=a.documentElement,e=d.f'),
 (1,
  87,
  'copyright RSA Data Security, Inc. Their notice is reproduced below in its entirety.'),
 (1,
  184,
  "Copyright 1988,1990,1993 by Paul Vixie\\n\\nAll rights reserved\\n\\nDistribute freely, except: don't remove my name from the source or documentation (don't take credit for my work), mark your changes (don't get me blamed for your possible bugs), don't alter or remove this notice. May be sold if buildab"),
 (1,
  292,
  "Copyright Flag\x018BIM' Japanese Print Flags 8BIM\x03��\x17Color Halftone SettingsH/ff\x01lff /ff\x01������ 2\x01Z 5\x01- 8BIM\x03��\x17Color Transfer Settingsp��������������������������������������������\x03�������������������������������������

In [240]:
# Re-run the "only_orgs" preprocessing on the texts of the misclassified class-0 rows
# (third field of each tuple) to see what the classifier was actually given.
sentences = preprocess_function(
    [misclassified[2] for misclassified in only_orgs[2]],
    replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
    remove_whitespaces=True, lower=True, remove_special_characters=True,
    replace_emails=True, replace_orgs=True,
)

sentences

['copyright copyrightsymbol date scott jehl ORG irish nicholas zakas dual ORG ORG license note if you re already including a window matchmedia polyfill via modernizr or otherwise you don t need this part window matchmedia window matchmedia function a use strict var c d a documentelement e d f',
 'copyright rsa data security inc their notice is reproduced below in its entirety',
 'copyright date date date by paul vixie n nall rights reserved n ndistribute freely except don t remove my name from the source or documentation don t take credit for my work mark your changes don t get me blamed for your possible bugs don t alter or remove this notice may be sold if buildab',
 'copyright flag bim japanese print flags bim color halftone settingsh ff ORG ff z bim color transfer settingsp',
 'copyright date name',
 'copyright copyright pk agent ORG ORG ORG ORG ORG type copy startbyte copy endbyte is enabled values copyright copyrightsymbol date by author professional identification url xcd e e ae

In [82]:
# Same configuration as the baseline, with both replace_names=True and replace_orgs=True.
clf = OneVsRestClassifier(SVC(probability=True, C=25))
names_and_orgs = train(clf, TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True, remove_special_characters=True, 
                                            replace_emails=True, replace_names=True, replace_orgs=True) ## 0.26

Number of missclassifications in class 0:  26.0 out of a total sample of:  16377.0  - about  0.16 % of the class was missclassified
Number of missclassifications in class 1:  44.0 out of a total sample of:  5393.0  - about  0.82 % of the class was missclassified


In [83]:
# Misclassified rows of X whose true class is 0, from the "only_names" run,
# as (predicted, position, text) tuples.
only_names[2]

[(1,
  244,
  '��\tNU��!nep��g��[\x1fl��pj\x043K��Y����\x12y+ 4����y%��f\x083K��Y����\x12f��0����y%��VW{\x12��,af\t3K��Y��\x12����0������\x14af����5 ��\x19��9��������4v} ����\x1e��&����jC��r=\x1a��>7P'),
 (1,
  292,
  "Copyright Flag\x018BIM' Japanese Print Flags 8BIM\x03��\x17Color Halftone SettingsH/ff\x01lff /ff\x01������ 2\x01Z 5\x01- 8BIM\x03��\x17Color Transfer Settingsp��������������������������������������������\x03����������������������������������������������\x03����������������������������������������������\x03������������������������"),
 (1, 295, 'Copyright 2003 Name'),
 (1,
  637,
  "�� J��T����R7��T\x06o��������Jvow\x1er\ne��m��HA\x04D��������I��t��/����\x0c����Tt|��p\x07��������d\x05V��#��\x7f\x07������p����GM��4B����\x03��yD���� ����i��v\x19ST����Me����������\x13��S��q\x1cO��Gm[qN��mZ����Q+������eQ+|\x12.����5��\x19��8[\x160������Z��L��w��%��\x08��Ll<\x1aF\x1d��\x19Rbf��\x1br��b&9������Z[V��', ����>6��b0F��[f����a���� ������\x148����jr(G����Tg��O����D/������[N/h��\x01V�

In [84]:
# "only_names" configuration with threshold=0.99: inside train(), a sample whose highest
# class probability is not above 0.99 is assigned class 0.
clf = OneVsRestClassifier(SVC(probability=True, C=25))
only_names_99 = train(clf, TfidfVectorizer(ngram_range=(1, 2), binary=True), 0.99, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True, remove_special_characters=True, 
                                            replace_emails=True, replace_names=True) ## 0.26

Number of missclassifications in class 0:  4.0 out of a total sample of:  16377.0  - about  0.02 % of the class was missclassified
Number of missclassifications in class 1:  194.0 out of a total sample of:  5393.0  - about  3.6 % of the class was missclassified


In [86]:
# Misclassified rows of X whose true class is 0, from the thresholded run,
# as (predicted, position, text) tuples.
only_names_99[2]

[(1,
  11786,
  'Copyright (c) [year] [copyright holders] The Universal Permissive License (UPL), Version 1.0'),
 (1,
  13458,
  '(c) the European Community 2007\\n\\nThis European Union Public Licence (the \\"EUPL\\") applies to the Work or Software (as defined below) which is provided under the terms of this Licence. Any use of the Work, other than as authorised under this Licence is prohibited (to the extent such use is covered'),
 (1, 17694, 'Copyright © . All rights reserved.'),
 (1, 17927, 'Copyright 2003 Name')]

## Language Detection

In [91]:
import fasttext

# Load the fastText model file used for language identification in the cells
# below (path is relative to the notebook).
model = fasttext.load_model("../lid.176.ftz")

sentence = "This is an example sentence in English."
# k=1 asks for the single most likely label. `label` is a tuple such as
# ('__label__en',) and `score` the matching array of probabilities.
label, score = model.predict(sentence, k=1)

# Report the prediction: without these lines the cell computes a result but
# shows nothing, so the recorded output could not be reproduced on re-run.
language_code = label[0].split('__label__')[1]  # drop the '__label__' prefix
language_names = {'en': 'English'}  # display names; unknown codes are printed as-is
print(f"The sentence is in {language_names.get(language_code, language_code)}.")
print("The confidence score is", score)

The sentence is in English.
The confidence score is [0.95017874]




In [147]:
# Disable NumPy's summarisation threshold so arrays are printed in full.
# This is a global, kernel-wide setting and affects every later cell.
np.set_printoptions(threshold=float("inf"))

In [221]:
# Options common to the detection pass and the reporting pass below.
shared_options = dict(remove_endlines=True, remove_whitespaces=True,
                      replace_emails=True, replace_dates=True, remove_numbers=True,
                      remove_punctuation=True, replace_copyright_symbol_v2=True)

# Per-sample result slots (object dtype), pre-filled with 0..len(X)-1 and then
# overwritten by the detection loop.
languages = np.array(range(len(X)), dtype=object)
scores = np.array(range(len(X)), dtype=object)

# Pass 1: language identification on the preprocessed (not lower-cased) text.
for index, sentence in enumerate(preprocess_function(X, **shared_options)):
    label, score = model.predict(sentence, k=1)
    languages[index] = label[0].split('__label__')[1]  # strip the '__label__' prefix
    scores[index] = score[0]

# Pass 2: list confident non-English predictions. The text shown here comes from a
# second preprocessing run that additionally lower-cases the input.
for index, sentence in enumerate(preprocess_function(X, **shared_options, lower=True)):
    if languages[index] != 'en' and scores[index] > 0.8:
        print(index, sentence, scores[index])

303 copyright copyright symbol date suse linux products gmbh maciej warnecki maciekw gmail com date wadim dziedzic email  0.8305246233940125
304 copyright copyright symbol date suse linux products gmbh s maciej warnecki maciekw gmail com date wadim dziedzic email  0.8305246233940125
453 copyright copyright symbol date orange author drozdz bartlomiej email  0.9019572734832764
607 copyright date date ion gaztanaga 0.859765350818634
628 copyright copyright symbol date louis dionne 0.9576355218887329
687 copyright copyright symbol date date louis dionne 0.9576355218887329
730 copyright date date ion gaztanaga 0.9074114561080933
804 copyright date date ion gaztanaga 0.9074114561080933
811 copyright date date ion gaztanaga 0.9074114561080933
942 copyright date ion gaztanaga 0.9074114561080933
1153 copyright date maarten keijzer 0.8483233451843262
1357 copyright thijs van den berg date  0.8920164108276367
1408 copyright date giel van schijndel 0.9443939924240112
1418 copyright date maciej pie

In [167]:
# Keep a normalised copy of the corpus for inspection. The switches are the same
# set that is passed to train() for the `only_names_99` run.
preprocessed_X = preprocess_function(X, replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                        remove_whitespaces=True, lower=True, remove_special_characters=True, 
                                        replace_emails=True, replace_names=True)

In [176]:
# Reference text: item 453 as returned by preprocess_function with its default options.
preprocess_function(X)[453]

'Copyright (C) 2020 Orange Author: Drozdz Bartlomiej <bartlomiej.drozdz@orange.com>'

In [175]:
# The same item after the normalisation pass stored in preprocessed_X.
preprocessed_X[453]

'copyright copyrightsymbol date orange author drozdz PERSON email'

In [178]:
# Item 453 baseline: top-1 language prediction on the text produced with default options.
model.predict(preprocess_function(X)[453], k=1)

(('__label__pl',), array([0.76333958]))

In [179]:
# Item 453: enable remove_numbers only.
model.predict(preprocess_function(X, remove_numbers=True)[453], k=1)

(('__label__pl',), array([0.76333958]))

In [180]:
# Item 453: remove_numbers plus replace_dates.
model.predict(preprocess_function(X, remove_numbers=True, replace_dates=True)[453], k=1)

(('__label__pl',), array([0.76333958]))

In [181]:
# Item 453: additionally enable replace_emails.
model.predict(preprocess_function(X, remove_numbers=True, replace_dates=True, replace_emails=True)[453], k=1)

(('__label__pl',), array([0.45504943]))

In [183]:
# Item 453: numbers, dates and emails handled, plus replace_copyright_symbol_v2.
model.predict(preprocess_function(X, remove_numbers=True, replace_dates=True,
                                     replace_emails=True, replace_copyright_symbol_v2=True)[453], k=1)

(('__label__pl',), array([0.72843605]))

In [185]:
# Item 453: same as the previous probe but with replace_copyright_symbols instead of the v2 switch.
model.predict(preprocess_function(X, remove_numbers=True, replace_dates=True,
                                     replace_emails=True, replace_copyright_symbols=True)[453], k=1)

(('__label__pl',), array([0.72843605]))

In [187]:
# Item 453: numbers, dates and emails handled, plus remove_whitespaces and remove_endlines
# (no copyright-symbol replacement).
model.predict(preprocess_function(X, remove_numbers=True, replace_dates=True,
                                     replace_emails=True, remove_whitespaces=True, remove_endlines=True)[453], k=1)

(('__label__pl',), array([0.45504943]))

In [188]:
# Item 453: previous configuration plus remove_punctuation.
model.predict(preprocess_function(X, remove_numbers=True, replace_dates=True,
                                     replace_emails=True, remove_whitespaces=True, 
                                     remove_endlines=True, remove_punctuation=True)[453], k=1)

(('__label__pl',), array([0.89805561]))

In [197]:
# Item 453: swap remove_punctuation for lower to see the effect of lower-casing.
model.predict(preprocess_function(X, remove_numbers=True, replace_dates=True,
                                     replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True)[453], k=1)

(('__label__es',), array([0.26881203]))

In [199]:
# Item 453: reduced configuration -- emails, whitespace, end-of-lines and lower-casing only
# (no number or date handling).
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True)[453], k=1)

(('__label__pl',), array([0.29756558]))

In [216]:
# Item 453: reduced configuration plus replace_dates.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True,)[453], k=1)

(('__label__es',), array([0.26881203]))

In [217]:
# Item 453: reduced configuration plus replace_dates and replace_copyright_symbols.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True, replace_copyright_symbols=True)[453], k=1)

(('__label__pl',), array([0.28540385]))

In [218]:
# Item 453: reduced configuration plus replace_dates and replace_copyright_symbol_v2.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True, replace_copyright_symbol_v2=True)[453], k=1)

(('__label__en',), array([0.24512441]))

In [189]:
# Reference text: item 600 as returned by preprocess_function with its default options.
preprocess_function(X)[600]

'Copyright (c) Alexander Zaitsev &lt;zamazan4ik@gmail.com&gt;, 2016'

In [196]:
# Item 600: replace_emails, remove_whitespaces and remove_endlines, without lower-casing.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True, remove_endlines=True)[600], k=1)

(('__label__en',), array([0.28598779]))

In [200]:
# Item 600: numbers, dates, emails, whitespace and end-of-lines handled, with lower-casing.
model.predict(preprocess_function(X, remove_numbers=True, replace_dates=True,
                                     replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True)[600], k=1)

(('__label__en',), array([0.36918202]))

In [201]:
# Item 600: reduced configuration -- emails, whitespace, end-of-lines and lower-casing only.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True)[600], k=1)

(('__label__en',), array([0.40733448]))

In [202]:
# Item 600: reduced configuration plus remove_numbers.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     remove_numbers=True)[600], k=1)

(('__label__en',), array([0.39119032]))

In [203]:
# Item 600: reduced configuration plus replace_dates.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True)[600], k=1)

(('__label__en',), array([0.38340202]))

In [220]:
# Item 600: reduced configuration plus replace_dates and replace_copyright_symbols.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True, replace_copyright_symbols=True)[600], k=1)

(('__label__en',), array([0.34574115]))

In [219]:
# Item 600: reduced configuration plus replace_dates and replace_copyright_symbol_v2.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True, replace_copyright_symbol_v2=True)[600], k=1)

(('__label__en',), array([0.381634]))

In [204]:
# Reference text: item 700 as returned by preprocess_function with its default options.
preprocess_function(X)[700]

'(C) Copyright Synge Todo 2003'

In [207]:
# Item 700: reduced configuration -- emails, whitespace, end-of-lines and lower-casing only.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,)[700], k=1)

(('__label__es',), array([0.24165182]))

In [208]:
# Item 700: reduced configuration plus remove_numbers.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     remove_numbers=True)[700], k=1)

(('__label__es',), array([0.24165182]))

In [209]:
# Item 700: reduced configuration plus replace_dates.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True)[700], k=1)

(('__label__en',), array([0.21101804]))

In [210]:
# Item 700: reduced configuration plus replace_dates and lemmatize.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True, lemmatize=True)[700], k=1)

(('__label__en',), array([0.21101804]))

In [212]:
# Item 700: reduced configuration plus replace_dates and replace_copyright_symbols.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True, replace_copyright_symbols=True)[700], k=1)

(('__label__en',), array([0.19021918]))

In [213]:
# Item 700: reduced configuration plus replace_dates and replace_copyright_symbol_v2.
model.predict(preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                     remove_endlines=True, lower=True,
                                     replace_dates=True, replace_copyright_symbol_v2=True)[700], k=1)

(('__label__en',), array([0.22986507]))

In [222]:
# Preprocess the corpus once and reuse it: the original repeated the identical
# preprocess_function call a second time only to obtain the text for printing.
lang_id_texts = preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                    remove_endlines=True, lower=True,
                                    replace_dates=True, replace_copyright_symbol_v2=True)

# Non-English predictions are reported only above this confidence.
report_threshold = 0.8

# One result slot per sample. np.empty leaves object slots as None instead of
# pre-filling them with integers that could be mistaken for results.
languages = np.empty(len(X), dtype=object)
scores = np.empty(len(X), dtype=object)

for index, sentence in enumerate(lang_id_texts):
    label, score = model.predict(sentence, k=1)  # k=1: top prediction only
    languages[index] = label[0].split('__label__')[1]  # strip the '__label__' prefix
    scores[index] = score[0]
    if languages[index] != 'en' and scores[index] > report_threshold:
        print(index, sentence, scores[index])

1761 copyright louis dionne date  0.8385561108589172
1763 copyright louis dionne date  0.8385561108589172
3496 copyright date , a kaliforniai egyetem kormã¡nyzã³ja 0.8509538173675537
4192 copyright copyright symbol fsf. wszystkie prawa zastrzezone 0.8541680574417114
4660 copyright louis dionne, date - date  0.80096834897995
5614 copyright louis dionne date  0.8385561108589172
5615 copyright louis dionne date  0.8385561108589172
7675 copyright date baptiste lepilleur 0.8360307216644287
7678 copyright date baptiste lepilleur 0.8360307216644287
7682 copyright date baptiste lepilleur 0.8360307216644287
7687 copyright date baptiste lepilleur 0.8360307216644287
7716 copyright copyright symbol date bundesamt für sicherheit in der informationstechnik 0.8937627673149109
11907  copyright symbol de jouir de l'œuvre tel que permis par la lal (liberté de copier, diffuser, modifier) implique pour chacun la responsabilité de ses propres faits. 0.939477801322937
11908  copyright symbol posés par la la

In [223]:
# Preprocess the corpus once and reuse it: the original repeated the identical
# preprocess_function call a second time only to obtain the text for printing.
lang_id_texts = preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                    remove_endlines=True, lower=True,
                                    replace_dates=True, replace_copyright_symbol_v2=True)

# Looser cut-off than the 0.8 run: report non-English predictions above 0.5.
report_threshold = 0.5

# One result slot per sample. np.empty leaves object slots as None instead of
# pre-filling them with integers that could be mistaken for results.
languages = np.empty(len(X), dtype=object)
scores = np.empty(len(X), dtype=object)

for index, sentence in enumerate(lang_id_texts):
    label, score = model.predict(sentence, k=1)  # k=1: top prediction only
    languages[index] = label[0].split('__label__')[1]  # strip the '__label__' prefix
    scores[index] = score[0]
    if languages[index] != 'en' and scores[index] > report_threshold:
        print(index, sentence, scores[index])

79 copyright): new json hpp version 0.5109319090843201
607 copyright date - date ion gaztanaga 0.5460236072540283
628 copyright copyright symbol date louis dionne 0.5822166800498962
687 copyright copyright symbol date - date louis dionne 0.5374045968055725
730 copyright date - date ion gaztanaga 0.5460236072540283
804 copyright date - date ion gaztanaga 0.5460236072540283
811 copyright date - date ion gaztanaga 0.5460236072540283
942 copyright date ion gaztanaga 0.6646783351898193
1153 copyright date maarten keijzer 0.7063458561897278
1357 copyright thijs van den berg date  0.7731266617774963
1408 copyright date giel van schijndel 0.7564087510108948
1418 copyright date maciej piechotka authors: maciej piechotka 0.6345721483230591
1577 copyright date - date ion gaztanaga 0.5460236072540283
1622 copyright copyright symbol date tor brede vekterli 0.6641188263893127
1639 copyright date - date ion gaztanaga 0.5460236072540283
1761 copyright louis dionne date  0.8385561108589172
1763 copyrig

In [224]:
# Preprocess the corpus once and reuse it: the original repeated the identical
# preprocess_function call a second time only to obtain the text for printing.
lang_id_texts = preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                    remove_endlines=True, lower=True,
                                    replace_dates=True, replace_copyright_symbol_v2=True)

# One result slot per sample. np.empty leaves object slots as None instead of
# pre-filling them with integers that could be mistaken for results.
languages = np.empty(len(X), dtype=object)
scores = np.empty(len(X), dtype=object)

for index, sentence in enumerate(lang_id_texts):
    label, score = model.predict(sentence, k=1)  # k=1: top prediction only
    languages[index] = label[0].split('__label__')[1]  # strip the '__label__' prefix
    scores[index] = score[0]
    # No confidence cut-off here: every non-English prediction is listed.
    if languages[index] != 'en':
        print(index, sentence, scores[index])

2 copyright_list src/cli/fo_folder src/cli/fo_nomos_license_list src/cli/fo_wrapper.php src/cli/fossjobs src/cli/fossjobs.1 src/cli/fossjobs.html src/cli/fossjobs.txt src/cli/schema-export src/cli/tests/cli-xunit.xml src/cli/testupdate2.php src/cli/fo_import_licenses src/cli/fo_usergroup src/cli/fossu 0.24052397906780243
14  copyright symbol date siemens ag 0.26797372102737427
15  copyright symbol date siemens ag 0.26797372102737427
27  copyright symbol date - date siemens ag 0.2198590785264969
79 copyright): new json hpp version 0.5109319090843201
80 copyright): enable recursion test 0.162434920668602
145  copyright symbol date , date siemens ag 0.2198590785264969
147 copyright/mod_deps ./src/copyright/ copy ./src/delagent/mod_deps ./src/delagent/ copy ./src/mimetype/mod_deps ./src/mimetype/ copy ./src/nomos/mod_deps ./src/nomos/ copy ./src/ojo/mod_deps ./src/ojo/ copy ./src/pkgagent/mod_deps ./src/pkgagent/ copy ./src/scancode/mod_deps ./src/scancode/ copy ./src 0.22633352875709534
3

In [225]:
# Preprocess the corpus once and reuse it: the original repeated the identical
# preprocess_function call a second time only to obtain the text for printing.
# This variant also strips punctuation before language identification.
lang_id_texts = preprocess_function(X, replace_emails=True, remove_whitespaces=True,
                                    remove_endlines=True, lower=True,
                                    replace_dates=True, replace_copyright_symbol_v2=True,
                                    remove_punctuation=True)

# One result slot per sample. np.empty leaves object slots as None instead of
# pre-filling them with integers that could be mistaken for results.
languages = np.empty(len(X), dtype=object)
scores = np.empty(len(X), dtype=object)

for index, sentence in enumerate(lang_id_texts):
    label, score = model.predict(sentence, k=1)  # k=1: top prediction only
    languages[index] = label[0].split('__label__')[1]  # strip the '__label__' prefix
    scores[index] = score[0]
    # No confidence cut-off here: every non-English prediction is listed.
    if languages[index] != 'en':
        print(index, sentence, scores[index])

14  copyright symbol date siemens ag 0.26797372102737427
15  copyright symbol date siemens ag 0.26797372102737427
27  copyright symbol date date siemens ag 0.2198590785264969
28 copyright json output date 55f feat monk json output 2a397af feat nomos json output date 978 feat obligations extend datamodel and obligation management 0.20161619782447815
79 copyright new json hpp version 0.5036030411720276
145  copyright symbol date date siemens ag 0.2198590785264969
174 copyright copyright symbol date date world wide web consortium massachusetts institute of technology institut national de recherche en informatique et en automatique keio university all rights reserved http www w3 org consortium legal  0.27273884415626526
303 copyright copyright symbol date suse linux products gmbh maciej warnecki email date wadim dziedzic email  0.30346471071243286
304 copyright copyright symbol date suse linux products gmbh s maciej warnecki email date wadim dziedzic email  0.30346471071243286
311 copyrigh