# TF-IDF Testing in a clean notebook

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from tqdm import tqdm
import string
import scipy

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/jimbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jimbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jimbo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
data_0 = pd.read_csv('../datasets/fossology-master-corrected.csv')
X_0 = data_0["copyright"]
y_0 = data_0["falsePositive"]
X_0 = X_0.drop_duplicates()
y_0 = y_0[X_0.index]

data_1 = pd.read_csv('../datasets/kubernetes-master-corrected.csv')
X_1 = data_1["copyright"]
y_1 = data_1["falsePositive"]
X_1 = X_1.drop_duplicates()
y_1 = y_1[X_1.index]

data_2 = pd.read_csv('../datasets/tensorflow-master-corrected.csv')
X_2 = data_2["copyright"]
y_2 = data_2["falsePositive"]
X_2 = X_2.drop_duplicates()
y_2 = y_2[X_2.index]

data_3 = pd.read_csv('../datasets/fossology-provided-1-corrected.csv')

X_3 = data_3['copyright']
y_3 = data_3['falsePositive']
X_3 = X_3.drop_duplicates()
y_3 = y_3[X_3.index]

X = pd.concat([X_0, X_1, X_2, X_3])
y = pd.concat([y_0, y_1, y_2, y_3])

print('Class 0 Percentage: ', len(y[y == 0]) / len(y))
print('Class 1 Percentage: ', len(y[y == 1]) / len(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

Class 0 Percentage:  0.7522737712448323
Class 1 Percentage:  0.24772622875516767


In [4]:
def aggregate_reports(reports, print_aggregates=True):
    import pandas as pd
    import numpy as np
    dfs = []
    for metric in ['precision', 'recall', 'f1-score']:
        scores = []
        for report in reports:
            scores.append([report['0'][metric], report['1'][metric]])
        scores = np.array(scores)
        scores = scores[:, :2]
        mean_scores = np.mean(scores, axis=0)
        mean_scores = [f"{score:.6f}" for score in mean_scores]
        df = pd.DataFrame(scores, columns=['0', '1'])
        df.loc['Mean'] = mean_scores
        df['Metric'] = metric
        dfs.append(df)
    if print_aggregates:
        print("## Precision")
        print(dfs[0].to_markdown())
        print("## Recall")
        print(dfs[1].to_markdown())
        print("## F1-score")
        print(dfs[2].to_markdown())
    else:
        return dfs[0], dfs[1], dfs[2]

In [5]:
def get_missclassified_rows(X, y_true, y_pred):
    if type(y_true) != list:
        y_true = y_true.tolist()
    if type(y_pred) != list:
        y_pred = y_pred.tolist()
    if type(X) != list:
        X = X.tolist()
    missclassified_rows = []
    for i in range(len(y_true)):
        if y_true[i] != y_pred[i]:
            missclassified_rows.append(i)
    return [(y_pred[i], X[i]) for i in missclassified_rows]

## TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_1_tfidf = vectorizer.transform(X_1)
X_2_tfidf = vectorizer.transform(X_2)
X_3_tfidf = vectorizer.transform(X_3)
X_tfidf = vectorizer.transform(X)
from sklearn.svm import SVC
svm = SVC(probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = np.argmax(y_pred, axis=1)
y_pred_1_classification = np.argmax(y_pred_1, axis=1)
y_pred_2_classification = np.argmax(y_pred_2, axis=1)
y_pred_3_classification = np.argmax(y_pred_3, axis=1)
y_pred_4_classification = np.argmax(y_pred_4, axis=1)
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.990775 | 0.96461  | precision |
| 1    | 0.997835 | 0.956522 | precision |
| 2    | 1        | 0.94     | precision |
| 3    | 0.999241 | 0.99375  | precision |
| 4    | 0.997983 | 0.990392 | precision |
| Mean | 0.997167 | 0.969055 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.98804  | 0.972553 | recall   |
| 1    | 0.98927  | 0.990991 | recall   |
| 2    | 0.96129  | 1        | recall   |
| 3    | 0.999241 | 0.99375  | recall   |
| 4    | 0.996825 | 0.993881 | recall   |
| Mean | 0.986933 | 0.990235 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989406 | 0.968565 | f1-score |
| 1    | 0.993534 | 0.973451 | f1-score |
| 2    | 0.980263 | 0.969072 | f1-score |
| 3    | 0.999241 | 0.99375  | f1-score |
| 4    | 0.997403 | 0.992133 | f1

In [8]:
# Save the data to csv file
# test = pd.DataFrame(columns=['copyright', 'falsePositive', 'pred'])
# test['falsePositive'] = y
# test['pred'] = y_pred_4_classification
# test['copyright'] = X
# test.to_csv('test.csv', index=False)

In [10]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [24]:
def preprocess_function(sentences, lower=False, replace_copyright_symbols=False,
                        replace_dates=False, remove_numbers=False, remove_punctuation=False,
                        remove_special_characters=False, remove_whitespaces=False, remove_specials_weird=False,
                        remove_stopwords=False, replace_emails=False, replace_names=False, replace_orgs=False,
                        tokenize=False, lemmatize=False, glove=False):
    if type(sentences) is not list:
        sentences = sentences.to_list()
    if replace_dates:
        if glove:
            sentences = [re.sub(r'\d{4}', ' <DATE> ', sentence) for sentence in sentences]
        else:
            sentences = [re.sub(r'\d{4}', ' DATE ', sentence) for sentence in sentences]
    if remove_numbers:
        sentences = [re.sub(r'\d+', ' ', sentence) for sentence in sentences]
    if replace_copyright_symbols:
        if glove:
            symbol_text = ' <COPYRIGHT SYMBOL> '
        else:
            symbol_text = ' COPYRIGHTSYMBOL '
        sentences = [re.sub(r'©', symbol_text, sentence) for sentence in sentences]
        sentences = [re.sub(r'\(c\)', symbol_text, sentence) for sentence in sentences]
        sentences = [re.sub(r'\(C\)', symbol_text, sentence) for sentence in sentences]
    if replace_emails:
        if glove:
            email_text = ' <EMAIL> '
        else:
            email_text = ' EMAIL '
        sentences = [re.sub("""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""", email_text, sentence) for sentence in sentences]
    if replace_names:
        pass # TODO: Implement this using NER if needed
    if replace_orgs:
        pass # TODO: Implement this using NER if needed
    if remove_punctuation:
        sentences = [re.sub(r'[^\w\s]', ' ', sentence) for sentence in sentences]
    if remove_special_characters:
        sentences = [re.sub(r'[^a-zA-Z0-9]', ' ', sentence) for sentence in sentences]
    if remove_specials_weird:
        sentences = [re.sub(r'[^a-zA-Z0-9]', '', sentence) for sentence in sentences]
    if lower:
        sentences = [sentence.lower() for sentence in sentences]
    if remove_stopwords:
        sentences = [re.sub(r'\b(?:{})\b'.format('|'.join(stopwords.words('english'))), ' ', sentence)
                     for sentence in sentences]
    if remove_whitespaces:
        sentences = [re.sub(r' {2,}', ' ', sentence) for sentence in sentences]
    if tokenize:
        sentences = [tokenizer.tokenize(sentence) for sentence in sentences]
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        sentences = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in sentences]
    return sentences

In [7]:
# Train to save space in code
def train(svm, vectorizer, threshold, preprocess_function,**kwargs):
    X_train_tfidf = vectorizer.fit_transform(preprocess_function(X_train, **kwargs))
    X_test_tfidf = vectorizer.transform(preprocess_function(X_test, **kwargs))
    X_1_tfidf = vectorizer.transform(preprocess_function(X_1, **kwargs))
    X_2_tfidf = vectorizer.transform(preprocess_function(X_2, **kwargs))
    X_3_tfidf = vectorizer.transform(preprocess_function(X_3, **kwargs))
    X_tfidf = vectorizer.transform(preprocess_function(X, **kwargs))
    svm.fit(X_train_tfidf, y_train)
    if svm.probability:
        y_pred = svm.predict_proba(X_test_tfidf)
        y_pred_1 = svm.predict_proba(X_1_tfidf)
        y_pred_2 = svm.predict_proba(X_2_tfidf)
        y_pred_3 = svm.predict_proba(X_3_tfidf)
        y_pred_4 = svm.predict_proba(X_tfidf)
        if threshold is None:
            y_pred_classification = np.argmax(y_pred, axis=1)
            y_pred_1_classification = np.argmax(y_pred_1, axis=1)
            y_pred_2_classification = np.argmax(y_pred_2, axis=1)
            y_pred_3_classification = np.argmax(y_pred_3, axis=1)
            y_pred_4_classification = np.argmax(y_pred_4, axis=1)
        else:
            y_pred_classification = [np.argmax(y) if max(y) > threshold else 0 for y in y_pred]
            y_pred_1_classification = [np.argmax(y) if max(y) > threshold else 0 for y in y_pred_1]
            y_pred_2_classification = [np.argmax(y) if max(y) > threshold else 0 for y in y_pred_2]
            y_pred_3_classification = [np.argmax(y) if max(y) > threshold else 0 for y in y_pred_3]
            y_pred_4_classification = [np.argmax(y) if max(y) > threshold else 0 for y in y_pred_4]
    else:
        y_pred_classification = svm.predict(X_test_tfidf)
        y_pred_1_classification = svm.predict(X_1_tfidf)
        y_pred_2_classification = svm.predict(X_2_tfidf)
        y_pred_3_classification = svm.predict(X_3_tfidf)
        y_pred_4_classification = svm.predict(X_tfidf)
    report = classification_report(y_test, y_pred_classification, output_dict=True)
    report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
    report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
    report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
    report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
    #aggregate_reports([report, report_1, report_2, report_3, report_4])
    print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
    print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')
    return svm, vectorizer

In [34]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, lower=True)

Number of missclassifications in class 0:  56 out of a total sample of:  16377  - about  0.34 % of the class was missclassified
Number of missclassifications in class 1:  32 out of a total sample of:  5393  - about  0.59 % of the class was missclassified


In [32]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
replace_dates=True)

Number of missclassifications in class 0:  44 out of a total sample of:  16377  - about  0.27 % of the class was missclassified
Number of missclassifications in class 1:  31 out of a total sample of:  5393  - about  0.57 % of the class was missclassified


In [33]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
replace_copyright_symbols=True)

Number of missclassifications in class 0:  41 out of a total sample of:  16377  - about  0.25 % of the class was missclassified
Number of missclassifications in class 1:  38 out of a total sample of:  5393  - about  0.7 % of the class was missclassified


In [35]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
remove_numbers=True)

Number of missclassifications in class 0:  90 out of a total sample of:  16377  - about  0.55 % of the class was missclassified
Number of missclassifications in class 1:  60 out of a total sample of:  5393  - about  1.11 % of the class was missclassified


In [37]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
remove_numbers=True, remove_whitespaces=True,)

Number of missclassifications in class 0:  90 out of a total sample of:  16377  - about  0.55 % of the class was missclassified
Number of missclassifications in class 1:  58 out of a total sample of:  5393  - about  1.08 % of the class was missclassified


In [36]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
remove_punctuation=True)

Number of missclassifications in class 0:  52 out of a total sample of:  16377  - about  0.32 % of the class was missclassified
Number of missclassifications in class 1:  33 out of a total sample of:  5393  - about  0.61 % of the class was missclassified


In [38]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
remove_special_characters=True)

Number of missclassifications in class 0:  52 out of a total sample of:  16377  - about  0.32 % of the class was missclassified
Number of missclassifications in class 1:  34 out of a total sample of:  5393  - about  0.63 % of the class was missclassified


In [39]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
remove_special_characters=True, remove_whitespaces=True)

Number of missclassifications in class 0:  50 out of a total sample of:  16377  - about  0.31 % of the class was missclassified
Number of missclassifications in class 1:  35 out of a total sample of:  5393  - about  0.65 % of the class was missclassified


In [None]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function,
remove_specials_weird=True)

Number of missclassifications in class 0:  0 out of a total sample of:  16377  - about  0.0 % of the class was missclassified
Number of missclassifications in class 1:  970 out of a total sample of:  5393  - about  17.99 % of the class was missclassified


In [30]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function,
remove_specials_weird=True, remove_whitespaces=True)

Number of missclassifications in class 0:  0 out of a total sample of:  16377  - about  0.0 % of the class was missclassified
Number of missclassifications in class 1:  970 out of a total sample of:  5393  - about  17.99 % of the class was missclassified


In [40]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
replace_dates=True, remove_numbers=True)

Number of missclassifications in class 0:  47 out of a total sample of:  16377  - about  0.29 % of the class was missclassified
Number of missclassifications in class 1:  36 out of a total sample of:  5393  - about  0.67 % of the class was missclassified


In [41]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
replace_dates=True, remove_numbers=True, remove_whitespaces=True,)

Number of missclassifications in class 0:  45 out of a total sample of:  16377  - about  0.27 % of the class was missclassified
Number of missclassifications in class 1:  36 out of a total sample of:  5393  - about  0.67 % of the class was missclassified


In [42]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True)

Number of missclassifications in class 0:  43 out of a total sample of:  16377  - about  0.26 % of the class was missclassified
Number of missclassifications in class 1:  44 out of a total sample of:  5393  - about  0.82 % of the class was missclassified


In [43]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True, remove_whitespaces=True)

Number of missclassifications in class 0:  47 out of a total sample of:  16377  - about  0.29 % of the class was missclassified
Number of missclassifications in class 1:  44 out of a total sample of:  5393  - about  0.82 % of the class was missclassified


In [44]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
replace_dates=True, remove_numbers=True, replace_copyright_symbols=True, lower=True)

Number of missclassifications in class 0:  43 out of a total sample of:  16377  - about  0.26 % of the class was missclassified
Number of missclassifications in class 1:  44 out of a total sample of:  5393  - about  0.82 % of the class was missclassified


In [45]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True, remove_whitespaces=True, lower=True)

Number of missclassifications in class 0:  43 out of a total sample of:  16377  - about  0.26 % of the class was missclassified
Number of missclassifications in class 1:  44 out of a total sample of:  5393  - about  0.82 % of the class was missclassified


In [46]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True, remove_whitespaces=True, lower=True, remove_special_characters=True)

Number of missclassifications in class 0:  46 out of a total sample of:  16377  - about  0.28 % of the class was missclassified
Number of missclassifications in class 1:  49 out of a total sample of:  5393  - about  0.91 % of the class was missclassified


In [16]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, 
replace_dates=True, remove_numbers=True, lower=True)

Number of missclassifications in class 0:  47 out of a total sample of:  16377  - about  0.29 % of the class was missclassified
Number of missclassifications in class 1:  36 out of a total sample of:  5393  - about  0.67 % of the class was missclassified


In [47]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, replace_dates=True, 
remove_numbers=True, remove_special_characters=True)

Number of missclassifications in class 0:  45 out of a total sample of:  16377  - about  0.27 % of the class was missclassified
Number of missclassifications in class 1:  46 out of a total sample of:  5393  - about  0.85 % of the class was missclassified


In [48]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, replace_dates=True, 
remove_numbers=True, remove_special_characters=True, lower=True)

Number of missclassifications in class 0:  46 out of a total sample of:  16377  - about  0.28 % of the class was missclassified
Number of missclassifications in class 1:  46 out of a total sample of:  5393  - about  0.85 % of the class was missclassified


In [19]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function, replace_dates=True, 
remove_numbers=True, remove_special_characters=True, lower=True, replace_copyright_symbols=True)

Number of missclassifications in class 0:  45 out of a total sample of:  16377  - about  0.27 % of the class was missclassified
Number of missclassifications in class 1:  49 out of a total sample of:  5393  - about  0.91 % of the class was missclassified


In [20]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function,
remove_special_characters=True, lower=True)

Number of missclassifications in class 0:  51 out of a total sample of:  16377  - about  0.31 % of the class was missclassified
Number of missclassifications in class 1:  34 out of a total sample of:  5393  - about  0.63 % of the class was missclassified


In [21]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function,
remove_special_characters=True, lower=True, replace_copyright_symbols=True)

Number of missclassifications in class 0:  45 out of a total sample of:  16377  - about  0.27 % of the class was missclassified
Number of missclassifications in class 1:  44 out of a total sample of:  5393  - about  0.82 % of the class was missclassified


In [22]:
train(SVC(probability=True), TfidfVectorizer(), None, preprocess_function,
remove_special_characters=True, lower=True, replace_copyright_symbols=True,
 replace_dates=True, remove_numbers=True,)

Number of missclassifications in class 0:  46 out of a total sample of:  16377  - about  0.28 % of the class was missclassified
Number of missclassifications in class 1:  49 out of a total sample of:  5393  - about  0.91 % of the class was missclassified


## Best - Preprocessing method(s)

In [55]:
train(SVC(probability=True), TfidfVectorizer(), 0.9, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True, remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  16 out of a total sample of:  16377  - about  0.1 % of the class was missclassified
Number of missclassifications in class 1:  120 out of a total sample of:  5393  - about  2.23 % of the class was missclassified


In [57]:
train(SVC(probability=True), TfidfVectorizer(), 0.95, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True, remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  12 out of a total sample of:  16377  - about  0.07 % of the class was missclassified
Number of missclassifications in class 1:  169 out of a total sample of:  5393  - about  3.13 % of the class was missclassified


In [58]:
train(SVC(probability=True), TfidfVectorizer(), 0.99, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True, remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  5 out of a total sample of:  16377  - about  0.03 % of the class was missclassified
Number of missclassifications in class 1:  420 out of a total sample of:  5393  - about  7.79 % of the class was missclassified


In [56]:
train(SVC(probability=True), TfidfVectorizer(), 0.9, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True, remove_whitespaces=True, lower=True, remove_special_characters=True) ## 0.28

Number of missclassifications in class 0:  17 out of a total sample of:  16377  - about  0.1 % of the class was missclassified
Number of missclassifications in class 1:  130 out of a total sample of:  5393  - about  2.41 % of the class was missclassified


In [59]:
train(SVC(probability=True), TfidfVectorizer(), 0.95, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True, remove_whitespaces=True, lower=True, remove_special_characters=True) ## 0.28

Number of missclassifications in class 0:  12 out of a total sample of:  16377  - about  0.07 % of the class was missclassified
Number of missclassifications in class 1:  186 out of a total sample of:  5393  - about  3.45 % of the class was missclassified


In [60]:
train(SVC(probability=True), TfidfVectorizer(), 0.99, preprocess_function, replace_dates=True, 
remove_numbers=True, replace_copyright_symbols=True, remove_whitespaces=True, lower=True, remove_special_characters=True) ## 0.28

Number of missclassifications in class 0:  5 out of a total sample of:  16377  - about  0.03 % of the class was missclassified
Number of missclassifications in class 1:  428 out of a total sample of:  5393  - about  7.94 % of the class was missclassified


## Grid search to find the best TF-IDF paramter on the best preprocessing function

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC())
])

param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
    'tfidf__max_df': [0.5, 0.75, 0.85, 0.9, 0.95, 0.98, 0.99, 1.0],
    'tfidf__min_df': [1, 2, 3, 4, 5, 10],
    'tfidf__max_features': [None, 1000, 2000, 3000, 4000, 5000, 10000, 15000, 20000],
    'tfidf__binary': [True, False],
    'tfidf__dtype': [np.float64, np.float32, np.int64],
    'tfidf__norm': ['l2', 'l1', None],
    'tfidf__use_idf': [True, False],
    'tfidf__smooth_idf': [True, False],
    'tfidf__sublinear_tf': [True, False],
}

grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=5, n_jobs=-1, verbose=2)

grid_search.fit(preprocess_function(X_train, replace_dates=True, remove_numbers=True,
                replace_copyright_symbols=True, remove_whitespaces=True, lower=True),
                y_train)

print('Best score:', grid_search.best_score_)
print('Best parameters:', grid_search.best_params_)

In [None]:
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV 
from skopt.space import Real, Integer, Categorical
from skopt.space.transformers import CategoricalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC())
])

param_space = {
    'tfidf__ngram_range': Categorical([(1, 1), (1, 2), (1, 3), (1, 4)]),
    'tfidf__max_df': Real(0.5, 1.0),
    'tfidf__min_df': Integer(1, 10),
    'tfidf__max_features': Categorical([None, 1000, 2000, 3000, 4000, 5000, 10000, 15000, 20000]),
    'tfidf__binary': Categorical([True, False]),
    'tfidf__dtype': Categorical([np.float64, np.float32, np.int64]),
    'tfidf__norm': Categorical(['l2', 'l1', None]),  # Added a comma to separate the options properly.
    'tfidf__use_idf': Categorical([True, False]),
    'tfidf__smooth_idf': Categorical([True, False]),
    'tfidf__sublinear_tf': Categorical([True, False]),
}

bayes_search = BayesSearchCV(pipeline, param_space, scoring='f1', cv=5, n_jobs=-1, verbose=2)

bayes_search.fit(preprocess_function(X_train,
                                     replace_dates=True,
                                     remove_numbers=True,
                                     replace_copyright_symbols=True,
                                     remove_whitespaces=True,
                                     lower=True),
                 y_train)

print('Best score:', bayes_search.best_score_)
print('Best parameters:', bayes_search.best_params_)

In [112]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import numpy as np

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC())
])

param_dist = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
    'tfidf__max_df': np.linspace(0.5, 1.0, 100),  # Randomly sample from a range
    'tfidf__min_df': np.random.randint(1, 11, 10),  # Randomly sample from 1 to 10 with 10 samples
    'tfidf__max_features': [None, 1000, 2000, 3000, 4000, 5000, 10000, 15000, 20000],
    'tfidf__binary': [True, False],
    'tfidf__dtype': [np.float64, np.float32, np.float16], 
    'tfidf__norm': ['l2', 'l1', None],
    'tfidf__use_idf': [True, False],
    'tfidf__smooth_idf': [True, False],
    'tfidf__sublinear_tf': [True, False],
}

random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=1000, scoring='f1',
                                   cv=5, n_jobs=-1, verbose=2, random_state=42)

random_search.fit(preprocess_function(X_train, replace_dates=True, remove_numbers=True,
                                      replace_copyright_symbols=True, remove_whitespaces=True, lower=True),
                  y_train)

print('Best score:', random_search.best_score_)
print('Best parameters:', random_search.best_params_)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
[CV] END tfidf__binary=False, tfidf__dtype=<class 'numpy.float16'>, tfidf__max_df=0.8535353535353536, tfidf__max_features=1000, tfidf__min_df=7, tfidf__ngram_range=(1, 3), tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True; total time=   0.7s
[CV] END tfidf__binary=False, tfidf__dtype=<class 'numpy.float16'>, tfidf__max_df=0.8535353535353536, tfidf__max_features=1000, tfidf__min_df=7, tfidf__ngram_range=(1, 3), tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True; total time=   0.7s
[CV] END tfidf__binary=False, tfidf__dtype=<class 'numpy.float16'>, tfidf__max_df=0.8535353535353536, tfidf__max_features=1000, tfidf__min_df=7, tfidf__ngram_range=(1, 3), tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True; total time=   0.7s
[CV] END tfidf__binary=False, tfidf__dtype=<class 'numpy.float16'>, tfidf__max_df=0.85353535353535

1695 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1695 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jimbo/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jimbo/.local/lib/python3.8/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/jimbo/.local/lib/python3.8/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/jimbo/.local/lib/python3.8/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*

Best score: 0.9782228855623503
Best parameters: {'tfidf__use_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__smooth_idf': True, 'tfidf__norm': 'l2', 'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 2, 'tfidf__max_features': None, 'tfidf__max_df': 0.803030303030303, 'tfidf__dtype': <class 'numpy.float32'>, 'tfidf__binary': True}


In [14]:
train(SVC(probability=True), TfidfVectorizer(use_idf=True, sublinear_tf=False, smooth_idf=True, norm='l2',
                                            ngram_range=(1, 2), min_df=2, max_features=None, max_df=803030303030303,
                                            dtype=np.float32, binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  41 out of a total sample of:  16377  - about  0.25 % of the class was missclassified
Number of missclassifications in class 1:  27 out of a total sample of:  5393  - about  0.5 % of the class was missclassified


In [15]:
train(SVC(probability=True), TfidfVectorizer(use_idf=True, sublinear_tf=False, smooth_idf=True, norm='l2',
                                            ngram_range=(1, 3), min_df=2, max_features=None, max_df=803030303030303,
                                            dtype=np.float32, binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  42 out of a total sample of:  16377  - about  0.26 % of the class was missclassified
Number of missclassifications in class 1:  28 out of a total sample of:  5393  - about  0.52 % of the class was missclassified


In [13]:
train(SVC(probability=True), TfidfVectorizer(use_idf=True, sublinear_tf=False, smooth_idf=True, norm='l2',
                                            ngram_range=(1, 3), min_df=2, max_features=None, max_df=803030303030303,
                                            dtype=np.float32), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  45 out of a total sample of:  16377  - about  0.27 % of the class was missclassified
Number of missclassifications in class 1:  29 out of a total sample of:  5393  - about  0.54 % of the class was missclassified


In [16]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2)), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  37 out of a total sample of:  16377  - about  0.23 % of the class was missclassified
Number of missclassifications in class 1:  33 out of a total sample of:  5393  - about  0.61 % of the class was missclassified


In [17]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 3)), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  33 out of a total sample of:  16377  - about  0.2 % of the class was missclassified
Number of missclassifications in class 1:  34 out of a total sample of:  5393  - about  0.63 % of the class was missclassified


In [33]:
svm_best = train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  27 out of a total sample of:  16377  - about  0.16 % of the class was missclassified
Number of missclassifications in class 1:  28 out of a total sample of:  5393  - about  0.52 % of the class was missclassified


In [22]:
svm_best_2 = train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True, remove_special_characters=True) ## 0.26

Number of missclassifications in class 0:  29 out of a total sample of:  16377  - about  0.18 % of the class was missclassified
Number of missclassifications in class 1:  27 out of a total sample of:  5393  - about  0.5 % of the class was missclassified


In [25]:
svm_best_3 = train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  27 out of a total sample of:  16377  - about  0.16 % of the class was missclassified
Number of missclassifications in class 1:  34 out of a total sample of:  5393  - about  0.63 % of the class was missclassified


### Fixed the Replace emails - previously the best was 0.16%

In [12]:
svm_best_3 = train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  30 out of a total sample of:  16377  - about  0.18 % of the class was missclassified
Number of missclassifications in class 1:  33 out of a total sample of:  5393  - about  0.61 % of the class was missclassified


### Continue with non Fixed emails

In [24]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True, remove_stopwords=True) ## 0.26

Number of missclassifications in class 0:  30 out of a total sample of:  16377  - about  0.18 % of the class was missclassified
Number of missclassifications in class 1:  33 out of a total sample of:  5393  - about  0.61 % of the class was missclassified


(SVC(probability=True), TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [25]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True, replace_emails=True, 
                                            remove_special_characters=True) ## 0.26

Number of missclassifications in class 0:  29 out of a total sample of:  16377  - about  0.18 % of the class was missclassified
Number of missclassifications in class 1:  27 out of a total sample of:  5393  - about  0.5 % of the class was missclassified


(SVC(probability=True), TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [19]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 3), binary=True), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  29 out of a total sample of:  16377  - about  0.18 % of the class was missclassified
Number of missclassifications in class 1:  29 out of a total sample of:  5393  - about  0.54 % of the class was missclassified


In [20]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(2, 2)), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  65 out of a total sample of:  16377  - about  0.4 % of the class was missclassified
Number of missclassifications in class 1:  29 out of a total sample of:  5393  - about  0.54 % of the class was missclassified


In [21]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True, sublinear_tf=True), None,
                                            preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  27 out of a total sample of:  16377  - about  0.16 % of the class was missclassified
Number of missclassifications in class 1:  29 out of a total sample of:  5393  - about  0.54 % of the class was missclassified


In [22]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True, min_df=3), None,
                                            preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  44 out of a total sample of:  16377  - about  0.27 % of the class was missclassified
Number of missclassifications in class 1:  30 out of a total sample of:  5393  - about  0.56 % of the class was missclassified


In [23]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True, max_df=0.8), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  30 out of a total sample of:  16377  - about  0.18 % of the class was missclassified
Number of missclassifications in class 1:  24 out of a total sample of:  5393  - about  0.45 % of the class was missclassified


In [24]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True, max_df=0.9), None, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  30 out of a total sample of:  16377  - about  0.18 % of the class was missclassified
Number of missclassifications in class 1:  24 out of a total sample of:  5393  - about  0.45 % of the class was missclassified


In [26]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), 0.9, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  17 out of a total sample of:  16377  - about  0.1 % of the class was missclassified
Number of missclassifications in class 1:  88 out of a total sample of:  5393  - about  1.63 % of the class was missclassified


In [27]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), 0.95, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  14 out of a total sample of:  16377  - about  0.09 % of the class was missclassified
Number of missclassifications in class 1:  117 out of a total sample of:  5393  - about  2.17 % of the class was missclassified


In [28]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), 0.99, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  5 out of a total sample of:  16377  - about  0.03 % of the class was missclassified
Number of missclassifications in class 1:  248 out of a total sample of:  5393  - about  4.6 % of the class was missclassified


In [29]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), 0.995, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  4 out of a total sample of:  16377  - about  0.02 % of the class was missclassified
Number of missclassifications in class 1:  313 out of a total sample of:  5393  - about  5.8 % of the class was missclassified


In [30]:
train(SVC(probability=True), TfidfVectorizer(ngram_range=(1, 2), binary=True), 0.999, preprocess_function,
                                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True) ## 0.26

Number of missclassifications in class 0:  3 out of a total sample of:  16377  - about  0.02 % of the class was missclassified
Number of missclassifications in class 1:  449 out of a total sample of:  5393  - about  8.33 % of the class was missclassified


## Get the best SVM Parameters

In [21]:
train(SVC(), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  36 out of a total sample of:  16377  - about  0.22 % of the class was missclassified
Number of missclassifications in class 1:  26 out of a total sample of:  5393  - about  0.48 % of the class was missclassified


(SVC(), TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [22]:
train(SVC(probability=True, kernel='poly'), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  52 out of a total sample of:  16377  - about  0.32 % of the class was missclassified
Number of missclassifications in class 1:  23 out of a total sample of:  5393  - about  0.43 % of the class was missclassified


(SVC(kernel='poly', probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [23]:
train(SVC(probability=True, kernel='linear'), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  35 out of a total sample of:  16377  - about  0.21 % of the class was missclassified
Number of missclassifications in class 1:  27 out of a total sample of:  5393  - about  0.5 % of the class was missclassified


(SVC(kernel='linear', probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [24]:
train(SVC(probability=True, kernel='sigmoid'), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  54 out of a total sample of:  16377  - about  0.33 % of the class was missclassified
Number of missclassifications in class 1:  35 out of a total sample of:  5393  - about  0.65 % of the class was missclassified


(SVC(kernel='sigmoid', probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [38]:
train(SVC(probability=True, C=0.1), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  333 out of a total sample of:  16377  - about  2.03 % of the class was missclassified
Number of missclassifications in class 1:  58 out of a total sample of:  5393  - about  1.08 % of the class was missclassified


(SVC(C=0.1, probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [39]:
train(SVC(probability=True, C=2), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  24 out of a total sample of:  16377  - about  0.15 % of the class was missclassified
Number of missclassifications in class 1:  25 out of a total sample of:  5393  - about  0.46 % of the class was missclassified


(SVC(C=2, probability=True), TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [40]:
train(SVC(probability=True, C=10), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  24 out of a total sample of:  16377  - about  0.15 % of the class was missclassified
Number of missclassifications in class 1:  27 out of a total sample of:  5393  - about  0.5 % of the class was missclassified


(SVC(C=10, probability=True), TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [41]:
train(SVC(probability=True, C=100), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  24 out of a total sample of:  16377  - about  0.15 % of the class was missclassified
Number of missclassifications in class 1:  26 out of a total sample of:  5393  - about  0.48 % of the class was missclassified


(SVC(C=100, probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [42]:
train(SVC(probability=True, C=1000), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  25 out of a total sample of:  16377  - about  0.15 % of the class was missclassified
Number of missclassifications in class 1:  26 out of a total sample of:  5393  - about  0.48 % of the class was missclassified


(SVC(C=1000, probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [43]:
train(SVC(probability=True, gamma='auto'), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  853 out of a total sample of:  16377  - about  5.21 % of the class was missclassified
Number of missclassifications in class 1:  41 out of a total sample of:  5393  - about  0.76 % of the class was missclassified


(SVC(gamma='auto', probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [46]:
train(SVC(probability=True, kernel='linear', C=0.1), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  183 out of a total sample of:  16377  - about  1.12 % of the class was missclassified
Number of missclassifications in class 1:  97 out of a total sample of:  5393  - about  1.8 % of the class was missclassified


(SVC(C=0.1, kernel='linear', probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [50]:
train(SVC(probability=True, kernel='linear', C=2), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  30 out of a total sample of:  16377  - about  0.18 % of the class was missclassified
Number of missclassifications in class 1:  23 out of a total sample of:  5393  - about  0.43 % of the class was missclassified


(SVC(C=2, kernel='linear', probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [49]:
train(SVC(probability=True, kernel='linear', C=10), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  29 out of a total sample of:  16377  - about  0.18 % of the class was missclassified
Number of missclassifications in class 1:  25 out of a total sample of:  5393  - about  0.46 % of the class was missclassified


(SVC(C=10, kernel='linear', probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [47]:
train(SVC(probability=True, kernel='linear', C=100), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  34 out of a total sample of:  16377  - about  0.21 % of the class was missclassified
Number of missclassifications in class 1:  24 out of a total sample of:  5393  - about  0.45 % of the class was missclassified


(SVC(C=100, kernel='linear', probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

In [48]:
train(SVC(probability=True, kernel='linear', C=1000), TfidfVectorizer(ngram_range=(1, 2), binary=True), None, preprocess_function,
                            replace_dates=True, remove_numbers=True, replace_copyright_symbols=True,
                            remove_whitespaces=True, lower=True, replace_emails=True) ## 0.26

Number of missclassifications in class 0:  52 out of a total sample of:  16377  - about  0.32 % of the class was missclassified
Number of missclassifications in class 1:  29 out of a total sample of:  5393  - about  0.54 % of the class was missclassified


(SVC(C=1000, kernel='linear', probability=True),
 TfidfVectorizer(binary=True, ngram_range=(1, 2)))

## LDA to get words associated with each class

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Create a CountVectorizer to convert text data into a bag-of-words representation
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(preprocess_function(X, lower=True, replace_copyright_symbols=True,
                                                 replace_dates=True, remove_numbers=True, 
                                                 remove_special_characters=True, remove_whitespaces=True,
                                                 replace_emails=True, remove_stopwords=True,))

# Instantiate and fit an LDA model
n_topics = len(set(y))  # One topic per class
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X_vectorized)

# Get the most common words associated with each class
n_words = 20  # Number of words to display for each class
feature_names = vectorizer.get_feature_names_out()

# Group sentences by class
class_sentences = {}
for sentence, label in zip(X, y):
    if label not in class_sentences:
        class_sentences[label] = []
    class_sentences[label].append(sentence)

# Print the top words associated with each class
for class_label, class_index in zip(class_sentences.keys(), range(n_topics)):
    topic = lda.components_[class_index]
    top_words_idx = topic.argsort()[:-n_words - 1:-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Class '{class_label}': {', '.join(top_words)}")


Class '1': copyright, license, testdata, agent, filechecksum, fossology, copy, sha, use, tests, software, master, source, file, notice, rights, code, may, md, work
Class '0': date, copyright, copyrightsymbol, inc, software, free, foundation, reserved, com, rights, corporation, org, siemens, others, text, nathan, university, ag, lt, gt


## Sneaky test on Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

vectorizer = TfidfVectorizer(ngram_range=(1, 2), binary=True)
vectorizer.fit_transform(preprocess_function(X_train, replace_dates=True, remove_numbers=True,
                            replace_copyright_symbols=True, remove_whitespaces=True, lower=True,
                            replace_emails=True))

rf = RandomForestClassifier()
rf.fit(vectorizer.transform(preprocess_function(X_train, replace_dates=True, remove_numbers=True,
                            replace_copyright_symbols=True, remove_whitespaces=True, lower=True,
                            replace_emails=True)), y_train)

y_pred_rf = rf.predict(vectorizer.transform(preprocess_function(X, replace_dates=True, remove_numbers=True,
                            replace_copyright_symbols=True, remove_whitespaces=True, lower=True,
                            replace_emails=True)))
report = classification_report(y_test, y_pred_rf, output_dict=True)

print(aggregate_reports([report]))

print('Number of missclassifications in class 0: ', report['0']['support'] - round(report['0']['recall'] * report['0']['support']), 'out of a total sample of: ', report['0']['support'], ' - about ', round((1 - report['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report['1']['support'] - round(report['1']['recall'] * report['1']['support']), 'out of a total sample of: ', report['1']['support'], ' - about ', round((1 - report['1']['recall']) * 100, 2), '% of the class was missclassified')

ValueError: Found input variables with inconsistent numbers of samples: [4354, 21770]

# Feature extraction

In [None]:
# The authors of this paper: https://www.jstage.jst.go.jp/article/transinf/E103.D/12/E103.D_2020EDL8089/_pdf
# Used 5 features:
# Category Token Example
# Copyright-related-keyword: COPYRIGHT copyright, author
# Copyright-related-signal: SIGNAL c , (C), (c)
#Year YEAR 1991, 2002, 2013
#E-mail address EMAIL addr@email.com
#Others OTHER license, above

In [21]:
TOKENS = {
    "COPYRIGHT": "COPYRIGHT",
    "SIGNAL": "SIGNAL",
    "YEAR": "YEAR",
    "EMAIL": "EMAIL",
    "OTHER": "OTHER"
}

PATTERNS = {
    "COPYRIGHT": re.compile(r"(copyright|author|maintainer(s)*|owner(s)*|creator(s)*|holder(s)*|licensor(s)*)\b", re.I),
    "SIGNAL": re.compile(r"\(c\)|©|\(C\)"),
    "YEAR": re.compile(r"\b\d{4}\b"),
    "EMAIL": re.compile(r"\b[\w.-]+@[\w.-]+.\w+\b"),
    "LICENSE": re.compile(r"\b(license|licensed)\b", re.I),
    "ALL_RIGHTS_RESERVED": re.compile(r"\brights|reserved\b", re.I),
    "OTHER": re.compile(r"\bInc\b")
}

def feature_extraction(sentences):
    vectors = []
    for sentence in sentences:
        counts = {}
        for token in TOKENS:
            counts[token] = 0
            matches = PATTERNS[token].findall(sentence)
            sentence = PATTERNS[token].sub(TOKENS[token], sentence)
            counts[token] += len(matches)
        counts[TOKENS["OTHER"]] = len(sentence.split())
        vector = list(counts.values())
        vectors.append(vector)
    return vectors

X_features = feature_extraction(X)

In [22]:
X_train_feat = feature_extraction(X_train)
X_test_feat = feature_extraction(X_test)
X_1_feat = feature_extraction(X_1)
X_2_feat = feature_extraction(X_2)
X_3_feat = feature_extraction(X_3)
X_feat = feature_extraction(X)
from sklearn.svm import SVC
svm = SVC(probability=True)
svm.fit(X_train_feat, y_train)
y_pred = svm.predict_proba(X_test_feat)
y_pred_1 = svm.predict_proba(X_1_feat)
y_pred_2 = svm.predict_proba(X_2_feat)
y_pred_3 = svm.predict_proba(X_3_feat)
y_pred_4 = svm.predict_proba(X_feat)
y_pred_classification = np.argmax(y_pred, axis=1)
y_pred_1_classification = np.argmax(y_pred_1, axis=1)
y_pred_2_classification = np.argmax(y_pred_2, axis=1)
y_pred_3_classification = np.argmax(y_pred_3, axis=1)
y_pred_4_classification = np.argmax(y_pred_4, axis=1)
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.977175 | 0.916367 | precision |
| 1    | 0.980392 | 0.609467 | precision |
| 2    | 0.791946 | 0.63     | precision |
| 3    | 0.989977 | 0.816667 | precision |
| 4    | 0.977019 | 0.91321  | precision |
| Mean | 0.943302 | 0.777142 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.971481 | 0.932296 | recall   |
| 1    | 0.858369 | 0.927928 | recall   |
| 2    | 0.76129  | 0.670213 | recall   |
| 3    | 0.974943 | 0.91875  | recall   |
| 4    | 0.970874 | 0.930651 | recall   |
| Mean | 0.907391 | 0.875968 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.97432  | 0.924263 | f1-score |
| 1    | 0.915332 | 0.735714 | f1-score |
| 2    | 0.776316 | 0.649485 | f1-score |
| 3    | 0.982402 | 0.864706 | f1-score |
| 4    | 0.973936 | 0.921848 | f1

In [23]:
get_missclassified_rows(X, y, y_pred_4_classification)

[(1, 'Copyright: Fossology contributors License: GPL-2.0-only'),
 (1,
  'copyrights Remove OpenSSL dependency and use `libgcrypt` Removal of redundant MD5 checksum from `licenseRef.json`'),
 (0,
 (0, 'Copyright (C) <year> <name of author>'),
 (1,
  'copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally.'),
 (0, 'Copyright (c) <year> <owner>.'),
 (0, 'Copyright (c) <year> <copyright holders>'),
 (0, 'Copyright (c) <YEAR> <COPYRIGHT HOLDERS>'),
 (1,
  'Copyright © [$date-of-software] World Wide Web Consortium, (Massachusetts Institute of Technology, Institut National de Recherche en Informatique et en Automatique, Keio University). All Rights Reserved. http://www.w3.org/Consortium/Legal/" 3. Notice of any changes or modifications to the W3C f'),
 

## Quick test on fossology-provided-2 True Positives

In [26]:
foss_provided_2 = pd.read_csv('../datasets/fossology-provided-2.csv')
foss_provided_2 = foss_provided_2[foss_provided_2['falsePositive'] == 0]
foss_provided_2['copyright'] = foss_provided_2['copyright'].astype(str)
y_pred_foss_provided_2 = svm_best_3[0].predict_proba(svm_best_3[1].transform(preprocess_function(foss_provided_2['copyright'],
                                            replace_dates=True, remove_numbers=True,
                                            replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True)))
y_pred_foss_provided_2 = np.argmax(y_pred_foss_provided_2, axis=1)
z = classification_report(foss_provided_2['falsePositive'], y_pred_foss_provided_2, output_dict=True)
print('Number of missclassifications in class 0: ', z['0.0']['support'] - round(z['0.0']['recall'] * z['0.0']['support']), 'out of a total sample of: ', z['0.0']['support'], ' - about ', round((1 - z['0.0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', z['1.0']['support'] - round(z['1.0']['recall'] * z['1.0']['support']), 'out of a total sample of: ', z['1.0']['support'], ' - about ', round((1 - z['1.0']['recall']) * 100, 2), '% of the class was missclassified')

Number of missclassifications in class 0:  27 out of a total sample of:  5808  - about  0.46 % of the class was missclassified
Number of missclassifications in class 1:  0 out of a total sample of:  0  - about  100.0 % of the class was missclassified


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
get_missclassified_rows(foss_provided_2['copyright'], foss_provided_2['falsePositive'], y_pred_foss_provided_2)

[(1,
  'COPYRIGHT SIGN 00A4;currency;CURRENCY SIGN 0064;d;LATIN SMALL LETTER D 2020;dagger;DAGGER 2021;daggerdbl;DOUBLE DAGGER 010F;dcaron;LATIN SMALL LETTER D WITH CARON 0111;dcroat;LATIN SMALL LETTER D WITH STROKE 00B0;degree;DEGREE SIGN 03B4;delta;GREEK SMALL LETTER DELTA 2666;diamond;BLACK DIAMOND SUIT 00A8;dieresis;DIAERESIS 0385;dieresistonos;GREEK DIALYTIKA TONOS 00F7;divide;DIVISION SIGN 2593;dkshade;DARK SHADE 2584;dnblock;LOWER HALF BLOCK 0024;dollar;DOLLAR SIGN 20AB;dong;DONG SIGN 02D9;dotaccent;DOT ABOVE 0323;dotbelowcomb;COMBINING DOT BELOW 0131;dotlessi;LATIN SMALL LETTER DOTLESS I 22C5;dotmath;DOT OPERATOR 0065;e;LATIN SMALL LETTER E 00E9;eacute;LATIN SMALL LETTER E WITH ACUTE 0115;ebreve;LATIN SMALL LETTER E WITH BREVE 011B;ecaron;LATIN SMALL LETTER E WITH CARON 00EA;ecircumflex;LATIN SMALL LETTER E WITH CIRCUMFLEX 00EB;edieresis;LATIN SMALL LETTER E WITH DIAERESIS 0117;edotaccent;LATIN SMALL LETTER E WITH DOT ABOVE 00E8;egrave;LATIN SMALL LETTER E WITH GRAVE 0038;eight

In [121]:
foss_provided_2 = pd.read_csv('../datasets/fossology-provided-2.csv')
foss_provided_2 = foss_provided_2[foss_provided_2['falsePositive'] == 0]
foss_provided_2['copyright'] = foss_provided_2['copyright'].astype(str)
y_pred_foss_provided_2 = svm_best_3[0].predict_proba(svm_best_3[1].transform(preprocess_function(foss_provided_2['copyright'],
                                            replace_dates=True, remove_numbers=True,
                                            replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True)))
y_pred_foss_provided_2 = [np.argmax(y) if max(y) > 0.9 else 0 for y in y_pred_foss_provided_2]
z = classification_report(foss_provided_2['falsePositive'], y_pred_foss_provided_2, output_dict=True)
print('Number of missclassifications in class 0: ', z['0.0']['support'] - round(z['0.0']['recall'] * z['0.0']['support']), 'out of a total sample of: ', z['0.0']['support'], ' - about ', round((1 - z['0.0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', z['1.0']['support'] - round(z['1.0']['recall'] * z['1.0']['support']), 'out of a total sample of: ', z['1.0']['support'], ' - about ', round((1 - z['1.0']['recall']) * 100, 2), '% of the class was missclassified')

Number of missclassifications in class 0:  21 out of a total sample of:  5808  - about  0.36 % of the class was missclassified
Number of missclassifications in class 1:  0 out of a total sample of:  0  - about  100.0 % of the class was missclassified


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
get_missclassified_rows(foss_provided_2['copyright'], foss_provided_2['falsePositive'], y_pred_foss_provided_2)

[(1,
  'COPYRIGHT SIGN 00A4;currency;CURRENCY SIGN 0064;d;LATIN SMALL LETTER D 2020;dagger;DAGGER 2021;daggerdbl;DOUBLE DAGGER 010F;dcaron;LATIN SMALL LETTER D WITH CARON 0111;dcroat;LATIN SMALL LETTER D WITH STROKE 00B0;degree;DEGREE SIGN 03B4;delta;GREEK SMALL LETTER DELTA 2666;diamond;BLACK DIAMOND SUIT 00A8;dieresis;DIAERESIS 0385;dieresistonos;GREEK DIALYTIKA TONOS 00F7;divide;DIVISION SIGN 2593;dkshade;DARK SHADE 2584;dnblock;LOWER HALF BLOCK 0024;dollar;DOLLAR SIGN 20AB;dong;DONG SIGN 02D9;dotaccent;DOT ABOVE 0323;dotbelowcomb;COMBINING DOT BELOW 0131;dotlessi;LATIN SMALL LETTER DOTLESS I 22C5;dotmath;DOT OPERATOR 0065;e;LATIN SMALL LETTER E 00E9;eacute;LATIN SMALL LETTER E WITH ACUTE 0115;ebreve;LATIN SMALL LETTER E WITH BREVE 011B;ecaron;LATIN SMALL LETTER E WITH CARON 00EA;ecircumflex;LATIN SMALL LETTER E WITH CIRCUMFLEX 00EB;edieresis;LATIN SMALL LETTER E WITH DIAERESIS 0117;edotaccent;LATIN SMALL LETTER E WITH DOT ABOVE 00E8;egrave;LATIN SMALL LETTER E WITH GRAVE 0038;eight

## Test data from a paper

In [82]:
feature_extraction_paper_df = pd.DataFrame(columns=['copyright', 'falsePositive'])
with open('../ML_copyright-master/cr_neg', 'r') as file:
    data = file.readlines() # remvove \n
    data = [line.strip() for line in data] # remove \n
    for line in data:
        feature_extraction_paper_df = pd.concat([feature_extraction_paper_df, 
        pd.DataFrame({'falsePositive': 1, 'copyright': [line]})], ignore_index=True)
with open('../ML_copyright-master/cr_pos', 'r') as file:
    data = file.readlines() # remvove \n
    data = [line.strip() for line in data] # remove \n
    for line in data:
        feature_extraction_paper_df = pd.concat([feature_extraction_paper_df, 
        pd.DataFrame({'falsePositive': 0, 'copyright': [line]})], ignore_index=True)
feature_extraction_paper_df['falsePositive'].value_counts()

0    2146
1     151
Name: falsePositive, dtype: int64

In [84]:
feature_extraction_paper_df_pred = svm_best_3[0].predict_proba(svm_best_3[1].transform(preprocess_function(feature_extraction_paper_df['copyright'],
                                            replace_dates=True, remove_numbers=True,
                                            replace_copyright_symbols=True,
                                            remove_whitespaces=True, lower=True)))
feature_extraction_paper_df_pred = np.argmax(feature_extraction_paper_df_pred, axis=1)
z = classification_report(feature_extraction_paper_df['falsePositive'].to_list(), feature_extraction_paper_df_pred, output_dict=True)
aggregate_reports([z])
print('Number of missclassifications in class 0: ', z['0']['support'] - round(z['0']['recall'] * z['0']['support']), 'out of a total sample of: ', z['0']['support'], ' - about ', round((1 - z['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', z['1']['support'] - round(z['1']['recall'] * z['1']['support']), 'out of a total sample of: ', z['1']['support'], ' - about ', round((1 - z['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.999068 | 0.986755 | precision |
| Mean | 0.999068 | 0.986755 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.999068 | 0.986755 | recall   |
| Mean | 0.999068 | 0.986755 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.999068 | 0.986755 | f1-score |
| Mean | 0.999068 | 0.986755 | f1-score |
Number of missclassifications in class 0:  2 out of a total sample of:  2146  - about  0.09 % of the class was missclassified
Number of missclassifications in class 1:  2 out of a total sample of:  151  - about  1.32 % of the class was missclassified


In [85]:
get_missclassified_rows(feature_extraction_paper_df['copyright'], feature_extraction_paper_df['falsePositive'], feature_extraction_paper_df_pred)

[(0, 'without fee is hereby granted, provided that the above copyright'),
 (0, '- copyright.xml (   1597 bytes, from 2016-10-29 07:29:22)'),
 (1, 'Copyright Information:'),
 (1, '11-18-14  02.00.36  Updated copyright information.')]

# Language Detection

In [13]:
import spacy
import spacy_fastlang
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('language_detector')



<spacy_fastlang.LanguageDetector at 0x7fa8cf803b50>

In [14]:
preprocessed_X = preprocess_function(X, replace_dates=True, remove_numbers=True,
                                    replace_copyright_symbols=True,
                                    remove_whitespaces=True, lower=True, replace_emails=True,
                                    remove_special_characters=True)
languages = [(nlp(preprocessed_x)._.language, nlp(preprocessed_x)._.language_score) for preprocessed_x in tqdm(preprocessed_X)]

100%|██████████| 21770/21770 [03:38<00:00, 99.70it/s] 


In [15]:
pd_x = pd.DataFrame(X)
pd_x['falsePositive'] = y
pd_x['language'] = 'en'
pd_x['language_score'] = 0
for i in range(len(X)):
    pd_x.loc[i, 'language'] = languages[i][0]
    pd_x.loc[i, 'language_score'] = languages[i][1]

In [16]:
pd_x_not_en = pd_x[pd_x['language'] != 'en']
pd_x_not_en = pd_x_not_en.dropna()
pd_x_not_en['copyright'][0:-1].to_numpy()

array(['© Gaurav Mishra <mishra.gaurav@siemens.com>', '© 2021 Siemens AG',
       '© 2022 Siemens AG', ...,
       '© položky. extensions &lt;prípona&gt;[,&lt;prípona&gt;]... Spracovať iba súbory so zadanými príponami.',
       '© súbory?</translation> message> message> source>The file &apos;%1&apos; does not seem to be related to the file &apos;%2&apos; which is being loaded as well.',
       '© súbory GNU Gettext</translation> message> message> source>GNU Gettext localization template files</source> translation>Súbory s prekladovými šablónami GNU Gettext</translation> message> message> source>Compiled Qt translations</source>'],
      dtype=object)

In [17]:
len(pd_x_not_en)

1478

In [126]:
pd_x_not_en.head()

Unnamed: 0,copyright,falsePositive,language,language_score
13,© Gaurav Mishra <mishra.gaurav@siemens.com>,0.0,it,0.297793
14,© 2021 Siemens AG,0.0,it,0.299393
15,© 2022 Siemens AG,0.0,it,0.299393
23,© 2021 Siemens AG Author: Gaurav Mishra <mishr...,0.0,it,0.314305
27,© 2014-2021 Siemens AG,0.0,it,0.253543


In [125]:
pd_x_not_en['copyright'].to_list()

['© Gaurav Mishra <mishra.gaurav@siemens.com>',
 '© 2021 Siemens AG',
 '© 2022 Siemens AG',
 '© 2021 Siemens AG Author: Gaurav Mishra <mishra.gaurav@siemens.com>',
 '© 2014-2021 Siemens AG',
 'copyright): JSON output 956855f` feat(monk): JSON output 2a397af` feat(nomos): JSON output 5439978` feat(obligations): extend datamodel and obligation management',
 'copyright): New JSON hpp version',
 'copyright): Remove DISABLE_JSON macro',
 '© fabio.huser@siemens.com',
 '© 2016,2022 Siemens AG',
 '© mishra.gaurav@siemens.com',
 'Copyright (c) 1994-2002 World Wide Web Consortium, (Massachusetts Institute of Technology, Institut National de Recherche en Informatique et en Automatique, Keio University). All Rights Reserved. http://www.w3.org/Consortium/Legal/',
 'Copyright (C) 2013-2015 Siemens AG',
 'Copyright (C) 2014-2017 Siemens AG',
 'Copyright (C) 2013-2014 Siemens AG',
 'Copyright (C) 2013-2017 Siemens AG',
 'Copyright (C) 2014, 2018 Siemens AG',
 'Copyright (C) 2014,2019 Siemens AG',
 'Co