In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df1 = pd.read_csv('cleared_datasets/fossology-master.csv')
df1.head()

Unnamed: 0,"Copyright Law. Subject to the following terms, Fedora Project grants to\nthe user (\""User\"") a license to this collective work pursuant to the GNU\nGeneral Public License version 2. By downloading, installing or using\nthe Software, User agrees to the terms of this agreement.\n\n1. THE SOFTWARE.",scan_code_copyrights,copyright,falsePositive,important
0,fossology-master/.dockerignore,-,copyright/agent_tests/Unit/test_copyright src/...,1,
1,fossology-master/.dockerignore,-,copyright/VERSION-copyright src/spdx2/agent_te...,1,
2,fossology-master/.dockerignore,-,copyright_list src/cli/fo_folder src/cli/fo_no...,1,
3,fossology-master/.dockerignore,-,copyright/VERSION-keyword,1,
4,fossology-master/.dockerignore,-,copyright/VERSION-ecc,1,


In [3]:
X = df1["copyright"]
y = df1["falsePositive"]

In [4]:
y.value_counts()

0    22929
1    20814
Name: falsePositive, dtype: int64

In [5]:
X = X.drop_duplicates()
y = y[X.index]

In [6]:
y.value_counts()

0    14281
1     5186
Name: falsePositive, dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

### Start by using SVMS

In [9]:
svm = LinearSVC()
svm.fit(X_train_vec, y_train)

In [10]:
y_pred = svm.predict(X_test_vec)

In [11]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2870
           1       0.97      0.98      0.97      1024

    accuracy                           0.99      3894
   macro avg       0.98      0.98      0.98      3894
weighted avg       0.99      0.99      0.99      3894



In [12]:
misclassified = X_test.loc[y_test != y_pred]
len(misclassified)

55

In [13]:
print(misclassified)

2939         Copyright (c) 199\tAdobe Multiple Master font
36520    copyright-software-19980720">previous version<...
16332    (C) CHARGE ANY FEE IN CONNECTION WITH THE SOFT...
35107    copyrighted software distributed under the ter...
16426    © d'usage, de copie, de diffusion, de transfor...
16355                   Copyright Act of 1998 ("DMCA") is:
9137     Copyright (C) &lt;year&gt; &lt;name of author&...
7590                                  (c) Representations.
34423    Copyright (C) Universite de Strasbourg Distrib...
27436    COPYRIGHTS. Title to all copies of the\nFirmwa...
28295    copyright",1); scannerTest(sc, testContent, "e...
33405                     Copyright Notre Dame, some don't
35328    Copyright: X License Group: System Environment...
16684    copyrights held by third parties, and the soft...
32273    copyright (copyright_pk, agent_fk, pfile_fk, c...
27416    Copyright 1988,1990,1993 by Paul Vixie\n\nAll ...
27504    (c) the European Community 2007\n\nThis Europe.

### test the svm on a new dataframe kubernetes-master

In [14]:
df2 = pd.read_csv('cleared_datasets/kubernetes-master.csv')
df2.head()

Unnamed: 0,file_path,copyright,falsePositive
0,kubernetes-master/LICENSE,copyright owner or by an individual or Legal E...,1
1,kubernetes-master/staging/src/k8s.io/apiextens...,copyright owner or by an individual or Legal E...,1
2,kubernetes-master/staging/src/k8s.io/apimachin...,copyright owner or by an individual or Legal E...,1
3,kubernetes-master/staging/src/k8s.io/kube-sche...,copyright owner or by an individual or Legal E...,1
4,kubernetes-master/staging/src/k8s.io/cluster-b...,copyright owner or by an individual or Legal E...,1


In [15]:
X_test_1 = df2['copyright']
y_test_1 = df2['falsePositive']

In [16]:
y_test_1.value_counts()

0    14865
1     1632
Name: falsePositive, dtype: int64

In [17]:
X_test_1 = X_test_1.drop_duplicates()
y_test_1 = y_test_1[X_test_1.index]

In [18]:
y_test_1.value_counts()

0    445
1    132
Name: falsePositive, dtype: int64

In [19]:
X_test_1_vec = vectorizer.transform(X_test_1)

In [20]:
y_pred_1 = svm.predict(X_test_1_vec)

In [21]:
report = classification_report(y_test_1, y_pred_1)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.90      0.94       445
           1       0.73      0.92      0.82       132

    accuracy                           0.91       577
   macro avg       0.86      0.91      0.88       577
weighted avg       0.92      0.91      0.91       577



In [22]:
misclassified_1 = X_test_1.loc[y_test_1 != y_pred_1]
len(misclassified_1)

54

In [23]:
print(misclassified_1)

12056    Copyright YEAR The Kubernetes sample-apiserver...
12557    copyright notice, this list of conditions and ...
12823               Copyright YEAR The Kubernetes Authors.
14284    Copyright YEAR The Kubernetes sample-controlle...
14314                               �� especificada ou \n"
14315    �� suportada, converte para a ��ltima vers�� d...
14316    �� no formato YAML. Pode ser utilizadoa op����...
14319    �� interrompido se os pods n��o s��o gerenciad...
14324                                      �� opcional.\n"
14325                                        �� mut��vel."
14327                            �� verdadeira " quando\n"
14328                         �� enviado para o servidor."
14329    �� configurada v��rias vezes, apenas o ��ltimo...
14330      �� especificado, �� " ignorado caso contr��rio"
14331                                       �� verdadeiro"
14332    �� verdadeiro, esta tamb��m �� a " porta utili...
14333                                         �� criado,

### test the svm on a new dataframe tensorflow-master

In [24]:
df3 = pd.read_csv('cleared_datasets/tensorflow-master.csv')
df3.head()

Unnamed: 0,file_path,copyright,falsePositive
0,tensorflow-master/third_party/py/ml_dtypes/LIC...,copyright owner or by an individual or Legal E...,1
1,tensorflow-master/tensorflow/tsl/mkl/LICENSE,copyright owner or by an individual or Legal E...,1
2,tensorflow-master/tensorflow/tsl/mkl/MKL_LICENSE,copyright owner or by an individual or Legal E...,1
3,tensorflow-master/LICENSE,copyright owner or by an individual or Legal E...,1
4,tensorflow-master/third_party/gpus/cuda/LICENSE,copyright owner or by an individual or Legal E...,1


In [25]:
X_test_2 = df3['copyright']
y_test_2 = df3['falsePositive']

In [26]:
y_test_2.value_counts()

0    25786
1      156
Name: falsePositive, dtype: int64

In [27]:
X_test_2 = X_test_2.drop_duplicates()
y_test_2 = y_test_2[X_test_2.index]

In [28]:
y_test_2.value_counts()

0    134
1    115
Name: falsePositive, dtype: int64

In [29]:
X_test_2_vec = vectorizer.transform(X_test_2)
y_pred_2 = svm.predict(X_test_2_vec)
report = classification_report(y_test_2, y_pred_2)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.89      0.90       134
           1       0.87      0.90      0.89       115

    accuracy                           0.90       249
   macro avg       0.89      0.90      0.90       249
weighted avg       0.90      0.90      0.90       249



In [30]:
### Try to train on all the data
X = pd.concat([X_train, X_test, X_test_1, X_test_2])
y = pd.concat([y_train, y_test, y_test_1, y_test_2])

X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_all_train_vec = vectorizer.fit_transform(X_all_train)
X_all_test_vec = vectorizer.transform(X_all_test)

svm_all = LinearSVC()
svm_all.fit(X_all_train_vec, y_all_train)

y_all_pred = svm_all.predict(X_all_test_vec)

report = classification_report(y_all_test, y_all_pred)
print(report)

misclassified_all = X_all_test.loc[y_all_test != y_all_pred]

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2975
           1       0.96      0.98      0.97      1084

    accuracy                           0.98      4059
   macro avg       0.97      0.98      0.98      4059
weighted avg       0.98      0.98      0.98      4059



In [31]:
len(misclassified_all)

75

In [32]:
for misclassification in misclassified_all:
    print(misclassification)

copyrights, patents, trade secrets or any other intellectual property of Licensor except as expressly stated herein. No patent license is granted to make, use, sell or offer to sell embodiments of any patent claims other than the Licensed Claims defined in Section 2. No right is granted to the trade
Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and
(C) THE FRAMEWORX COMPANY 2003 http://opensource.org/licenses/frameworx.php', NULL, NULL, NULL, 'Frameworx Open License 1.0', NULL, NULL, NULL, '', NULL, false, false, false, '1e8bdb53db662b8ba76df594951ff287', 1, NULL); INSERT INTO license_ref VALUES (227, 'gSOAP-1.3b', 'gSOAP Public License Ve
(c) 1T3XT BVBA"/> meta name="CreationDate" content=""/> head> body> pre> Example PDF file that contains a form with a signature field.
copyrighted work whose copyright is held by Artifex Software Inc., located in San Rafael California and artofcode LLC, located in Benicia, California (the "Licensor"). Please note that 

### Testing out Random Forest

In [33]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

df1 = pd.read_csv('cleared_datasets/fossology-master.csv')
df1.head()
X = df1["copyright"]
y = df1["falsePositive"]
X = X.drop_duplicates()
y = y[X.index]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
rf = RandomForestClassifier()
rf.fit(X_train_vec, y_train)
y_pred = rf.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2870
           1       0.95      0.97      0.96      1024

    accuracy                           0.98      3894
   macro avg       0.97      0.98      0.97      3894
weighted avg       0.98      0.98      0.98      3894



In [34]:
df2 = pd.read_csv('cleared_datasets/kubernetes-master.csv')
X_test_1 = df2['copyright']
y_test_1 = df2['falsePositive']
X_test_1.drop_duplicates()
y_test_1 = y_test_1[X_test_1.index]
X_test_1_vec = vectorizer.transform(X_test_1)
y_pred_1 = rf.predict(X_test_1_vec)
report = classification_report(y_test_1, y_pred_1)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     14865
           1       0.86      0.99      0.92      1632

    accuracy                           0.98     16497
   macro avg       0.93      0.99      0.96     16497
weighted avg       0.99      0.98      0.98     16497



In [35]:
df3 = pd.read_csv('cleared_datasets/tensorflow-master.csv')
X_test_2 = df3['copyright']
y_test_2 = df3['falsePositive']
X_test_2.drop_duplicates()
y_test_2 = y_test_2[X_test_2.index]
X_test_2_vec = vectorizer.transform(X_test_2)
y_pred_2 = rf.predict(X_test_2_vec)
report = classification_report(y_test_2, y_pred_2)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25786
           1       0.77      1.00      0.87       156

    accuracy                           1.00     25942
   macro avg       0.89      1.00      0.94     25942
weighted avg       1.00      1.00      1.00     25942



In [36]:
rf6 = RandomForestClassifier(100)
rf6.fit(X_train_vec, y_train)
y_pred = rf6.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)
y_pred_1 = rf6.predict(X_test_1_vec)
report = classification_report(y_test_1, y_pred_1)
print(report)
y_pred_2 = rf6.predict(X_test_2_vec)
report = classification_report(y_test_2, y_pred_2)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2870
           1       0.95      0.97      0.96      1024

    accuracy                           0.98      3894
   macro avg       0.97      0.98      0.97      3894
weighted avg       0.98      0.98      0.98      3894

              precision    recall  f1-score   support

           0       1.00      0.79      0.88     14865
           1       0.35      1.00      0.51      1632

    accuracy                           0.81     16497
   macro avg       0.67      0.89      0.70     16497
weighted avg       0.93      0.81      0.85     16497

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25786
           1       0.60      1.00      0.75       156

    accuracy                           1.00     25942
   macro avg       0.80      1.00      0.88     25942
weighted avg       1.00      1.00      1.00     25942



In [37]:
misclassified_all = X_test.loc[y_test != y_pred]

In [38]:
for m in misclassified_all:
    print(m)

Copyright (C) 2017 Bundesamt für Sicherheit in der Informationstechnik
(C) CHARGE ANY FEE IN CONNECTION WITH THE SOFTWARE. SUBJECT TO THESE LIMITATIONS, YOU MAY MAKE COPIES AND DERIVATIVE WORKS OF THE SOFTWARE AND DISTRIBUTE SUCH COPIES TO OTHER PERSONS PROVIDED THAT SUCH COPIES AND RELATED DISTRIBUTION ARE ACCOMPANIED BY HP'S COPYRIGHT NOTICE AND THIS AGREEMENT AND A
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>\n\nEveryone is permitted to copy and distribute verbatim or modified copies of this license document, and changing it is allowed as long as the name is changed.\n\nDO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE\nTERMS AND CONDITIONS FOR COPYING, DISTRIBU
copyrighted software distributed under the terms of the GNU General Public License (hereinafter the "GPL").
copyright;ojo" \ S. -B./build -G Ninja \
Copyright (C) Universite de Strasbourg Distributed under the terms of the CeCILL-B license, as published by the CEA-CNRS-INRIA. Refer to the LICENSE file or to http://www.cecill.i

In [39]:
rf2 = RandomForestClassifier(200)
rf2.fit(X_train_vec, y_train)
y_pred = rf2.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)
y_pred_1 = rf2.predict(X_test_1_vec)
report = classification_report(y_test_1, y_pred_1)
print(report)
y_pred_2 = rf2.predict(X_test_2_vec)
report = classification_report(y_test_2, y_pred_2)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2870
           1       0.95      0.97      0.96      1024

    accuracy                           0.98      3894
   macro avg       0.97      0.98      0.97      3894
weighted avg       0.98      0.98      0.98      3894

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     14865
           1       0.88      1.00      0.93      1632

    accuracy                           0.99     16497
   macro avg       0.94      0.99      0.96     16497
weighted avg       0.99      0.99      0.99     16497

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25786
           1       0.81      1.00      0.90       156

    accuracy                           1.00     25942
   macro avg       0.91      1.00      0.95     25942
weighted avg       1.00      1.00      1.00     25942



In [40]:
rf3 = RandomForestClassifier(300)
rf3.fit(X_train_vec, y_train)
y_pred = rf3.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)
y_pred_1 = rf3.predict(X_test_1_vec)
report = classification_report(y_test_1, y_pred_1)
print(report)
y_pred_2 = rf3.predict(X_test_2_vec)
report = classification_report(y_test_2, y_pred_2)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2870
           1       0.95      0.97      0.96      1024

    accuracy                           0.98      3894
   macro avg       0.97      0.98      0.97      3894
weighted avg       0.98      0.98      0.98      3894

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     14865
           1       0.88      0.99      0.93      1632

    accuracy                           0.99     16497
   macro avg       0.94      0.99      0.96     16497
weighted avg       0.99      0.99      0.99     16497

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25786
           1       0.78      1.00      0.87       156

    accuracy                           1.00     25942
   macro avg       0.89      1.00      0.94     25942
weighted avg       1.00      1.00      1.00     25942



In [41]:
rf4 = RandomForestClassifier(500)
rf4.fit(X_train_vec, y_train)
y_pred = rf4.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)
y_pred_1 = rf4.predict(X_test_1_vec)
report = classification_report(y_test_1, y_pred_1)
print(report)
y_pred_2 = rf4.predict(X_test_2_vec)
report = classification_report(y_test_2, y_pred_2)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2870
           1       0.95      0.97      0.96      1024

    accuracy                           0.98      3894
   macro avg       0.97      0.98      0.97      3894
weighted avg       0.98      0.98      0.98      3894

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     14865
           1       0.87      0.99      0.93      1632

    accuracy                           0.98     16497
   macro avg       0.93      0.99      0.96     16497
weighted avg       0.99      0.98      0.99     16497

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25786
           1       0.77      1.00      0.87       156

    accuracy                           1.00     25942
   macro avg       0.89      1.00      0.94     25942
weighted avg       1.00      1.00      1.00     25942



In [42]:
rf5 = RandomForestClassifier(1000)
rf5.fit(X_train_vec, y_train)
y_pred = rf5.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)
y_pred_1 = rf5.predict(X_test_1_vec)
report = classification_report(y_test_1, y_pred_1)
print(report)
y_pred_2 = rf5.predict(X_test_2_vec)
report = classification_report(y_test_2, y_pred_2)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2870
           1       0.95      0.97      0.96      1024

    accuracy                           0.98      3894
   macro avg       0.97      0.98      0.97      3894
weighted avg       0.98      0.98      0.98      3894

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     14865
           1       0.88      0.99      0.93      1632

    accuracy                           0.99     16497
   macro avg       0.94      0.99      0.96     16497
weighted avg       0.99      0.99      0.99     16497

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25786
           1       0.71      1.00      0.83       156

    accuracy                           1.00     25942
   macro avg       0.86      1.00      0.92     25942
weighted avg       1.00      1.00      1.00     25942



### Test out Naive Bayes

In [44]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_all_train_vec, y_all_train)

y_all_pred = nb.predict(X_all_test_vec)

report = classification_report(y_all_test, y_all_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2975
           1       0.92      0.93      0.92      1084

    accuracy                           0.96      4059
   macro avg       0.95      0.95      0.95      4059
weighted avg       0.96      0.96      0.96      4059



In [70]:
import numpy as np

# make the naive bayes classifier even more tight
y_all_pred = nb.predict_proba(X_all_test_vec)
y_all_pred_final = []
for i in range(len(y_all_pred)):
    if max(y_all_pred[i]) < 0.8:
        y_all_pred_final.append(2)
    else:
        y_all_pred_final.append(0 if y_all_pred[i][0] > y_all_pred[i][1] else 1)

# get the counts of the different classes
def get_counts(y_pred):
    counts = {}
    for i in y_pred:
        if i not in counts:
            counts[i] = 1
        else:
            counts[i] += 1
    return counts

get_counts(y_all_pred_final)

{0: 2842, 1: 922, 2: 295}

In [86]:
indices = np.array(y_all_pred_final)
indices = np.where(indices == 2)[0]
np.array(X_all_test)[indices]

array(['copyright notice;',
       'copyright|\\<c\\>[^+:]|&copy) =YEAR= =SOME= catharon productions"',
       '�� SA����Le��T��/��T���� SA����L\x03������i����@����L\x03��\x062',
       'copyright|\\<c\\>[^+:]|&copy) zeus technology limited =YEAR="',
       'copyright|\\<c\\>[^+:]|&copy) =YEAR= =SOME= sleepycat software"',
       "(c) to Red Hat''s knowledge, Red Hat branded Software does not, at the time of delivery to Client, include malicious or hidden mechanisms or code for the purpose of damaging or corrupting the Software.",
       'Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and',
       'Copyright License',
       "�� J��T����R7��T\x06o��������Jvow\x1er\ne��m��HA\x04D��������I��t��/����\x0c����Tt|��p\x07��������d\x05V��#��\x7f\x07������p����GM��4B����\x03��yD���� ����i��v\x19ST����Me����������\x13��S��q\x1cO��Gm[qN��mZ����Q+������eQ+|\x12.����5��\x19��8[\x160������Z��L��w��%��\x08��Ll<\x1aF\x1d��\x19Rbf��\x1br��b&9������Z[V��', ����>6��b0F��[f�

In [71]:
print(classification_report(y_all_test, y_all_pred_final))

              precision    recall  f1-score   support

           0       0.99      0.94      0.97      2975
           1       0.95      0.81      0.87      1084
           2       0.00      0.00      0.00         0

    accuracy                           0.91      4059
   macro avg       0.65      0.58      0.61      4059
weighted avg       0.98      0.91      0.94      4059



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# try only fossology and test on kubernetes
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

y_pred = nb.predict(X_test_vec)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      2870
           1       0.92      0.95      0.93      1024

    accuracy                           0.96      3894
   macro avg       0.95      0.96      0.95      3894
weighted avg       0.97      0.96      0.96      3894



### USE RNNs

In [None]:
# Load GloVe embeddings
import numpy as np
def load_glove(file):
    """Load GloVe embeddings from a text file.
    Args:
        file (str): path to the glove file.
    Returns:
        dict: a dictionary mapping words to their vector representations.
    """
    embeddings = {}
    with open(file) as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove = load_glove('glove.6B.300d.txt')


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, TabularDataset, BucketIterator
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 64
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 2
N_EPOCHS = 10
LR = 0.001

TEXT = Field(tokenize="revtok", lower=True, include_lengths=True)
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.float)

fields = [("copyright", TEXT), ("falsePositive", LABEL)]

df1 = pd.read_csv("cleared_datasets/fossology-master.csv")
df1.to_csv("cleared_datasets/fossology-master.csv", index=False) 
dataset = TabularDataset(path="cleared_datasets/fossology-master.csv", format="csv", fields=fields, skip_header=True)

train_dataset, test_dataset = dataset.split(split_ratio=0.8)

TEXT.build_vocab(train_dataset, max_size=10000, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)

train_iterator, test_iterator = BucketIterator.splits((train_dataset, test_dataset), batch_size=BATCH_SIZE, sort_within_batch=True, device=device)

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim) 
        self.rnn = nn.RNN(embedding_dim, hidden_dim) 
        self.fc = nn.Linear(hidden_dim, output_dim) 

    def forward(self, text, text_lengths):
        embedded = self.embedding(text) 
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths) 
        packed_output, hidden = self.rnn(packed_embedded) 
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output) 
        hidden = hidden.squeeze(0) 
        logits = self.fc(hidden) 
        return logits

input_dim = len(TEXT.vocab) 
model = RNN(input_dim, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device) 

pretrained_embeddings = TEXT.vocab.vectors 
model.embedding.weight.data.copy_(pretrained_embeddings) 

criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model.parameters(), lr=LR) 

def get_accuracy(y_pred, y_true):
    y_pred = torch.argmax(y_pred, dim=1) 
    correct = (y_pred == y_true).float() 
    acc = correct.sum() / len(correct) 
    return acc

def train(model, iterator, criterion, optimizer):
    epoch_loss = 0 
    epoch_acc = 0 
    model.train() 
    for batch in iterator: 
        text, text_lengths = batch.copyright 
        label = batch.falsePositive 
        optimizer.zero_grad() 
        logits = model(text, text_lengths) 
        loss = criterion(logits, label.long()) 
        acc = get_accuracy(logits, label) 
        loss.backward() 
        optimizer.step() 
        epoch_loss += loss.item() 
        epoch_acc += acc.item() 
    return epoch_loss / len(iterator), epoch_acc / len(iterator) 

def evaluate(model, iterator, criterion):
    epoch_loss = 0 
    epoch_acc = 0 
    model.eval() 
    with torch.no_grad(): 
        for batch in iterator: 
            text, text_lengths = batch.copyright 
            label = batch.falsePositive 
            logits = model(text, text_lengths) 
            loss = criterion(logits, label.long()) 
            acc = get_accuracy(logits, label) 
            epoch_loss += loss.item() 
            epoch_acc += acc.item() 
    return epoch_loss / len(iterator), epoch_acc / len(iterator) 

for epoch in tqdm(range(N_EPOCHS)):
    train_loss, train_acc = train(model, train_iterator, criterion, optimizer) 
    test_loss, test_acc = evaluate(model, test_iterator, criterion) 
    print(f"Epoch {epoch+1}: Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%") 

y_true = [] 
y_pred = [] 
model.eval() 
with torch.no_grad(): 
    for batch in test_iterator: 
        text, text_lengths = batch.copyright 
        label = batch.falsePositive 
        logits = model(text, text_lengths) 
        pred = torch.argmax(logits, dim=1) 
        y_true.extend(label.tolist()) 
        y_pred.extend(pred.tolist()) 

report = classification_report(y_true, y_pred)
print(report)

### Test out different vectorizer (Glove, FastText & Word2Vec) 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from gensim.models import fasttext

fasttext.util.download_model('en', if_exists='ignore')

# Load the data
df1 = pd.read_csv('cleared_datasets/fossology-master.csv')
df1.head()

# Get the inputs and labels
X = df1["copyright"]
y = df1["falsePositive"]

# Drop duplicates
X = X.drop_duplicates()
y = y[X.index]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the FastText model
model = fasttext.load_facebook_vectors('cc.en.300.bin')

# Define a function to create document vectors
def create_doc_vecs(texts, model):
    # Initialize an empty list to store the vectors
    vectors = []
    # Loop through the texts
    for text in texts:
        # Split the text into words
        words = text.split()
        # Initialize an empty array to store the word vectors
        word_vecs = np.zeros((len(words), model.vector_size))
        # Loop through the words
        for i, word in enumerate(words):
            # Get the word vector from the model
            word_vec = model[word]
            # Store it in the array
            word_vecs[i] = word_vec
        # Average the word vectors to get the document vector
        doc_vec = np.mean(word_vecs, axis=0)
        # Append it to the list
        vectors.append(doc_vec)
    # Convert the list to a numpy array
    vectors = np.array(vectors)
    # Return the array of document vectors
    return vectors

# Create document vectors for train and test sets
X_train_vec = create_doc_vecs(X_train, model)
X_test_vec = create_doc_vecs(X_test, model)

# Train a linear SVM classifier
svm = LinearSVC()
svm.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = svm.predict(X_test_vec)

# Print the classification report
report = classification_report(y_test, y_pred)
print(report)

### Testing out DistilBert

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from tqdm import tqdm

In [2]:
df1 = pd.read_csv('cleared_datasets/fossology-master.csv')
df1.head()
X = df1["copyright"]
y = df1["falsePositive"]
X = X.drop_duplicates()
y = y[X.index]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [4]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [7]:
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [8]:
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)

In [9]:
train_inputs = torch.tensor(train_encodings['input_ids'])
train_attention_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(y_train.tolist())
test_inputs = torch.tensor(test_encodings['input_ids'])
test_attention_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(y_test.tolist())

In [10]:
batch_size = 32
train_data = torch.utils.data.TensorDataset(train_inputs, train_attention_masks, train_labels)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_data = torch.utils.data.TensorDataset(test_inputs, test_attention_masks, test_labels)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [12]:
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids, attention_mask, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs[:2]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

100%|██████████| 487/487 [10:03<00:00,  1.24s/it]


Epoch 1, Loss: 0.0854413020070757


  4%|▍         | 19/487 [00:28<11:44,  1.51s/it]


KeyboardInterrupt: 

In [None]:
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs[0]
        preds = torch.argmax(logits, dim=1)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

In [None]:
report = classification_report(y_true, y_pred)
print(report)