In [1]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np
import string
import scipy

In [2]:
data_0 = pd.read_csv('../datasets/fossology-master.csv')
X_0 = data_0["copyright"]
y_0 = data_0["falsePositive"]
X_0 = X_0.drop_duplicates()
y_0 = y_0[X_0.index]

data_1 = pd.read_csv('../datasets/kubernetes-master.csv')
X_1 = data_1["copyright"]
y_1 = data_1["falsePositive"]
X_1 = X_1.drop_duplicates()
y_1 = y_1[X_1.index]

data_2 = pd.read_csv('../datasets/tensorflow-master.csv')
X_2 = data_2["copyright"]
y_2 = data_2["falsePositive"]
X_2 = X_2.drop_duplicates()
y_2 = y_2[X_2.index]

data_3 = pd.read_csv('../datasets/Fossology-Provided-Dataset-1.csv')

X_3 = data_3['scanner_content']
y_3 = data_3['falsePositive']
X_3 = X_3.drop_duplicates()
y_3 = y_3[X_3.index]

X = pd.concat([X_0, X_1, X_2, X_3])
y = pd.concat([y_0, y_1, y_2, y_3])

print('Class 0 Percentage: ', len(y[y == 0]) / len(y))
print('Class 1 Percentage: ', len(y[y == 1]) / len(y))

X_train, X_test, y_train, y_test = train_test_split(X_0, y_0, test_size=0.2, random_state=42)

Class 0 Percentage:  0.7385852090032154
Class 1 Percentage:  0.26141479099678455


In [3]:
def aggregate_reports(reports, print_aggregates=True):
    import pandas as pd
    import numpy as np
    dfs = []
    for metric in ['precision', 'recall', 'f1-score']:
        scores = []
        for report in reports:
            scores.append([report['0'][metric], report['1'][metric]])
        scores = np.array(scores)
        scores = scores[:, :2]
        mean_scores = np.mean(scores, axis=0)
        mean_scores = [f"{score:.6f}" for score in mean_scores]
        df = pd.DataFrame(scores, columns=['0', '1'])
        df.loc['Mean'] = mean_scores
        df['Metric'] = metric
        dfs.append(df)
    if print_aggregates:
        print("## Precision")
        print(dfs[0].to_markdown())
        print("## Recall")
        print(dfs[1].to_markdown())
        print("## F1-score")
        print(dfs[2].to_markdown())
    else:
        return dfs[0], dfs[1], dfs[2]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_1_tfidf = vectorizer.transform(X_1)
X_2_tfidf = vectorizer.transform(X_2)
X_3_tfidf = vectorizer.transform(X_3)
X_tfidf = vectorizer.transform(X)

In [5]:
input_size = 100 # Set desired input size
hidden_size = 50
num_layers = 2
batch_size = 32 # Set batch size

embedding_size = 14641 # Set embedding size to match number of features in X_train_tfidf

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, embedding_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Linear(embedding_size, input_size)
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        
    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        return output, hidden

rnn = RNNModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, embedding_size=embedding_size)

device = 'cpu' #torch.device('cuda' if torch.cuda.is_available() else 'cpu')

rnn = rnn.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.01)

# Convert sparse matrix to dense and then to PyTorch tensor
X_train_tfidf_tensor = torch.from_numpy(X_train_tfidf.toarray())
y_train_tensor = torch.from_numpy(y_train.to_numpy())

# Move data to device
X_train_tfidf_tensor = X_train_tfidf_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)

# Convert input data to torch.float32
X_train_tfidf_tensor = X_train_tfidf_tensor.float()

# Convert target data to torch.long
y_train_tensor = y_train_tensor.long()

# Create data loader for batching
train_data = TensorDataset(X_train_tfidf_tensor, y_train_tensor)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

epochs = 3
for epoch in tqdm(range(epochs)):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        output, hidden = rnn(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

 33%|███▎      | 1/3 [00:06<00:13,  6.74s/it]

Epoch 1, Loss: 2.139624834060669


 67%|██████▋   | 2/3 [00:13<00:06,  6.65s/it]

Epoch 2, Loss: 2.1396169662475586


100%|██████████| 3/3 [00:19<00:00,  6.61s/it]

Epoch 3, Loss: 2.1396145820617676





In [None]:
reports = []
datasets = [(X_test_tfidf, y_test), (X_1_tfidf, y_1), (X_2_tfidf, y_2), (X_3_tfidf, y_3), (X_tfidf, y)]

with torch.no_grad():
    for dataset in datasets:
        X_temp_tfidf_tensor = torch.from_numpy(dataset[0].toarray())
        y_temp_tensor =  torch.from_numpy(dataset[1].to_numpy())
        X_temp_tfidf_tensor = X_temp_tfidf_tensor.to(device)
        X_temp_tfidf_tensor = X_temp_tfidf_tensor.float()
        y_temp_tensor = y_temp_tensor.to(device)
        y_temp_tensor = y_temp_tensor.long()
        temp_data = TensorDataset(X_temp_tfidf_tensor, y_temp_tensor)
        temp_loader = DataLoader(temp_data, shuffle=True, batch_size=batch_size)
        y_pred = []
        for inputs, labels in tqdm(temp_loader):
            temp, _ = rnn(inputs)
            y_pred.append(temp)
        y_pred = torch.cat(y_pred, dim=0)
        y_pred = torch.argmax(y_pred, dim= 1)
        y_pred = y_pred.to('cpu')
        reports.append(classification_report(dataset[1], y_pred))
        #del X_temp_tfidf_tensor
        #del y_temp_tensor
        #del temp_data
        #del temp_loader
        #del y_pred
        #torch.cuda.empty_cache()

print(aggregate_reports(reports))
print('Number of missclassifications in class 0: ', reports[4]['0']['support'] - round(reports[4]['0']['recall'] * reports[4]['0']['support']), 'out of a total sample of: ', reports[4]['0']['support'], ' - about ', round((1 - reports[4]['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', reports[4]['1']['support'] - round(reports[4]['1']['recall'] * reports[4]['1']['support']), 'out of a total sample of: ', reports[4]['1']['support'], ' - about ', round((1 - reports[4]['1']['recall']) * 100, 2), '% of the class was missclassified')

In [48]:
print(reports[0])

              precision    recall  f1-score   support

           0       0.74      1.00      0.85      2870
           1       0.00      0.00      0.00      1024

    accuracy                           0.74      3894
   macro avg       0.37      0.50      0.42      3894
weighted avg       0.54      0.74      0.63      3894



In [5]:
from gensim.models.fasttext import FastText

model = FastText(vector_size=500, window=5, min_count=10, workers=6)

model.build_vocab(X_train)

model.train(X_train, total_examples=len(X_train), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_train]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_test]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_1]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_2]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_3]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in X]

In [18]:
input_size = 500 # Set desired input size
hidden_size = 50
num_layers = 2
batch_size = 32 # Set batch size

embedding_size = 500 # Set embedding size to match number of features in X_train_ft

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, embedding_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Linear(embedding_size, input_size)
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        
    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        return output, hidden

rnn = RNNModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, embedding_size=embedding_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

rnn = rnn.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=1e-5)

# Convert sparse matrix to dense and then to PyTorch tensor
X_train_ft_tensor = torch.from_numpy(np.array(X_train_ft))
y_train_tensor = torch.from_numpy(y_train.to_numpy())

# Move data to device
X_train_ft_tensor = X_train_ft_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)

# Convert input data to torch.float32
X_train_ft_tensor = X_train_ft_tensor.float()

# Convert target data to torch.long
y_train_tensor = y_train_tensor.long()

# Create data loader for batching
train_data = TensorDataset(X_train_ft_tensor, y_train_tensor)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

epochs = 20
for epoch in tqdm(range(epochs)):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        output, hidden = rnn(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

  0%|          | 0/20 [00:00<?, ?it/s]

  5%|▌         | 1/20 [00:00<00:17,  1.10it/s]

Epoch 1, Loss: 3.147402048110962


 10%|█         | 2/20 [00:01<00:15,  1.15it/s]

Epoch 2, Loss: 2.827108383178711


 15%|█▌        | 3/20 [00:02<00:14,  1.19it/s]

Epoch 3, Loss: 2.5307514667510986


 20%|██        | 4/20 [00:03<00:13,  1.20it/s]

Epoch 4, Loss: 2.3667521476745605


 25%|██▌       | 5/20 [00:04<00:12,  1.21it/s]

Epoch 5, Loss: 2.2474281787872314


 30%|███       | 6/20 [00:04<00:11,  1.22it/s]

Epoch 6, Loss: 2.2089412212371826


 35%|███▌      | 7/20 [00:05<00:10,  1.23it/s]

Epoch 7, Loss: 2.1882436275482178


 40%|████      | 8/20 [00:06<00:09,  1.23it/s]

Epoch 8, Loss: 2.176600933074951


 45%|████▌     | 9/20 [00:07<00:08,  1.24it/s]

Epoch 9, Loss: 2.165022134780884


 50%|█████     | 10/20 [00:08<00:08,  1.25it/s]

Epoch 10, Loss: 2.1600875854492188


 55%|█████▌    | 11/20 [00:08<00:07,  1.25it/s]

Epoch 11, Loss: 2.1576340198516846


 60%|██████    | 12/20 [00:09<00:06,  1.25it/s]

Epoch 12, Loss: 2.1513583660125732


 65%|██████▌   | 13/20 [00:10<00:05,  1.27it/s]

Epoch 13, Loss: 2.148862838745117


 70%|███████   | 14/20 [00:11<00:04,  1.25it/s]

Epoch 14, Loss: 2.14841890335083


 75%|███████▌  | 15/20 [00:12<00:04,  1.24it/s]

Epoch 15, Loss: 2.1457128524780273


 80%|████████  | 16/20 [00:13<00:03,  1.24it/s]

Epoch 16, Loss: 2.1488726139068604


 85%|████████▌ | 17/20 [00:13<00:02,  1.26it/s]

Epoch 17, Loss: 2.1432182788848877


 90%|█████████ | 18/20 [00:14<00:01,  1.25it/s]

Epoch 18, Loss: 2.1429619789123535


 95%|█████████▌| 19/20 [00:15<00:00,  1.25it/s]

Epoch 19, Loss: 2.1421453952789307


100%|██████████| 20/20 [00:16<00:00,  1.24it/s]

Epoch 20, Loss: 2.1418404579162598





In [20]:
reports = []
datasets = [(X_test_ft, y_test), (X_1_ft, y_1), (X_2_ft, y_2), (X_3_ft, y_3), (X_ft, y)]

with torch.no_grad():
    for dataset in datasets:
        X_temp_tfidf_tensor = torch.from_numpy(np.array(dataset[0]))
        y_temp_tensor =  torch.from_numpy(dataset[1].to_numpy())
        X_temp_tfidf_tensor = X_temp_tfidf_tensor.to(device)
        X_temp_tfidf_tensor = X_temp_tfidf_tensor.float()
        y_temp_tensor = y_temp_tensor.to(device)
        y_temp_tensor = y_temp_tensor.long()
        temp_data = TensorDataset(X_temp_tfidf_tensor, y_temp_tensor)
        temp_loader = DataLoader(temp_data, shuffle=True, batch_size=batch_size)
        y_pred = []
        for inputs, labels in tqdm(temp_loader):
            temp, _ = rnn(inputs)
            y_pred.append(temp)
        y_pred = torch.cat(y_pred, dim=0)
        y_pred = torch.argmax(y_pred, dim= 1)
        y_pred = y_pred.to('cpu')
        reports.append(classification_report(dataset[1], y_pred))
        #del X_temp_tfidf_tensor
        #del y_temp_tensor
        #del temp_data
        #del temp_loader
        #del y_pred
        #torch.cuda.empty_cache()

print(aggregate_reports(reports))
print('Number of missclassifications in class 0: ', reports[4]['0']['support'] - round(reports[4]['0']['recall'] * reports[4]['0']['support']), 'out of a total sample of: ', reports[4]['0']['support'], ' - about ', round((1 - reports[4]['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', reports[4]['1']['support'] - round(reports[4]['1']['recall'] * reports[4]['1']['support']), 'out of a total sample of: ', reports[4]['1']['support'], ' - about ', round((1 - reports[4]['1']['recall']) * 100, 2), '% of the class was missclassified')

  0%|          | 0/122 [00:00<?, ?it/s]

100%|██████████| 122/122 [00:00<00:00, 1746.23it/s]
100%|██████████| 19/19 [00:00<00:00, 1285.04it/s]
100%|██████████| 8/8 [00:00<00:00, 1324.74it/s]
100%|██████████| 47/47 [00:00<00:00, 1692.28it/s]
100%|██████████| 681/681 [00:00<00:00, 1850.82it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TypeError: string indices must be integers

In [21]:
print(reports[0])

              precision    recall  f1-score   support

           0       0.74      0.97      0.84      2870
           1       0.23      0.03      0.05      1024

    accuracy                           0.72      3894
   macro avg       0.48      0.50      0.44      3894
weighted avg       0.60      0.72      0.63      3894

