# Test out different embeddings 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
import scipy

In [2]:
df1 = pd.read_csv('../cleared_datasets/fossology-master.csv')
X = df1["copyright"]
y = df1["falsePositive"]
X = X.drop_duplicates()
y = y[X.index]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

test1 = pd.read_csv('../cleared_datasets/kubernetes-master.csv')
X_1 = test1["copyright"]
y_1 = test1["falsePositive"]
X_1 = X_1.drop_duplicates()
y_1 = y_1[X_1.index]

test2 = pd.read_csv('../cleared_datasets/tensorflow-master.csv')
X_2 = test2["copyright"]
y_2 = test2["falsePositive"]
X_2 = X_2.drop_duplicates()
y_2 = y_2[X_2.index]

foss_data = pd.read_csv('../Fossology-Provided-Dataset-1.csv')

X_foss = foss_data['scanner_content']
y_foss = foss_data['falsePositive']
X_foss = X_foss.drop_duplicates()
y_foss = y_foss[X_foss.index]

X_all = pd.concat([X_train, X_1, X_2, X_foss])
y_all = pd.concat([y_train, y_1, y_2, y_foss])

X_grid_search = pd.concat([X_train, X_all])
y_grid_search = pd.concat([y_train, y_all])

## Bag of Words 

In [3]:
vectorizer = CountVectorizer()

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

X_1_bow = vectorizer.transform(X_1)

X_2_bow = vectorizer.transform(X_2)

X_foss_bow = vectorizer.transform(X_foss)

X_all_bow = vectorizer.transform(X_all)

X_grid_search_bow = vectorizer.transform(X_grid_search)

### 1. SVM

In [4]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_foss_bow)
y_pred_4 = svm.predict(X_all_bow)
report = classification_report(y_test, y_pred)
report_1 = classification_report(y_1, y_pred_1)
report_2 = classification_report(y_2, y_pred_2)
report_3 = classification_report(y_foss, y_pred_3)
report_4 = classification_report(y_all, y_pred_4)
print('Test data split from the training dataset (which is from fossology-master)')
print(report)
print('Test data from kubernetes-master')
print(report_1)
print('Test data from tensorflow-master')
print(report_2)
print('Test data Provided by Fossology')
print(report_3)
print('Merging all the test data together')
print(report_4)

Test data split from the training dataset (which is from fossology-master)
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2870
           1       0.95      0.98      0.97      1024

    accuracy                           0.98      3894
   macro avg       0.97      0.98      0.98      3894
weighted avg       0.98      0.98      0.98      3894

Test data from kubernetes-master
              precision    recall  f1-score   support

           0       0.99      0.85      0.91       445
           1       0.65      0.96      0.78       132

    accuracy                           0.87       577
   macro avg       0.82      0.90      0.84       577
weighted avg       0.91      0.87      0.88       577

Test data from tensorflow-master
              precision    recall  f1-score   support

           0       1.00      0.72      0.83       134
           1       0.75      1.00      0.86       115

    accuracy                           0.

In [6]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': [0.01, 0.1, 1, 10]
}
grid_search = GridSearchCV(SVC(), param_grid, cv=5, verbose=2, n_jobs=-1)

grid_search.fit(X_grid_search_bow, y_grid_search)

# print the best parameter values and the best cross-validation score
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] END ...................C=0.1, gamma=0.01, kernel=linear; total time=  19.7s
[CV] END ...................C=0.1, gamma=0.01, kernel=linear; total time=  20.0s
[CV] END ...................C=0.1, gamma=0.01, kernel=linear; total time=  20.4s
[CV] END ...................C=0.1, gamma=0.01, kernel=linear; total time=  20.6s
[CV] END ...................C=0.1, gamma=0.01, kernel=linear; total time=  20.6s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 1.1min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 1.1min
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=  19.3s
[CV] END ....................C=0.1, gamma=0.1, 

In [7]:
# SVM best parameters {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
# Accuracy 0.9944387110355197

svm = SVC(C=100, gamma=0.1, kernel='rbf')
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_foss_bow)
y_pred_4 = svm.predict(X_all_bow)
report = classification_report(y_test, y_pred)
report_1 = classification_report(y_1, y_pred_1)
report_2 = classification_report(y_2, y_pred_2)
report_3 = classification_report(y_foss, y_pred_3)
report_4 = classification_report(y_all, y_pred_4)
print('Test data split from the training dataset (which is from fossology-master)')
print(report)
print('Test data from kubernetes-master')
print(report_1)
print('Test data from tensorflow-master')
print(report_2)
print('Test data Provided by Fossology')
print(report_3)
print('Merging all the test data together')
print(report_4)

Test data split from the training dataset (which is from fossology-master)
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2870
           1       0.92      0.99      0.95      1024

    accuracy                           0.97      3894
   macro avg       0.96      0.98      0.97      3894
weighted avg       0.98      0.97      0.98      3894

Test data from kubernetes-master
              precision    recall  f1-score   support

           0       0.99      0.87      0.92       445
           1       0.68      0.97      0.80       132

    accuracy                           0.89       577
   macro avg       0.84      0.92      0.86       577
weighted avg       0.92      0.89      0.90       577

Test data from tensorflow-master
              precision    recall  f1-score   support

           0       0.99      0.72      0.84       134
           1       0.75      0.99      0.86       115

    accuracy                           0.

### 2. Random Forests

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)

y_pred = rf.predict(X_test_bow)
y_pred_1 = rf.predict(X_1_bow)
y_pred_2 = rf.predict(X_2_bow)
y_pred_3 = rf.predict(X_foss_bow)
y_pred_4 = rf.predict(X_all_bow)

report = classification_report(y_test, y_pred)
report_1 = classification_report(y_1, y_pred_1)
report_2 = classification_report(y_2, y_pred_2)
report_3 = classification_report(y_foss, y_pred_3)
report_4 = classification_report(y_all, y_pred_4)

print('Test data split from the training dataset (which is from fossology-master)')
print(report)
print('Test data from kubernetes-master')
print(report_1)
print('Test data from tensorflow-master')
print(report_2)
print('Test data Provided by Fossology')
print(report_3)
print('Merging all the test data together')
print(report_4)

Test data split from the training dataset (which is from fossology-master)
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      2870
           1       0.94      0.98      0.96      1024

    accuracy                           0.98      3894
   macro avg       0.96      0.98      0.97      3894
weighted avg       0.98      0.98      0.98      3894

Test data from kubernetes-master
              precision    recall  f1-score   support

           0       0.99      0.84      0.91       445
           1       0.65      0.98      0.78       132

    accuracy                           0.88       577
   macro avg       0.82      0.91      0.85       577
weighted avg       0.92      0.88      0.88       577

Test data from tensorflow-master
              precision    recall  f1-score   support

           0       1.00      0.70      0.82       134
           1       0.74      1.00      0.85       115

    accuracy                           0.

In [11]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [10, 50, 100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, verbose=2, n_jobs=-1)

grid_search.fit(X_grid_search_bow, y_grid_search)

# print the best parameter values and the best cross-validation score
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=10; total time=   1.9s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=10; total time=   2.1s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=10; total time=   2.2s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=10; total time=   2.5s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=10; total time=   3.0s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=50; total time=   9.9s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=50; total time=  10.5s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=50; total time=  10.4s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=50; total time=  10.7s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=5

In [12]:
# {'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'n_estimators': 200}
# 0.9947377093910289

rf = RandomForestClassifier(criterion= 'entropy', max_depth= None, max_features= 'log2', n_estimators= 200)

rf.fit(X_train_bow, y_train)

y_pred = rf.predict(X_test_bow)
y_pred_1 = rf.predict(X_1_bow)
y_pred_2 = rf.predict(X_2_bow)
y_pred_3 = rf.predict(X_foss_bow)
y_pred_4 = rf.predict(X_all_bow)

report = classification_report(y_test, y_pred)
report_1 = classification_report(y_1, y_pred_1)
report_2 = classification_report(y_2, y_pred_2)
report_3 = classification_report(y_foss, y_pred_3)
report_4 = classification_report(y_all, y_pred_4)

print('Test data split from the training dataset (which is from fossology-master)')
print(report)
print('Test data from kubernetes-master')
print(report_1)
print('Test data from tensorflow-master')
print(report_2)
print('Test data Provided by Fossology')
print(report_3)
print('Merging all the test data together')
print(report_4)

Test data split from the training dataset (which is from fossology-master)
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2870
           1       0.95      0.98      0.97      1024

    accuracy                           0.98      3894
   macro avg       0.97      0.98      0.98      3894
weighted avg       0.98      0.98      0.98      3894

Test data from kubernetes-master
              precision    recall  f1-score   support

           0       1.00      0.86      0.93       445
           1       0.68      0.99      0.81       132

    accuracy                           0.89       577
   macro avg       0.84      0.93      0.87       577
weighted avg       0.93      0.89      0.90       577

Test data from tensorflow-master
              precision    recall  f1-score   support

           0       1.00      0.77      0.87       134
           1       0.79      1.00      0.88       115

    accuracy                           0.

### 3. Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB


nb = MultinomialNB()

nb.fit(X_train_bow, y_train)

y_pred = nb.predict(X_test_bow)
y_pred_1 = nb.predict(X_1_bow)
y_pred_2 = nb.predict(X_2_bow)
y_pred_3 = nb.predict(X_foss_bow)
y_pred_4 = nb.predict(X_all_bow)

report = classification_report(y_test, y_pred)
report_1 = classification_report(y_1, y_pred_1)
report_2 = classification_report(y_2, y_pred_2)
report_3 = classification_report(y_foss, y_pred_3)
report_4 = classification_report(y_all, y_pred_4)

print('Test data split from the training dataset (which is from fossology-master)')
print(report)
print('Test data from kubernetes-master')
print(report_1)
print('Test data from tensorflow-master')
print(report_2)
print('Test data Provided by Fossology')
print(report_3)
print('Merging all the test data together')
print(report_4)

Test data split from the training dataset (which is from fossology-master)
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2870
           1       0.92      0.94      0.93      1024

    accuracy                           0.96      3894
   macro avg       0.95      0.96      0.95      3894
weighted avg       0.96      0.96      0.96      3894

Test data from kubernetes-master
              precision    recall  f1-score   support

           0       0.97      0.78      0.86       445
           1       0.55      0.91      0.69       132

    accuracy                           0.81       577
   macro avg       0.76      0.85      0.78       577
weighted avg       0.87      0.81      0.82       577

Test data from tensorflow-master
              precision    recall  f1-score   support

           0       0.86      0.80      0.83       134
           1       0.78      0.84      0.81       115

    accuracy                           0.

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [1e-10, 1e-9, 1e-8, 1e-7, 1e-7, 1e-5,
              1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0,
              2.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0],
    'fit_prior': [True, False]
    }

grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, verbose=2, n_jobs=-1)

grid_search.fit(X_grid_search_bow, y_grid_search)

# print the best parameter values and the best cross-validation score
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 38 candidates, totalling 190 fits
[CV] END ........................alpha=1e-10, fit_prior=True; total time=   0.0s
[CV] END ........................alpha=1e-10, fit_prior=True; total time=   0.0s
[CV] END ........................alpha=1e-10, fit_prior=True; total time=   0.0s
[CV] END ........................alpha=1e-10, fit_prior=True; total time=   0.0s
[CV] END ........................alpha=1e-10, fit_prior=True; total time=   0.0s
[CV] END .......................alpha=1e-10, fit_prior=False; total time=   0.0s
[CV] END .......................alpha=1e-10, fit_prior=False; total time=   0.0s
[CV] END .......................alpha=1e-10, fit_prior=False; total time=   0.0s
[CV] END .......................alpha=1e-10, fit_prior=False; total time=   0.0s
[CV] END ........................alpha=1e-09, fit_prior=True; total time=   0.0s
[CV] END .......................alpha=1e-10, fit_prior=False; total time=   0.0s
[CV] END ........................alpha=1e-09, f

In [20]:
# {'alpha': 1e-10, 'fit_prior': False}
# 0.9800880324455674

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB(alpha= 0.01, fit_prior= False)

nb.fit(X_train_bow, y_train)

y_pred = nb.predict(X_test_bow)
y_pred_1 = nb.predict(X_1_bow)
y_pred_2 = nb.predict(X_2_bow)
y_pred_3 = nb.predict(X_foss_bow)
y_pred_4 = nb.predict(X_all_bow)

report = classification_report(y_test, y_pred)
report_1 = classification_report(y_1, y_pred_1)
report_2 = classification_report(y_2, y_pred_2)
report_3 = classification_report(y_foss, y_pred_3)
report_4 = classification_report(y_all, y_pred_4)

print('Test data split from the training dataset (which is from fossology-master)')
print(report)
print('Test data from kubernetes-master')
print(report_1)
print('Test data from tensorflow-master')
print(report_2)
print('Test data Provided by Fossology')
print(report_3)
print('Merging all the test data together')
print(report_4)

Test data split from the training dataset (which is from fossology-master)
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2870
           1       0.93      0.96      0.95      1024

    accuracy                           0.97      3894
   macro avg       0.96      0.97      0.96      3894
weighted avg       0.97      0.97      0.97      3894

Test data from kubernetes-master
              precision    recall  f1-score   support

           0       0.98      0.78      0.87       445
           1       0.56      0.94      0.70       132

    accuracy                           0.82       577
   macro avg       0.77      0.86      0.78       577
weighted avg       0.88      0.82      0.83       577

Test data from tensorflow-master
              precision    recall  f1-score   support

           0       0.86      0.81      0.83       134
           1       0.79      0.85      0.82       115

    accuracy                           0.

In [6]:
X_train_bow.shape

(15573, 14641)

### 4. Recurrent Neural Networks

In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

input_size = 100 # Set desired input size
hidden_size = 50
num_layers = 2
batch_size = 32 # Set batch size

embedding_size = 14641 # Set embedding size to match number of features in X_train_bow

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, embedding_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Linear(embedding_size, input_size)
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        
    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        return output, hidden

rnn = RNNModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, embedding_size=embedding_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

rnn = rnn.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.01)

# Convert sparse matrix to dense and then to PyTorch tensor
X_train_bow_tensor = torch.from_numpy(X_train_bow.toarray())
y_train_tensor = torch.from_numpy(y_train.to_numpy())

# Move data to device
X_train_bow_tensor = X_train_bow_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)

# Convert input data to torch.float32
X_train_bow_tensor = X_train_bow_tensor.float()

# Convert target data to torch.long
y_train_tensor = y_train_tensor.long()

# Create data loader for batching
train_data = TensorDataset(X_train_bow_tensor, y_train_tensor)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

epochs = 10
for epoch in tqdm(range(epochs)):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        output, hidden = rnn(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


y_pred = torch.argmax(output, dim= 1)
#y_pred_1, _ = rnn(X_1_bow)
#y_pred_1 = torch.argmax(y_pred_1, dim= 1)
#y_pred_2, _ = rnn(X_2_bow)
#y_pred_2 = torch.argmax(y_pred_2, dim= 1)
#y_pred_3, _ = rnn(X_foss_bow)
#y_pred_3 = torch.argmax(y_pred_3, dim= 1)
#y_pred_4, _ = rnn(X_all_bow)
#y_pred_4 = torch.argmax(y_pred_4, dim= 1)

report = classification_report(y_test, y_pred)
#report_1 = classification_report(y_1, y_pred_1)
#report_2 = classification_report(y_2, y_pred_2)
#report_3 = classification_report(y_foss, y_pred_3)
#report_4 = classification_report(y_all, y_pred_4)

print('Test data split from the training dataset (which is from fossology-master)')
print(report)
#print('Test data from kubernetes-master')
#print(report_1)
#print('Test data from tensorflow-master') 
#print(report_2) 
#print('Test data Provided by Fossology') 
#print(report_3) 
#print('Merging all the test data together')
#print(report_4)

### 5. LSTMS

### BiLateral LSTMS

## TF IDF 

In [3]:
vectorizer = TfidfVectorizer()

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

X_1_bow = vectorizer.transform(X_1)

X_2_bow = vectorizer.transform(X_2)

X_foss_bow = vectorizer.transform(X_foss)

X_all_bow = vectorizer.transform(X_all)

X_grid_search_bow = vectorizer.transform(X_grid_search)

## 1. SVM

In [4]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_foss_bow)
y_pred_4 = svm.predict(X_all_bow)
report = classification_report(y_test, y_pred)
report_1 = classification_report(y_1, y_pred_1)
report_2 = classification_report(y_2, y_pred_2)
report_3 = classification_report(y_foss, y_pred_3)
report_4 = classification_report(y_all, y_pred_4)
print('Test data split from the training dataset (which is from fossology-master)')
print(report)
print('Test data from kubernetes-master')
print(report_1)
print('Test data from tensorflow-master')
print(report_2)
print('Test data Provided by Fossology')
print(report_3)
print('Merging all the test data together')
print(report_4)

Test data split from the training dataset (which is from fossology-master)
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2870
           1       0.97      0.98      0.97      1024

    accuracy                           0.98      3894
   macro avg       0.98      0.98      0.98      3894
weighted avg       0.98      0.98      0.98      3894

Test data from kubernetes-master
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       445
           1       0.70      0.92      0.80       132

    accuracy                           0.89       577
   macro avg       0.84      0.90      0.86       577
weighted avg       0.91      0.89      0.90       577

Test data from tensorflow-master
              precision    recall  f1-score   support

           0       0.95      0.90      0.92       134
           1       0.89      0.94      0.92       115

    accuracy                           0.

## GloVe

## FastText

## ELMo

## BERT

In [3]:
# Load GloVe embeddings
import numpy as np
def load_glove(file):
    """Load GloVe embeddings from a text file.
    Args:
        file (str): path to the glove file.
    Returns:
        dict: a dictionary mapping words to their vector representations.
    """
    embeddings = {}
    with open(file) as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove50 = load_glove('../glove.6B/glove.6B.50d.txt')
glove100 = load_glove('../glove.6B/glove.6B.100d.txt')
glove200 = load_glove('../glove.6B/glove.6B.200d.txt')
glove300 = load_glove('../glove.6B/glove.6B.300d.txt')