In [42]:
# Import modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import os
import re
from tqdm import tqdm
import numpy as np
import nltk

In [2]:
def get_file_paths(dir):
    file_paths = []
    for file in os.listdir(dir):
        file_path = os.path.join(dir, file)
        if os.path.isfile(file_path):
            file_paths.append(file_path)
        elif os.path.isdir(file_path):
            file_paths.extend(get_file_paths(file_path))
    return file_paths

In [3]:
def get_file_text(file_path):
    with open(file_path, "r") as f:
        file_text = f.read()
    if not os.path.splitext(file_path)[1]:
        return file_text, 'NO_EXTENSION' 
    else:
        return file_text, os.path.splitext(file_path)[1][1:]

In [4]:
def optimize_file_text(file_text, extension):
    file_text = re.sub("\s+", " ", file_text)
    file_text = file_text.strip()
    date_pattern = r'\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}\s\w+\s\d{4}'
    file_text =re.sub(date_pattern, 'DATE', file_text)
    year_pattern = r'\d{4}'
    file_text =re.sub(year_pattern, 'YEAR', file_text)

    single_comment_symbols = {
        'py': ['#'],
        'c': ['\/\/'],
        'cc': ['\/\/'],
        'cpp': ['\/\/'],
        'pl': ['#'],
        'h': ['\/\/'],
        'php': ['\/\/', '#'],
        'js': ['\/\/'],
        'java': ['\/\/'],
        'cs': ['\/\/'],
        'rb': ['#'],
        'pl': ['#'],
        'sql': ['--'],
        'js': ['\/\/'],
        'json': ['\/\/'],
        'RC': ['\/\/'],
        'postinst': ['#'],
        'ixx': ['\/\/'],
        'ts': ['\/\/'],
        'ci': ['#'],
        'make': ['#'],
        'hpp': ['\/\/'],
        'conf': [';'],
        'aspx': ['\/\/'],
    }
    multi_comment_symbols = {
        'c': ['\/\*', '\*\/'],
        'cpp': ['\/\*', '\*\/'],
        'h': ['\/\*', '\*\/'],
        'cc': ['\/\*', '\*\/'],
        'php': ['\/\*', '\*\/'],
        'html': ['<!--', '-->'],
        'css': ['\/\*', '\*\/'],
        'js': ['\/\*', '\*\/'],
        'java': ['\/\*', '\*\/'],
        'cs': ['\/\*', '\*\/'],
        'sql': ['\/\*', '\*\/'],
        'py': ['"""', '"""'],
        'RC': ['\/\*', '\*\/'],
        'ixx': ['\/\*', '\*\/'],
        'ts': ['\/\*', '\*\/'],
        'hpp': ['\/\*', '\*\/'],
        'aspx': ['<!--', '-->'],
    }

    commented_lines = []
    found_comments = False

    if extension in single_comment_symbols:
        found_comments = True
        comment_symbols = single_comment_symbols[extension]
        for comment_symbol in comment_symbols:
            matches = re.findall(f"{comment_symbol}(.*)", file_text)
            # Loop through each match
            for match in matches:
                commented_lines.append(match)
    if extension in multi_comment_symbols:
        found_comments = True
        comment_symbols = multi_comment_symbols[extension]
        start_symbol = comment_symbols[0]
        end_symbol = comment_symbols[1]
        matches = re.findall(start_symbol + '.*' + end_symbol, file_text)
        for match in matches:
            commented_lines.append(match)

    if not found_comments:
        return file_text

    return "\n".join(commented_lines)

In [45]:
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
def remove_stop_words(document):
    words = document.split()
    filtered_words = [word for word in words if word not in stop_words]
    filtered_document = ' '.join(filtered_words)
    return filtered_document

[nltk_data] Downloading package stopwords to /home/jimbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
def get_documents(source_dir):
    file_paths = get_file_paths(source_dir)
    documents = []
    for path in tqdm(file_paths):
        try:
            text = get_file_text(path)
            #optimized_text = optimize_file_text(text[0], text[1])
            filtered_text = remove_stop_words(text[0])
            documents.append(filtered_text)
        except:
            pass
    return documents

In [47]:
file_paths = get_file_paths('fossology-master')

In [48]:
documents = get_documents('fossology-master')

100%|██████████| 4400/4400 [00:18<00:00, 242.66it/s] 


In [49]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

In [50]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['00', '000', '0000', ..., 'ﬂooded', 'ﬂush', 'ﬂushes'], dtype=object)

In [22]:
index = feature_names.searchsorted('000c7fc2d306b8f74db81d1b44ce9a96da4cyearyear14cbe4e9d6b82db45aa7')
index

2

In [24]:
row = np.nonzero(X[:, index])[0]
row

array([4293], dtype=int32)

In [28]:
doc_num = row[0]
doc_num

4293

In [30]:
file_paths[doc_num]

'fossology-master/install/scripts/pgsql-conf-fix.sh'

In [51]:
model = LatentDirichletAllocation(n_components=2, random_state=0)
model.fit(X)

In [52]:
topic_word = model.components_
topic_word

array([[1.53109980e+04, 5.50906448e+01, 2.53034990e+04, ...,
        1.47579270e+00, 3.47481815e+00, 1.47579270e+00],
       [3.80019574e+01, 1.69093552e+01, 5.01015501e-01, ...,
        5.24207298e-01, 5.25181846e-01, 5.24207298e-01]])

In [53]:
doc_topic = model.transform(X)
doc_topic

array([[9.99998019e-01, 1.98132442e-06],
       [1.07790481e-01, 8.92209519e-01],
       [3.49132616e-02, 9.65086738e-01],
       ...,
       [7.29470730e-01, 2.70529270e-01],
       [2.26704897e-02, 9.77329510e-01],
       [3.50030840e-01, 6.49969160e-01]])

In [54]:
vocab = vectorizer.get_feature_names_out()

In [55]:
def print_top_words(topic_word, vocab, n_top_words):
    for i, topic_dist in enumerate(topic_word):
        sorted_indices = np.argsort(topic_dist)[::-1]
        top_words = [vocab[index] for index in sorted_indices[:n_top_words]]
        print(f"Topic {i}: {' '.join(top_words)}")

In [56]:
print_top_words(topic_word, vocab, 10)

Topic 0: license this the software copyright 0000 or work you use
Topic 1: fossology master filechecksum noassertion tar src zip file testdata agent_tests


In [57]:
topic_labels = {0: "Copyright", 1: "No Copyright"}
def label_topics(doc_topic, topic_labels):
    labels = []
    for doc_dist in doc_topic:
        max_index = np.argmax(doc_dist)
        label = topic_labels[max_index]
        labels.append(label)
    return labels

In [58]:
labels = label_topics(doc_topic, topic_labels)
labels

['Copyright',
 'No Copyright',
 'No Copyright',
 'Copyright',
 'Copyright',
 'No Copyright',
 'No Copyright',
 'Copyright',
 'Copyright',
 'No Copyright',
 'Copyright',
 'Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'Copyright',
 'No Copyright',
 'Copyright',
 'Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'Copyright',
 'Copyright',
 'Copyright',
 'Copyright',
 'Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'Copyright',
 'No Copyright',
 'No Copyright',
 'Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No Copyright',
 'No C

In [59]:
for i in range(10):
    print(f"Document {i}: {documents[i][:50]}... ({labels[i]})")

Document 0: <?xml version="1.0" encoding="utf-8" ?> <!-- SPDX-... (Copyright)
Document 1: # FOSSology Dockerfile # SPDX-FileCopyrightText: ©... (No Copyright)
Document 2: <?php /* SPDX-FileCopyrightText: © Fossology contr... (No Copyright)
Document 3: <?php /* SPDX-FileCopyrightText: © 2022 Siemens AG... (Copyright)
Document 4: <?php /* SPDX-FileCopyrightText: © 2022 Siemens AG... (Copyright)
Document 5: {# SPDX-FileCopyrightText: © 2022 Siemens AG SPDX-... (No Copyright)
Document 6: <?php /* SPDX-FileCopyrightText: © Fossology contr... (No Copyright)
Document 7: <?php /* Author: Gaurav Mishra <mishra.gaurav@siem... (Copyright)
Document 8: <?php /* Author: Gaurav Mishra <mishra.gaurav@siem... (Copyright)
Document 9: <?php /* SPDX-FileCopyrightText: © 2022 Siemens AG... (No Copyright)
