This repository has been archived by the owner on Aug 29, 2023. It is now read-only.
generated from IBM/repo-template
-
Notifications
You must be signed in to change notification settings - Fork 3
/
scan.go
235 lines (204 loc) · 8.21 KB
/
scan.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
// SPDX-License-Identifier: Apache-2.0
package scanner
import (
"fmt"
"strings"
"github.com/spf13/pflag"
"github.com/IBM/license-scanner/configurer"
"github.com/IBM/license-scanner/identifier"
"github.com/IBM/license-scanner/licenses"
"github.com/IBM/license-scanner/normalizer"
)
// NOASSERTION_SPDX_NAME is the license name recorded in a scan result to signify
// that the license text passed through the scan without any errors but no match was found.
const NOASSERTION_SPDX_NAME = "NOASSERTION"
// ScanSpecs holds the package manager, the programming language, and a list of
// scan specifications, one per package or file to scan.
type ScanSpecs struct {
// PackageManager is the package manager to search for.
// This is the standard package manager, for example, pypi for python, npm for nodejs, etc.
PackageManager string
// Language is the programming language to search for.
Language string
// Specs is the list of scan specifications.
// For a single package manager or a language, specify a list of packages with their respective specifications.
Specs []ScanSpec
// flags is the config flag set. It is assigned by WithFlags and handed to
// configurer.InitConfig by ScanLicenseText.
flags *pflag.FlagSet
}
// ScanSpec holds the specifications used for scanning the incoming package/file.
//
// NOTE(review): of these fields, only LicenseText is read by the scanning code
// visible in this file; the other field comments describe the intended use.
type ScanSpec struct {
// Name is the file name or package name to search for.
// This will also be matched against known package URLs or known file names. If a match is found, the canonical name will be returned in the ScanResult.
Name string
// Version is the package version number to search for.
// If no version is provided, the scanning service defaults to the package manager default, which is mostly the latest version.
Version string
// Location is where the file can be retrieved from or the package can be downloaded from.
// If no location is provided, the package source location is retrieved from the package manager.
// TODO: Resolve - Can we get the file content from the file system or should that be included as part of the specification?
Location string
// PURL is the package URL to search for.
// This is the standardized URL used to identify and locate a software package across many programming languages and package managers.
PURL string
// Hash is the file hash or package hash to search for.
// This will also be matched against known file hashes.
// TODO: Create a proposal for hashing algorithm of a package.
Hash *normalizer.Digest
// LicenseText is the license input text to match and identify against the license data set.
LicenseText string
}
// LicenseChoice is a collection of License info with an expression.
// Either License or Expression must be set, but not both.
// The CycloneDX LicenseChoice is defined here:
// https://github.com/CycloneDX/cyclonedx-go/blob/7d9a5619d767a252b454e8554d0fc986796ef958/cyclonedx.go#L462-L465
type LicenseChoice struct {
// License describes a single identified license.
License *License
// Expression is a license expression, used instead of License.
Expression string
}
// License is a collection of SPDX ID, name, license text, and license URL.
// The CycloneDX License struct is defined here:
// https://github.com/CycloneDX/cyclonedx-go/blob/7d9a5619d767a252b454e8554d0fc986796ef958/cyclonedx.go#L389-L394
type License struct {
// ID is the SPDX license ID.
ID string
// Name is the license name. ScanSpec.ScanLicenseText appends the license
// family in parentheses when one is known, and uses NOASSERTION_SPDX_NAME
// when no license matched.
Name string
// Text is the attached license text.
Text *AttachedText
// URL is the license URL. ScanSpec.ScanLicenseText stores all known URLs
// of a license here, joined with commas.
URL string
}
// AttachedText holds the formatted license text.
// The CycloneDX AttachedText is defined here:
// https://github.com/CycloneDX/cyclonedx-go/blob/7d9a5619d767a252b454e8554d0fc986796ef958/cyclonedx.go#L52-L56
type AttachedText struct {
// Content is the text itself.
Content string
// ContentType describes the type of Content.
ContentType string
// Encoding names the encoding applied to Content.
Encoding string
}
// Licenses is a list of LicenseChoice entries.
type Licenses []LicenseChoice
// ScanResult holds the license identification results for a given package.
type ScanResult struct {
// Spec is the specification from the user to perform the scan.
Spec ScanSpec
// OriginalText is the source text which was matched against the SPDX license data.
OriginalText string
// NormalizedText is the normalized version of the source text, which is compared against the license text.
NormalizedText string
// Hash is the file hash or package hash.
// ScanSpec.ScanLicenseText sets it to the hash held by the normalization
// data of the input text.
Hash *normalizer.Digest
// Error is the error reported during the scan - includes empty license text or too large license text etc.
Error error
// CycloneDXLicenses lists one LicenseChoice per license matched in the
// input text. ScanSpec.ScanLicenseText stores a single entry named
// NOASSERTION_SPDX_NAME when nothing matched.
// NOTE(review): the order of the entries follows the iteration order over
// the identifier's matches - confirm whether that order is stable.
CycloneDXLicenses Licenses
}
// WithFlags sets the flag set from which the config for the scan is
// initialized and returns the receiver, so that calls can be chained.
func (s *ScanSpecs) WithFlags(flags *pflag.FlagSet) *ScanSpecs {
s.flags = flags
return s
}
// ScanLicenseText identifies the licenses in the license text of every spec in
// s.Specs and returns one ScanResult per spec, in the same order as s.Specs.
//
// The config is initialized from s.flags, and the license library is created
// and loaded from that config before any spec is scanned. If one of these
// setup steps fails, its error is returned together with a nil slice.
// Problems with an individual spec are reported through the Error field of
// that spec's ScanResult and do not fail the call.
func (s *ScanSpecs) ScanLicenseText() ([]*ScanResult, error) {
	cfg, err := configurer.InitConfig(s.flags)
	if err != nil {
		return nil, err
	}

	library, err := licenses.NewLicenseLibrary(cfg)
	if err != nil {
		return nil, err
	}
	// load the license data set to compare against
	if err = library.AddAll(); err != nil {
		return nil, err
	}

	// seen is a cache local to this call: it maps the digest of an already
	// scanned license text to its result, so that a text occurring in several
	// specs is identified only once
	seen := map[normalizer.Digest]*ScanResult{}

	var results []*ScanResult
	for i := range s.Specs {
		// scan a copy of the element, exactly as ranging by value would
		spec := s.Specs[i]
		results = append(results, spec.ScanLicenseText(library, seen))
	}
	return results, nil
}
// ScanLicenseText identifies the licenses found in s.LicenseText and returns
// the outcome as a ScanResult. It never returns nil.
//
// licenseLibrary is the license data set to match against. resultsCache maps
// the digest of already scanned text to its result: it is consulted before
// identification runs and updated after a successful identification. A nil
// resultsCache disables caching.
//
// The returned result always describes s itself. On a cache hit the licenses
// (and error, if any) of the cached result are copied into a fresh ScanResult
// instead of returning the cached pointer, so that specs sharing the same
// license text do not end up with the Spec and OriginalText of whichever spec
// was scanned first.
//
// Normalization and identification failures are reported through the Error
// field of the returned result and are not cached. When no license matches,
// the result carries a single license entry named NOASSERTION_SPDX_NAME.
func (s *ScanSpec) ScanLicenseText(licenseLibrary *licenses.LicenseLibrary, resultsCache map[normalizer.Digest]*ScanResult) *ScanResult {
	// create a scanResult with the specifications and licenseText
	r := &ScanResult{
		Spec:              *s,
		OriginalText:      s.LicenseText,
		CycloneDXLicenses: Licenses{},
	}

	// normalize the input license text
	normalizedData := normalizer.NormalizationData{
		OriginalText: s.LicenseText,
	}
	if err := normalizedData.NormalizeText(); err != nil {
		r.Error = err
		return r
	}

	// set the normalized text and hashes
	r.NormalizedText = normalizedData.NormalizedText
	r.Hash = &normalizedData.Hash

	// check the cache in memory if we have seen the same license before and
	// reuse its outcome to avoid running identification again; the licenses
	// are appended to r's own slice so the two results do not share a backing
	// array
	if cached, ok := resultsCache[*r.Hash]; ok && cached != nil {
		r.Error = cached.Error
		r.CycloneDXLicenses = append(r.CycloneDXLicenses, cached.CycloneDXLicenses...)
		return r
	}

	// find the licenses in the normalized text
	results, err := identifier.Identify(identifier.Options{}, licenseLibrary, normalizedData)
	if err != nil {
		r.Error = err
		return r
	}

	if len(results.Matches) == 0 {
		// nothing matched: record NOASSERTION as the license name for this scan
		r.CycloneDXLicenses = append(r.CycloneDXLicenses, LicenseChoice{
			License: &License{
				Name: NOASSERTION_SPDX_NAME,
			},
		})
	} else {
		// add one LicenseChoice per matched SPDX ID
		// NOTE(review): if results.Matches is a map, the order of the entries
		// follows Go's unspecified map iteration order - confirm whether
		// callers need a stable order.
		for id := range results.Matches {
			// look the license up once instead of once per field
			lic := licenseLibrary.LicenseMap[id]

			// add suffix of (family) to the name, if we have a family
			name := lic.LicenseInfo.Name
			if family := lic.LicenseInfo.Family; family != "" {
				name = fmt.Sprintf("%s (%s)", name, family)
			}

			r.CycloneDXLicenses = append(r.CycloneDXLicenses, LicenseChoice{
				License: &License{
					ID:   id,
					Name: name,
					// TODO: verify whether this is acceptable or just expect a single license here
					URL: strings.Join(lic.LicenseInfo.URLs, ","),
					Text: &AttachedText{
						Content:     lic.Text.Content,
						ContentType: lic.Text.ContentType,
						Encoding:    lic.Text.Encoding,
					},
				},
			})
		}
	}

	// populate the results cache to keep the match in memory for the next
	// license match; writing to a nil map would panic, so skip it then
	if resultsCache != nil {
		resultsCache[*r.Hash] = r
	}
	return r
}
// ScanFile returns one ScanResult per entry of s.Specs, in the same order,
// with only the Spec field populated. It returns nil when s.Specs is empty.
//
// NOTE(review): this is a placeholder. The intended behavior is to look up a
// specific file by name to retrieve license data and, if the license data is
// not available, scan the specified file, persist the scanned result into a
// datastore, and return the license data. None of that is implemented here.
func (s *ScanSpecs) ScanFile() []*ScanResult {
	if len(s.Specs) == 0 {
		// keep returning a nil slice for empty input
		return nil
	}

	// the final length is known, so allocate the slice once
	results := make([]*ScanResult, len(s.Specs))
	for i, spec := range s.Specs {
		results[i] = &ScanResult{
			Spec: spec,
		}
	}
	return results
}