In [4]:
import json

import pandas as pd

In [12]:
# Read the findings dataset, then preview the first rows whose `fp` column equals 1.
df = pd.read_json('dataset.json')
df.loc[df['fp'].eq(1)].head()

Unnamed: 0,id,code_snippet_before,code_snippet_after,secret,fp,repo_path,commit_hash,author,author_email,rule_name,file_path,line,detected_at,confidence,reason
1,2,AWS_SECRET=,,AKIAIOSFODNN7EXAMPLE,1,https://github.com/company/aws-tools,f6g7h8i9j0,Jane Smith,jane@example.com,AWS Access Key,config/credentials.env,15,2023-06-20 10:22:45+00:00,Medium,Example AWS key from documentation
4,5,"password = """,""" # temporary password",letmein123,1,https://github.com/company/test-repo,u1v2w3x4y5,Mike Brown,mike@example.com,Generic Password,scripts/test.py,33,2023-06-20 13:55:27+00:00,Low,Simple password in test script
7,8,// Test API key -,- don't commit to prod,test_abcdef1234567890,1,https://github.com/company/testing,j6k7l8m9n0,Lisa Taylor,lisa@example.com,Test API Key,tests/config.js,7,2023-06-20 16:30:33+00:00,Medium,Explicitly marked as test key
10,11,// Demo only:,,demo_password_123,1,https://github.com/company/demo,y1z2a3b4c5,Thomas White,thomas@example.com,Demo Password,demo/config.json,8,2023-06-20 19:05:47+00:00,Low,Clearly marked as demo credential
13,14,// Example:,,example_aws_key,1,https://github.com/company/docs,n6o7p8q9r0,Patricia Harris,patricia@example.com,Example Key,examples/config.example,5,2023-06-20 22:35:29+00:00,Low,Clearly marked as example


In [14]:
# Keep only the columns that are serialized and sent to the model; every other
# column of `df` (including `fp`) is left out of this frame.
test_df = df.loc[:, [
    'id',
    'code_snippet_before',
    'code_snippet_after',
    'secret',
    'repo_path',
    'commit_hash',
    'author_email',
    'rule_name',
    'file_path',
    'line',
    'detected_at',
]]

In [27]:
def generate_prompt(data):
    """Build the secret-validation prompt for one serialized finding.

    Args:
        data: JSON text to be evaluated. It is appended verbatim after the
            fixed instruction block, so it must be a ``str``.

    Returns:
        The fixed instruction block immediately followed by ``data``.
    """
    # Fixed instruction block: role, evaluation criteria, output schema,
    # worked examples and validation rules. The payload goes right after it.
    instructions = """
**Role**: Act as a senior information security specialist performing automated secret scanning validation with full context awareness.

**Task**: Evaluate each JSON entry's secret field against these criteria:
1. **Secret Characteristics**:
   - Entropy analysis (length ≥15 chars, char diversity for high entropy)
   - Contextual validation (matches secret type expectations)
   - Exclusion of test patterns/placeholders
2. **Metadata Context**:
   - `repo_path`/`file_path` patterns (e.g., test/, examples/)
   - `snippet` context analysis
   - `rule_name` alignment with secret type
3. **Historical Risk**:
   - `author_email` domain reputation
   - `detected_at` recency consideration

**Output Requirements**:
- Strict JSON format new fields:
  - `"False-positive"`: `0` (valid) or `1` (invalid)
  - `"Confidence"`: `"High"`/`"Medium"`/`"Low"`
  - `"Reason"`: Required for false positives (max 15 words)
- No additional commentary

**Input Example**:
[
  {
      "id": 1,
      "repo_path": "https://github.com/company/prod-repo",
      "commit_hash": "a1b2c3d",
      "author": "John Doe",
      "author_email": "john@company.com",
      "rule_name": "High Entropy String Detector",
      "secret": "fds1$ko90EM",
      "snippet": "const apiKey = 'fds1$ko90EM'",
      "file_path": "src/auth/config.js",
      "line": 42,
      "detected_at": "2023-05-15T14:32:11Z"
    }
]

**Output Examples**:
[{
    "False-positive": 0,
    "Confidence": "High",
    "Reason": "Real secret"
}
]
[{
    "False-positive": 1,
    "Confidence": "Medium",
    "Reason": "Mockfile"
}
]


**Validation Rules**:
||| High Confidence Valid Secrets (False-positive=0) |||
- High entropy strings (>64 bits) in production paths
- Cryptographic keys with proper prefixes (ssh-rsa, BEGIN PRIVATE KEY)
- Cloud provider patterns (AKIA[0-9A-Z]{16})

||| False-positive=1 When |||
1. File path contains: /test/, /examples/, /mocks/
2. Snippet shows test context (e.g., "example_key")
3. Author email from test domains (example.com, test.org)
4. Rule mismatch (e.g., "Keyword" rule detecting 32-char hex)

Give output only list of dicts! No comments, no explanation.

**Process this JSON**:
"""
    return instructions + data

In [29]:
import requests

def ask_ollama(data, model="mistral", num_predict=500, timeout=300):
    """Send one finding to a local Ollama server and return the raw model reply.

    Args:
        data: JSON text describing the finding; passed to ``generate_prompt``.
        model: Name of the Ollama model to query.
        num_predict: Upper bound on the number of generated tokens. Raise it
            for models that emit long reasoning before the answer.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The text found under the ``"response"`` key of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no reply arrives within ``timeout`` seconds.
    """
    prompt = generate_prompt(data)

    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": 0.1,
                "top_p": 0.9,
                # Ollama names its generation cap `num_predict`; the previous
                # `max_tokens` key is not an Ollama option, so the limit was
                # never applied.
                "num_predict": num_predict,
            },
        },
        # Without a timeout a stalled server would block the notebook forever.
        timeout=timeout,
    )
    # Surface HTTP errors (e.g. unknown model) directly instead of failing
    # later with a KeyError on the missing "response" field.
    response.raise_for_status()

    return response.json()["response"]

In [31]:
def _extract_verdict(answer):
    """Extract the verdict dict from a raw model reply.

    Handles an optional leading ``<think>...</think>`` section, an optional
    Markdown code fence (with or without the ``json`` tag), and replies that
    are either a list of dicts or a single dict.

    Raises:
        ValueError: If the reply is not valid JSON or holds no dict.
    """
    text = answer.strip()

    # Keep only what follows the last closing think tag. rpartition returns the
    # whole text unchanged when the tag is absent, and does not depend on a
    # newline following the tag.
    text = text.rpartition('</think>')[2].strip()

    # Unwrap a fenced block: ```json ... ``` or a bare ``` ... ```.
    if text.startswith("```") and text.endswith("```"):
        text = text[3:-3]
        if text.startswith("json"):
            text = text[4:]
        text = text.strip()

    verdict = json.loads(text)
    if isinstance(verdict, list):
        if not verdict:
            raise ValueError("model returned an empty list")
        verdict = verdict[0]
    if not isinstance(verdict, dict):
        raise ValueError(f"expected a JSON object, got {type(verdict).__name__}")
    return verdict


parsed_data = []

for _, row in test_df.iterrows():
    row_json = row.to_json(orient='index')
    answer = ask_ollama(row_json)

    try:
        verdict = _extract_verdict(answer)
    except ValueError as exc:
        # json.JSONDecodeError is a ValueError, so malformed JSON lands here too.
        raise ValueError(
            f"Unparseable model reply for id={row['id']}: {answer!r}"
        ) from exc

    # Row fields come second so they win if the model echoes an input key.
    parsed_data.append({**verdict, **json.loads(row_json)})

In [39]:
# Turn the merged model/row records into a frame and preview the first rows
# the model marked with `False-positive` equal to 1.
tested_df = pd.DataFrame(parsed_data)
tested_df.loc[lambda frame: frame['False-positive'].eq(1)].head()

Unnamed: 0,False-positive,Confidence,Reason,id,code_snippet_before,code_snippet_after,secret,repo_path,commit_hash,author_email,rule_name,file_path,line,detected_at
4,1,High,Test file,5,"password = """,""" # temporary password",letmein123,https://github.com/company/test-repo,u1v2w3x4y5,mike@example.com,Generic Password,scripts/test.py,33,1687269327000
7,1,High,Test API Key,8,// Test API key -,- don't commit to prod,test_abcdef1234567890,https://github.com/company/testing,j6k7l8m9n0,lisa@example.com,Test API Key,tests/config.js,7,1687278633000
10,1,High,Demo,11,// Demo only:,,demo_password_123,https://github.com/company/demo,y1z2a3b4c5,thomas@example.com,Demo Password,demo/config.json,8,1687287947000
13,1,High,Mockfile,14,// Example:,,example_aws_key,https://github.com/company/docs,n6o7p8q9r0,patricia@example.com,Example Key,examples/config.example,5,1687300529000
15,1,High,Test Configuration,16,"const testConfig = { apiKey: """,""" };",test_key_123456,https://github.com/company/tests,x6y7z8a9b0,linda@example.com,Test Configuration,test/helpers.js,13,1687308903000


In [41]:
# Join model verdicts to the dataset rows by `id`, keeping just the two flags.
rate = tested_df.merge(df, on='id', how='inner')[['id', 'fp', 'False-positive']]

# Fraction of rows where the model's flag equals the dataset's `fp` value.
int((rate['fp'] == rate['False-positive']).sum()) / len(rate)

0.95