In [1]:
from pathlib import Path

import pandas as pd

# Load silver datasets

Load Alien Vault silver dataset.

In [2]:
# The Alien Vault silver file is semicolon-delimited; load it and preview the first rows.
df_alien_vault = pd.read_csv(
    "silver/alien_vault/alien_vault.csv",
    sep=";",
)
df_alien_vault.head(n=5)

Unnamed: 0,id,names,descriptions,TLPs,threat_status
0,0.client-channel.google.com,23.219.89.169 dty-274d7ae9-e5e0-48eb-80db-f8d...,174.bm-nginx-loadbalancer.mgmt.sin1.adnexus.n...,['white'],whitelist
1,17track.net,Remote Network Attack | JakyllHyde: Malicious ...,Research shows compromise originated from Sabe...,['green'],whitelist
2,1drv.com,DarkWatchman Chekin Activity Order Brian Sabe...,Brian Sabey & large team continue excessive ...,['green'],whitelist
3,25z5g623wpqpdwis.onion.to,IOC Records Provided by @NextRayAI IOCs Indust...,This IOC report provided and daily updated by ...,['white'],malicious
4,27lelchgcvs2wpm7.3lhjyx.top,TomkompSerwis 5b685b6fd0c356b8389e33596a40c6...,Dŵr dysku zewnętrznego wedi cymryd i'wodraeth ...,['white'],malicious


Load VirusTotal silver dataset.

In [3]:
# The VirusTotal silver file is semicolon-delimited; load it and preview the first rows.
df_virus_total = pd.read_csv(
    "silver/virus_total/virus_total.csv",
    sep=";",
)
df_virus_total.head(n=5)

Unnamed: 0,root_id,whois,reputation,tld,registrar,last_modification_date,expiration_date,last_update_date,creation_date,stats_malicious,...,votes_malicious,tags,threat_status,malicious,suspicious,malware,phishing,not recommended,spam,ranks
0,imagebam.com,Admin City: Praha 4\nAdmin Country: CZ\nAdmin ...,0,com,GRANSY S.R.O D/B/A SUBREG.CZ,1757077000.0,1758527000.0,1726741000.0,1158911000.0,1,...,0,,whitelist,1,0,0,0,0,0,"[1152, 20000, 176451, 4872, 8212]"
1,images-amazon.com,Creation Date: 2004-07-20T23:52:20+0000\nCreat...,0,com,MarkMonitor Inc.,1757080000.0,1776312000.0,1738346000.0,1090368000.0,0,...,0,,whitelist,0,0,0,0,0,0,"[5000, 20354, 273404, 3166]"
2,images.dmca.com,Creation Date: 2000-05-18T12:11:02Z\nDNSSEC: u...,0,com,"GoDaddy.com, LLC",1757082000.0,2031567000.0,1732726000.0,958651900.0,0,...,0,,whitelist,0,0,0,0,0,0,[17899]
3,images.dwell.com,Creation Date: 1999-03-18T05:00:00Z\nDNSSEC: u...,0,com,"NameCheap, Inc.",1757001000.0,1981432000.0,1665813000.0,921733200.0,0,...,0,,whitelist,0,0,0,0,0,0,"[491648, 770756]"
4,images.netdirector.co.uk,Expiry date: 25-Nov-2026\nLast updated: 07-Oct...,0,co.uk,,1757068000.0,,,,0,...,0,,whitelist,0,0,0,0,0,0,[240790]


# Merge datasets
This code merges two DataFrames (`df_virus_total` and `df_alien_vault`) using the `root_id` column from the first and the `id` column from the second, creating a **combined DataFrame**.  

The `how="inner"` parameter specifies an **inner join**, meaning **only rows whose key appears in both DataFrames are kept**; entries present in just one of the two datasets are dropped, so the join itself does not introduce rows padded with `NaN`.

In [4]:
# Inner join: keep only rows whose VirusTotal `root_id` matches an Alien Vault `id`.
# Columns present in both frames (e.g. `threat_status`) receive pandas' default
# `_x` (left / VirusTotal) and `_y` (right / Alien Vault) suffixes.
df_merged = df_virus_total.merge(
    df_alien_vault,
    how="inner",
    left_on="root_id",
    right_on="id",
)

# Data cleaning

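The redundant join key (`id`, identical to `root_id` after the merge) and the VirusTotal-side `threat_status_x` column are dropped, and the remaining columns are **renamed** to have **cleaner, more readable names**; the Alien Vault label (`threat_status_y`) is kept as `threat_status`.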

In [5]:
# `id` repeats `root_id` after the join, and `threat_status_x` is the left-frame
# (VirusTotal) copy of the label column; drop both.
df_merged = df_merged.drop(columns=["threat_status_x", "id"])

In [6]:
# Give the key its final name, strip the merge suffix from the remaining label
# column, and lower-case `TLPs`.
df_merged = df_merged.rename(
    columns={
        "root_id": "id",
        "threat_status_y": "threat_status",
        "TLPs": "tlps",
    }
)

In [7]:
# Show the column labels of the merged frame to check the result of the drop/rename steps.
df_merged.columns

Index(['id', 'whois', 'reputation', 'tld', 'registrar',
       'last_modification_date', 'expiration_date', 'last_update_date',
       'creation_date', 'stats_malicious', 'stats_suspicious',
       'stats_undetected', 'stats_harmless', 'stats_timeout', 'votes_harmless',
       'votes_malicious', 'tags', 'malicious', 'suspicious', 'malware',
       'phishing', 'not recommended', 'spam', 'ranks', 'names', 'descriptions',
       'tlps', 'threat_status'],
      dtype='object')

In [8]:
# Preview the first rows of the merged frame.
df_merged.head()

Unnamed: 0,id,whois,reputation,tld,registrar,last_modification_date,expiration_date,last_update_date,creation_date,stats_malicious,...,suspicious,malware,phishing,not recommended,spam,ranks,names,descriptions,tlps,threat_status
0,images-amazon.com,Creation Date: 2004-07-20T23:52:20+0000\nCreat...,0,com,MarkMonitor Inc.,1757080000.0,1776312000.0,1738346000.0,1090368000.0,0,...,0,0,0,0,0,"[5000, 20354, 273404, 3166]",icon.palantirfoundry.com - Brazzers Porn Remot...,Another strange pulse. Persistent bad actors m...,"['green', 'white']",whitelist
1,images.dmca.com,Creation Date: 2000-05-18T12:11:02Z\nDNSSEC: u...,0,com,"GoDaddy.com, LLC",1757082000.0,2031567000.0,1732726000.0,958651900.0,0,...,0,0,0,0,0,[17899],Android Remotely Cracked: Swipper? | Being Sab...,Targets phone and other devices cracked remote...,['green'],whitelist
2,imasdk.googleapis.com,Creation Date: 2005-01-25T08:00:00+0000\nCreat...,-1,com,MarkMonitor Inc.,1757081000.0,1769364000.0,1735035000.0,1106676000.0,0,...,0,0,0,0,0,"[26255, 23430, 567]",WhinySuckBaby The Best Buy Virus - Spreads Via...,Files from a Virus that has plagued my life fo...,"['green', 'white']",whitelist
3,imgbox.com,Admin City: Praha 4\nAdmin Country: CZ\nAdmin ...,-1,com,GRANSY S.R.O D/B/A SUBREG.CZ,1757076000.0,1770666000.0,1742947000.0,1717286000.0,0,...,0,0,0,0,0,"[5517, 5040, 10000, 7220, 66875]",Foundry • Reflected Networks Pornhub Malvertis...,Foundry ? Pornhub\nsanfoundry.com\ncompliance...,"['green', 'white']",whitelist
4,imgix.net,Admin City: REDACTED FOR PRIVACY\nAdmin Countr...,0,net,GANDI SAS,1757083000.0,1845338000.0,1747697000.0,1308794000.0,0,...,0,0,0,0,0,"[3808, 11204, 1733, 2000, 6324]",Endgame 4 | ThreatIntelligence | Pegasus | Gra...,"Do not access iOCs under any circumstances, ex...","['green', 'white']",whitelist


In [9]:
# Print the per-column dtype and non-null count summary of the merged frame.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3021 entries, 0 to 3020
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3021 non-null   object 
 1   whois                   1328 non-null   object 
 2   reputation              3021 non-null   int64  
 3   tld                     3021 non-null   object 
 4   registrar               1117 non-null   object 
 5   last_modification_date  3021 non-null   float64
 6   expiration_date         1095 non-null   float64
 7   last_update_date        1149 non-null   float64
 8   creation_date           1149 non-null   float64
 9   stats_malicious         3021 non-null   int64  
 10  stats_suspicious        3021 non-null   int64  
 11  stats_undetected        3021 non-null   int64  
 12  stats_harmless          3021 non-null   int64  
 13  stats_timeout           3021 non-null   int64  
 14  votes_harmless          3021 non-null   

In [16]:
# Count rows per `threat_status` label to see how the classes are distributed.
df_merged['threat_status'].value_counts()

threat_status
malicious    1879
whitelist    1142
Name: count, dtype: int64

# Save gold CSV

In [10]:
# Persist the merged dataset as the gold CSV (semicolon-delimited, index omitted).
gold_path = Path("gold/gold.csv")
# `to_csv` raises OSError when the target directory is missing; create `gold/`
# first so the notebook survives Restart & Run All on a fresh checkout.
gold_path.parent.mkdir(parents=True, exist_ok=True)
df_merged.to_csv(gold_path, sep=";", index=False)