In [1]:
import pandas as pd

# Load silver datasets

Load Alien Vault silver dataset.

In [2]:
df_alien_vault = pd.read_csv("silver/alien_vault/alien_vault.csv", sep=';')
df_alien_vault.head()

Unnamed: 0,id,names,descriptions,TLPs,threat_status
0,0.client-channel.google.com,23.219.89.169 dty-274d7ae9-e5e0-48eb-80db-f8d...,174.bm-nginx-loadbalancer.mgmt.sin1.adnexus.n...,['white'],whitelist
1,17track.net,Remote Network Attack | JakyllHyde: Malicious ...,Research shows compromise originated from Sabe...,['green'],whitelist
2,1drv.com,DarkWatchman Chekin Activity Order Brian Sabe...,Brian Sabey & large team continue excessive ...,['green'],whitelist
3,25z5g623wpqpdwis.onion.to,IOC Records Provided by @NextRayAI IOCs Indust...,This IOC report provided and daily updated by ...,['white'],malicious
4,27lelchgcvs2wpm7.3lhjyx.top,TomkompSerwis 5b685b6fd0c356b8389e33596a40c6...,Dŵr dysku zewnętrznego wedi cymryd i'wodraeth ...,['white'],malicious


Load VirusTotal silver dataset.

In [3]:
df_virus_total = pd.read_csv("silver/virus_total/virus_total.csv", sep=';')
df_virus_total.head()

Unnamed: 0,root_id,whois,reputation,tld,registrar,last_modification_date,expiration_date,last_update_date,creation_date,stats_malicious,...,stats_undetected,stats_harmless,stats_timeout,votes_harmless,votes_malicious,categories,tags,threat_status,results,ranks
0,0.client-channel.google.com,Creation Date: 1997-09-15T04:00:00Z\nCreation ...,0,com,MarkMonitor Inc.,1754931941,1852517000.0,1722565000.0,874306800.0,0,...,30,64,0,0,0,"['harmless', 'undetected']",,whitelist,"['clean', 'unrated']",[992903]
1,17track.net,Creation Date: 2011-03-09T01:44:43Z\nDNSSEC: u...,0,net,Alibaba Cloud Computing Ltd. d/b/a HiChina (ww...,1756732487,1867711000.0,1746083000.0,1299635000.0,0,...,29,65,0,0,0,"['harmless', 'undetected']",,whitelist,"['clean', 'unrated']","[38944, 865, 10000, 1532, 9502]"
2,1drv.com,Creation Date: 2013-08-05T18:33:50+0000\nCreat...,-58,com,MarkMonitor Inc.,1756734594,1785955000.0,1751628000.0,1375728000.0,0,...,29,65,0,0,4,"['harmless', 'undetected']",,whitelist,"['clean', 'unrated']","[1000, 295850, 1873, 20789, 44693]"
3,1fichier.com,Admin City: REDACTED FOR PRIVACY\nAdmin Countr...,1,com,ONLINE SAS,1756735976,1765109000.0,1741239000.0,1260188000.0,1,...,29,64,0,1,0,"['harmless', 'malicious', 'undetected']",,whitelist,"['clean', 'unrated', 'malicious']","[20000, 647, 257267, 49174, 7513]"
4,1und1.de,Changed: 2017-05-11T09:15:31+02:00\nDomain: 1u...,1,de,,1756733769,,,,0,...,29,65,0,1,0,"['harmless', 'undetected']",,whitelist,"['clean', 'unrated']","[5000, 5963, 23995, 10222]"


# Merge datasets
This code merges two DataFrames (`df_virus_total` and `df_alien_vault`) using the `root_id` column from the first and the `id` column from the second, creating a **combined DataFrame**.  

The `how="outer"` parameter specifies an **outer join**, meaning **all rows from both DataFrames are kept**; if there is no match, missing values are filled with `NaN`.

In [4]:
df_merged = pd.merge(
    df_virus_total,
    df_alien_vault,
    left_on="root_id",
    right_on="id",
    how="outer"
)

# Data cleaning

Duplicate columns are removed, and columns are **renamed** to have **cleaner, more readable names**.

In [5]:
df_merged = df_merged.drop(['threat_status_x', 'id'], axis=1)

In [6]:
df_merged = df_merged.rename(columns={'root_id': 'id', 'threat_status_y': 'threat_status', 'TLPs': 'tlps'})

In [7]:
df_merged.columns

Index(['id', 'whois', 'reputation', 'tld', 'registrar',
       'last_modification_date', 'expiration_date', 'last_update_date',
       'creation_date', 'stats_malicious', 'stats_suspicious',
       'stats_undetected', 'stats_harmless', 'stats_timeout', 'votes_harmless',
       'votes_malicious', 'categories', 'tags', 'results', 'ranks', 'names',
       'descriptions', 'tlps', 'threat_status'],
      dtype='object')

In [8]:
df_merged.head()

Unnamed: 0,id,whois,reputation,tld,registrar,last_modification_date,expiration_date,last_update_date,creation_date,stats_malicious,...,votes_harmless,votes_malicious,categories,tags,results,ranks,names,descriptions,tlps,threat_status
0,0.client-channel.google.com,Creation Date: 1997-09-15T04:00:00Z\nCreation ...,0.0,com,MarkMonitor Inc.,1754932000.0,1852517000.0,1722565000.0,874306800.0,0.0,...,0.0,0.0,"['harmless', 'undetected']",,"['clean', 'unrated']",[992903],23.219.89.169 dty-274d7ae9-e5e0-48eb-80db-f8d...,174.bm-nginx-loadbalancer.mgmt.sin1.adnexus.n...,['white'],whitelist
1,17track.net,Creation Date: 2011-03-09T01:44:43Z\nDNSSEC: u...,0.0,net,Alibaba Cloud Computing Ltd. d/b/a HiChina (ww...,1756732000.0,1867711000.0,1746083000.0,1299635000.0,0.0,...,0.0,0.0,"['harmless', 'undetected']",,"['clean', 'unrated']","[38944, 865, 10000, 1532, 9502]",Remote Network Attack | JakyllHyde: Malicious ...,Research shows compromise originated from Sabe...,['green'],whitelist
2,1drv.com,Creation Date: 2013-08-05T18:33:50+0000\nCreat...,-58.0,com,MarkMonitor Inc.,1756735000.0,1785955000.0,1751628000.0,1375728000.0,0.0,...,0.0,4.0,"['harmless', 'undetected']",,"['clean', 'unrated']","[1000, 295850, 1873, 20789, 44693]",DarkWatchman Chekin Activity Order Brian Sabe...,Brian Sabey & large team continue excessive ...,['green'],whitelist
3,1fichier.com,Admin City: REDACTED FOR PRIVACY\nAdmin Countr...,1.0,com,ONLINE SAS,1756736000.0,1765109000.0,1741239000.0,1260188000.0,1.0,...,1.0,0.0,"['harmless', 'malicious', 'undetected']",,"['clean', 'unrated', 'malicious']","[20000, 647, 257267, 49174, 7513]",IOC Records Provided by @NextRayAI IOCs Indust...,This IOC report provided and daily updated by ...,['white'],whitelist
4,1und1.de,Changed: 2017-05-11T09:15:31+02:00\nDomain: 1u...,1.0,de,,1756734000.0,,,,0.0,...,1.0,0.0,"['harmless', 'undetected']",,"['clean', 'unrated']","[5000, 5963, 23995, 10222]",,,,


In [9]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3251 entries, 0 to 3250
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      499 non-null    object 
 1   whois                   496 non-null    object 
 2   reputation              499 non-null    float64
 3   tld                     499 non-null    object 
 4   registrar               415 non-null    object 
 5   last_modification_date  499 non-null    float64
 6   expiration_date         416 non-null    float64
 7   last_update_date        416 non-null    float64
 8   creation_date           416 non-null    float64
 9   stats_malicious         499 non-null    float64
 10  stats_suspicious        499 non-null    float64
 11  stats_undetected        499 non-null    float64
 12  stats_harmless          499 non-null    float64
 13  stats_timeout           499 non-null    float64
 14  votes_harmless          499 non-null    

# Save gold CSV

In [10]:
df_merged.to_csv('gold/gold.csv', sep=';', index=False)