References to the documentation describing the structure of the JSON objects:  [Virustotal Domains Object](https://docs.virustotal.com/reference/domains-object)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
import os
import numpy as np
import pandas as pd

In [2]:
os.listdir("bronze/virus_total")

['virus_total_batch_2.ndjson',
 'virus_total_batch_4.ndjson',
 'virus_total_batch_8.ndjson',
 'virus_total_batch_6.ndjson',
 'virus_total_batch_3.ndjson',
 'virus_total_batch_1.ndjson',
 'virus_total_batch_5.ndjson',
 'virus_total_batch_9.ndjson',
 'virus_total_batch_7.ndjson']

# Load VirusTotal JSON files into Spark DataFrame
### last_analysis_results
Represents individual antivirus engine results.
- Key = engine name.

- Values: detection method, engine name, category (malicious, harmless, etc.), and result.

### last_analysis_stats
Overall detection statistics. Counts how many engines flagged the sample as: malicious, suspicious, undetected, harmless, or timeout.

### Main schema:

- last_https_certificate_date → Last SSL certificate date.

- last_analysis_date → Last VirusTotal analysis date.

- whois → WHOIS registration information.

- reputation → Reputation score assigned by VirusTotal.

- jarm → SSL/TLS fingerprint hash.

- last_analysis_stats → Aggregated detection statistics.

- last_analysis_results → Per-engine detection results.

- tld → Top-level domain (.com, .org, etc.).

- registrar → Domain registrar entity.

- popularity_ranks → Rankings from providers (e.g., Alexa, Cisco Umbrella).

- tags → Tags assigned to the resource.

- categories → Classification categories (e.g., phishing, malware).

- total_votes → Community votes (harmless vs malicious).

In [3]:
spark = SparkSession.builder.appName("VirusTotalIngest").getOrCreate()

from pyspark.sql.types import (
    StructType, StructField, StringType, LongType, IntegerType,
    ArrayType, MapType
)

# last_analysis_results schema
last_analysis_results_schema = MapType(
    StringType(),
    StructType([
        StructField("method", StringType(), True),
        StructField("engine_name", StringType(), True),
        StructField("category", StringType(), True),
        StructField("result", StringType(), True),
    ])
)

# last_analysis_stats schema
last_analysis_stats_schema = StructType([
    StructField("malicious", IntegerType(), True),
    StructField("suspicious", IntegerType(), True),
    StructField("undetected", IntegerType(), True),
    StructField("harmless", IntegerType(), True),
    StructField("timeout", IntegerType(), True),
])

# Main schema
schema = StructType([
    StructField("id", StringType(), True),
    StructField("file_extracted", StringType(), True),
    StructField("response", StructType([
        StructField("data", StructType([
            StructField("id", StringType(), True),
            StructField("type", StringType(), True),
            StructField("links", StructType([
                StructField("self", StringType(), True),
            ]), True),
            StructField("attributes", StructType([
                StructField("last_https_certificate_date", LongType(), True),
                StructField("last_analysis_date", LongType(), True),
                StructField("whois", StringType(), True),
                StructField("reputation", IntegerType(), True),
                StructField("jarm", StringType(), True),
                StructField("last_analysis_stats", last_analysis_stats_schema, True),
                StructField("last_analysis_results", last_analysis_results_schema, True),
                StructField("tld", StringType(), True),
                StructField("registrar", StringType(), True),
                StructField("last_dns_records_date", LongType(), True),
                StructField("popularity_ranks", MapType(
                    StringType(),
                    StructType([
                        StructField("rank", LongType(), True),
                        StructField("timestamp", LongType(), True),
                    ])
                ), True),
                StructField("last_modification_date", LongType(), True),
                StructField("expiration_date", LongType(), True),
                StructField("tags", ArrayType(StringType()), True),
                StructField("last_update_date", LongType(), True),
                StructField("categories", MapType(StringType(), StringType()), True),
                StructField("creation_date", LongType(), True),
                StructField("total_votes", StructType([
                    StructField("harmless", IntegerType(), True),
                    StructField("malicious", IntegerType(), True),
                ]), True),
            ]), True),
        ]), True),
    ]), True),
])

df = spark.read.schema(schema).json("bronze/virus_total/*.ndjson")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/10 19:17:50 WARN Utils: Your hostname, MacBook-Pro-de-Macia.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.138 instead (on interface en0)
25/09/10 19:17:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 19:17:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/10 19:17:53 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: bronze/virus_total/*.ndjson.
java.io.FileNotFoundException: File bronze/virus_total/*.ndjson does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileS

At a high level, this code flattens and restructures the nested VirusTotal JSON data into a cleaner, tabular format suitable for analysis in Spark. In short, it transforms the raw, deeply nested VirusTotal JSON into a flat schema with columns ready for querying, aggregations, and exporting

In [4]:
from pyspark.sql.functions import col, to_json, concat_ws

df_flat = df.select(
    col("id").alias("root_id"),
    col("file_extracted"),
    col("response.data.id").alias("domain_id"),
    col("response.data.type").alias("domain_type"),
    col("response.data.links.self").alias("domain_link"),
    
    # atributos simples
    col("response.data.attributes.last_https_certificate_date"),
    col("response.data.attributes.last_analysis_date"),
    col("response.data.attributes.whois"),
    col("response.data.attributes.reputation"),
    col("response.data.attributes.jarm"),
    col("response.data.attributes.tld"),
    col("response.data.attributes.registrar"),
    col("response.data.attributes.last_dns_records_date"),
    col("response.data.attributes.last_modification_date"),
    col("response.data.attributes.expiration_date"),
    col("response.data.attributes.last_update_date"),
    col("response.data.attributes.creation_date"),
    
    # flatten: last_analysis_stats
    col("response.data.attributes.last_analysis_stats.malicious").alias("stats_malicious"),
    col("response.data.attributes.last_analysis_stats.suspicious").alias("stats_suspicious"),
    col("response.data.attributes.last_analysis_stats.undetected").alias("stats_undetected"),
    col("response.data.attributes.last_analysis_stats.harmless").alias("stats_harmless"),
    col("response.data.attributes.last_analysis_stats.timeout").alias("stats_timeout"),
    
    # flatten: total_votes
    col("response.data.attributes.total_votes.harmless").alias("votes_harmless"),
    col("response.data.attributes.total_votes.malicious").alias("votes_malicious"),
    
    # mantenemos last_analysis_results como JSON string
    to_json(col("response.data.attributes.last_analysis_results")).alias("last_analysis_results"),
    
    # flatten extra: popularity_ranks, categories, tags
    to_json(col("response.data.attributes.popularity_ranks")).alias("popularity_ranks"),
    to_json(col("response.data.attributes.categories")).alias("categories"),
    concat_ws(",", col("response.data.attributes.tags")).alias("tags")
)

df_flat.printSchema()
df_flat.show(1, truncate=False)

root
 |-- root_id: string (nullable = true)
 |-- file_extracted: string (nullable = true)
 |-- domain_id: string (nullable = true)
 |-- domain_type: string (nullable = true)
 |-- domain_link: string (nullable = true)
 |-- last_https_certificate_date: long (nullable = true)
 |-- last_analysis_date: long (nullable = true)
 |-- whois: string (nullable = true)
 |-- reputation: integer (nullable = true)
 |-- jarm: string (nullable = true)
 |-- tld: string (nullable = true)
 |-- registrar: string (nullable = true)
 |-- last_dns_records_date: long (nullable = true)
 |-- last_modification_date: long (nullable = true)
 |-- expiration_date: long (nullable = true)
 |-- last_update_date: long (nullable = true)
 |-- creation_date: long (nullable = true)
 |-- stats_malicious: integer (nullable = true)
 |-- stats_suspicious: integer (nullable = true)
 |-- stats_undetected: integer (nullable = true)
 |-- stats_harmless: integer (nullable = true)
 |-- stats_timeout: integer (nullable = true)
 |-- votes

25/09/10 19:17:53 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+------------+-------------------------------------------------------+------------+-----------+------------------------------------------------------+---------------------------+------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

This block of code takes the flattened VirusTotal dataset and defines the final labeled dataset with a new target column.

In [5]:
from pyspark.sql import functions as F

df_final = df_flat.withColumn(
    "threat_status",
    F.when(F.col("file_extracted").contains("black_list"), "malicious")
     .when(F.col("file_extracted").contains("white_list"), "whitelist")
     .otherwise("unknown")
)
df_final = df_final.toPandas()
df_final = df_final.drop(['file_extracted'], axis=1)
df_final.columns

                                                                                

Index(['root_id', 'domain_id', 'domain_type', 'domain_link',
       'last_https_certificate_date', 'last_analysis_date', 'whois',
       'reputation', 'jarm', 'tld', 'registrar', 'last_dns_records_date',
       'last_modification_date', 'expiration_date', 'last_update_date',
       'creation_date', 'stats_malicious', 'stats_suspicious',
       'stats_undetected', 'stats_harmless', 'stats_timeout', 'votes_harmless',
       'votes_malicious', 'last_analysis_results', 'popularity_ranks',
       'categories', 'tags', 'threat_status'],
      dtype='object')

In [6]:
print(df_final['tags'].unique())
print()
print(df_final['tags'].info())

['' 'nxdomain,potential-c2' 'nxdomain' 'dga' 'dynamic-dns' 'potential-c2'
 'hex' 'self-signed,dga' 'self-signed' 'dga,self-signed'
 'nxdomain,potential-c2,dga' 'hex,self-signed,dga' 'hex,dga'
 'nxdomain,dga']

<class 'pandas.core.series.Series'>
RangeIndex: 4148 entries, 0 to 4147
Series name: tags
Non-Null Count  Dtype 
--------------  ----- 
4148 non-null   object
dtypes: object(1)
memory usage: 32.5+ KB
None


In [7]:
df_final = df_final.drop(
    ['domain_type', 'jarm',
     'domain_id', 'domain_link', 'last_https_certificate_date',
     'last_analysis_date', 'last_dns_records_date']
    , axis=1)

df_final.head()

Unnamed: 0,root_id,whois,reputation,tld,registrar,last_modification_date,expiration_date,last_update_date,creation_date,stats_malicious,...,stats_undetected,stats_harmless,stats_timeout,votes_harmless,votes_malicious,last_analysis_results,popularity_ranks,categories,tags,threat_status
0,imagebam.com,Admin City: Praha 4\nAdmin Country: CZ\nAdmin ...,0,com,GRANSY S.R.O D/B/A SUBREG.CZ,1757077000.0,1758527000.0,1726741000.0,1158911000.0,1,...,32,62,0,0,0,"{""Acronis"":{""method"":""blacklist"",""engine_name""...","{""Majestic"":{""rank"":8212,""timestamp"":175699668...","{""alphaMountain.ai"":""Media Sharing, Mixed Cont...",,whitelist
1,images-amazon.com,Creation Date: 2004-07-20T23:52:20+0000\nCreat...,0,com,MarkMonitor Inc.,1757080000.0,1776312000.0,1738346000.0,1090368000.0,0,...,30,65,0,0,0,"{""Acronis"":{""method"":""blacklist"",""engine_name""...","{""Majestic"":{""rank"":3166,""timestamp"":175699668...","{""alphaMountain.ai"":""Shopping (alphaMountain.a...",,whitelist
2,images.dmca.com,Creation Date: 2000-05-18T12:11:02Z\nDNSSEC: u...,0,com,"GoDaddy.com, LLC",1757082000.0,2031567000.0,1732726000.0,958651900.0,0,...,30,65,0,0,0,"{""Acronis"":{""method"":""blacklist"",""engine_name""...","{""Cisco Umbrella"":{""rank"":17899,""timestamp"":17...",{},,whitelist
3,images.dwell.com,Creation Date: 1999-03-18T05:00:00Z\nDNSSEC: u...,0,com,"NameCheap, Inc.",1757001000.0,1981432000.0,1665813000.0,921733200.0,0,...,32,63,0,0,0,"{""Acronis"":{""method"":""blacklist"",""engine_name""...","{""Majestic"":{""rank"":491648,""timestamp"":1590939...",{},,whitelist
4,images.netdirector.co.uk,Expiry date: 25-Nov-2026\nLast updated: 07-Oct...,0,co.uk,,1757068000.0,,,,0,...,32,63,0,0,0,"{""Acronis"":{""method"":""blacklist"",""engine_name""...","{""Cisco Umbrella"":{""rank"":240790,""timestamp"":1...",{},,whitelist


# Flattening and Cleaning JSON Columns

## last_analysis_results column
- last_analysis_results: <dictionary> result from URL scanners. dict with scanner name as key and a dict with notes/result from that scanner as value.
    - category: <string> normalised result. can be:
    - "harmless" (site is not malicious),
    - "undetected" (scanner has no opinion about this site),
    - "suspicious" (scanner thinks the site is suspicious),
    - "malicious" (scanner thinks the site is malicious).
    - engine_name: <string> complete name of the URL scanning service.
    - engine_version: <string> engine version value, in case it reports that data.
    - method: <string> type of service given by that URL scanning service (i.e. "blacklist").
    - result: <string> raw value returned by the URL scanner ("clean", "malicious", "suspicious", "phishing"). It may vary from scanner to scanner, hence the need for the "category" field for normalisation.

In [8]:
import json
print(df_final['last_analysis_results'][0])

{"Acronis":{"method":"blacklist","engine_name":"Acronis","category":"harmless","result":"clean"},"0xSI_f33d":{"method":"blacklist","engine_name":"0xSI_f33d","category":"undetected","result":"unrated"},"Abusix":{"method":"blacklist","engine_name":"Abusix","category":"harmless","result":"clean"},"ADMINUSLabs":{"method":"blacklist","engine_name":"ADMINUSLabs","category":"harmless","result":"clean"},"Axur":{"method":"blacklist","engine_name":"Axur","category":"undetected","result":"unrated"},"ChainPatrol":{"method":"blacklist","engine_name":"ChainPatrol","category":"undetected","result":"unrated"},"Criminal IP":{"method":"blacklist","engine_name":"Criminal IP","category":"undetected","result":"unrated"},"AILabs (MONITORAPP)":{"method":"blacklist","engine_name":"AILabs (MONITORAPP)","category":"harmless","result":"clean"},"AlienVault":{"method":"blacklist","engine_name":"AlienVault","category":"harmless","result":"clean"},"alphaMountain.ai":{"method":"blacklist","engine_name":"alphaMountain

In [9]:
def get_all_posible_category_names(df_last_analysis_results):
    category_names = []
    for row in df_last_analysis_results:
        data = json.loads(row)
        for engine, details in data.items():
            category = details['category']
            if category not in category_names:
                category_names.append(category)
    return category_names

print(get_all_posible_category_names(df_final['last_analysis_results']))

['harmless', 'undetected', 'malicious', 'suspicious']


The information I’m interested in from the **`last_analysis_results`** column (which is a JSON) is already captured in the columns **`stats_malicious`**, **`stats_suspicious`**, **`stats_undetected`**, and **`stats_harmless`**.  

Therefore, we will delete the **`last_analysis_results`** column.


In [10]:
def get_all_posible_results_names(df_last_analysis_results):
    category_names = []
    for row in df_last_analysis_results:
        data = json.loads(row)
        for engine, details in data.items():
            category = details['result']
            if category not in category_names:
                category_names.append(category)
    return category_names

all_posible_results_names = get_all_posible_results_names(df_final['last_analysis_results'])
all_posible_results_names

['clean',
 'unrated',
 'malicious',
 'suspicious',
 'malware',
 'phishing',
 'not recommended',
 'spam']

In [11]:
results_columns = []
for index, row in df_final.iterrows():
    json_result = json.loads(row['last_analysis_results'])
    counter = np.zeros(len(all_posible_results_names), dtype=IntegerType)
    for engine, value in json_result.items():
        idx = all_posible_results_names.index(value['result'])
        counter[idx] += 1
    results_columns.append(counter)

In [12]:
counts_df = pd.DataFrame(results_columns, columns=all_posible_results_names, index=df_final.index)
df_final = pd.concat([df_final, counts_df], axis=1)

In [13]:
df_final.head()

Unnamed: 0,root_id,whois,reputation,tld,registrar,last_modification_date,expiration_date,last_update_date,creation_date,stats_malicious,...,tags,threat_status,clean,unrated,malicious,suspicious,malware,phishing,not recommended,spam
0,imagebam.com,Admin City: Praha 4\nAdmin Country: CZ\nAdmin ...,0,com,GRANSY S.R.O D/B/A SUBREG.CZ,1757077000.0,1758527000.0,1726741000.0,1158911000.0,1,...,,whitelist,62,32,1,0,0,0,0,0
1,images-amazon.com,Creation Date: 2004-07-20T23:52:20+0000\nCreat...,0,com,MarkMonitor Inc.,1757080000.0,1776312000.0,1738346000.0,1090368000.0,0,...,,whitelist,65,30,0,0,0,0,0,0
2,images.dmca.com,Creation Date: 2000-05-18T12:11:02Z\nDNSSEC: u...,0,com,"GoDaddy.com, LLC",1757082000.0,2031567000.0,1732726000.0,958651900.0,0,...,,whitelist,65,30,0,0,0,0,0,0
3,images.dwell.com,Creation Date: 1999-03-18T05:00:00Z\nDNSSEC: u...,0,com,"NameCheap, Inc.",1757001000.0,1981432000.0,1665813000.0,921733200.0,0,...,,whitelist,63,32,0,0,0,0,0,0
4,images.netdirector.co.uk,Expiry date: 25-Nov-2026\nLast updated: 07-Oct...,0,co.uk,,1757068000.0,,,,0,...,,whitelist,63,32,0,0,0,0,0,0


In [14]:
# View duplicated content columns
duplicadas = df_final.T.duplicated()

# Print columns duplicated
print(duplicadas)

# Get colnames duplicated
cols_duplicadas = duplicadas[duplicadas].index.tolist()
print("Columnas duplicadas:", cols_duplicadas)

root_id                   False
whois                     False
reputation                False
tld                       False
registrar                 False
last_modification_date    False
expiration_date           False
last_update_date          False
creation_date             False
stats_malicious           False
stats_suspicious          False
stats_undetected          False
stats_harmless            False
stats_timeout             False
votes_harmless            False
votes_malicious           False
last_analysis_results     False
popularity_ranks          False
categories                False
tags                      False
threat_status             False
clean                      True
unrated                    True
malicious                 False
suspicious                False
malware                   False
phishing                  False
not recommended           False
spam                      False
dtype: bool
Columnas duplicadas: ['clean', 'unrated']


In [15]:
# Remove duplicated columns
df_final = df_final.drop(['clean', 'unrated'], axis=1)

# Drop unused column
df_final = df_final.drop('last_analysis_results', axis=1)

## popularity_ranks column

- popularity_ranks: <dictionary> domain's position in popularity ranks such as Alexa, Quantcast, Statvoo, etc. Every dictionary contains the following subfields:
    - rank: <integer> rank position.
    - timestamp: <integer> UTC timestamp when the rank was ingested.

In [16]:
import json
json_formatted_str = json.dumps(df_final['popularity_ranks'][0], indent=2)
json_formatted_str

'"{\\"Majestic\\":{\\"rank\\":8212,\\"timestamp\\":1756996686},\\"Statvoo\\":{\\"rank\\":1152,\\"timestamp\\":1684169881},\\"Alexa\\":{\\"rank\\":1152,\\"timestamp\\":1684083481},\\"Cisco Umbrella\\":{\\"rank\\":176451,\\"timestamp\\":1756996691},\\"Quantcast\\":{\\"rank\\":4872,\\"timestamp\\":1585841763},\\"Cloudflare Radar\\":{\\"rank\\":20000,\\"timestamp\\":1756996684}}"'

In [17]:
def get_popularity_ranks(json_ranks):
    data = json.loads(json_ranks)
    ranks = {row["rank"] for row in data.values()}
    return list(ranks)

get_popularity_ranks(df_final['popularity_ranks'][10])

[26255, 23430, 567]

Here we’re working with the popularity_ranks column, which is a JSON string containing rankings of a domain from different sources (like Alexa, Cisco Umbrella, etc.).

- Each source gives its own rank, and there might be duplicates or multiple ranks across sources.

What this code does:

1. Parse the JSON for each row.

2. Extract all the rank values and collect them into a set to remove duplicates.

3. Convert the set to a list and save it in a new column called ranks.

4. Drop the original popularity_ranks JSON column, since all the useful info is now summarized in ranks.

Result: for each domain, you now have a clean array of ranks from all sources, easy to work with in analysis, instead of keeping the messy nested JSON.

In [18]:
df_final["ranks"] = df_final.apply(lambda row: (get_popularity_ranks(row["popularity_ranks"])), axis=1)
df_final = df_final.drop(['popularity_ranks'], axis=1)

# Categories colum

In [19]:
df_final['categories'][1]

'{"alphaMountain.ai":"Shopping (alphaMountain.ai)","BitDefender":"onlinephotos","Sophos":"online shopping","Forcepoint ThreatSeeker":"web images"}'

In [20]:
df_final = df_final.drop('categories', axis=1)

# Final summary

In [21]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4148 entries, 0 to 4147
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   root_id                 4148 non-null   object 
 1   whois                   4051 non-null   object 
 2   reputation              4148 non-null   int32  
 3   tld                     4148 non-null   object 
 4   registrar               1950 non-null   object 
 5   last_modification_date  4143 non-null   float64
 6   expiration_date         1913 non-null   float64
 7   last_update_date        1972 non-null   float64
 8   creation_date           1973 non-null   float64
 9   stats_malicious         4148 non-null   int32  
 10  stats_suspicious        4148 non-null   int32  
 11  stats_undetected        4148 non-null   int32  
 12  stats_harmless          4148 non-null   int32  
 13  stats_timeout           4148 non-null   int32  
 14  votes_harmless          4148 non-null   

In [22]:
df_final.columns

Index(['root_id', 'whois', 'reputation', 'tld', 'registrar',
       'last_modification_date', 'expiration_date', 'last_update_date',
       'creation_date', 'stats_malicious', 'stats_suspicious',
       'stats_undetected', 'stats_harmless', 'stats_timeout', 'votes_harmless',
       'votes_malicious', 'tags', 'threat_status', 'malicious', 'suspicious',
       'malware', 'phishing', 'not recommended', 'spam', 'ranks'],
      dtype='object')

In [26]:
df_final.describe()

Unnamed: 0,reputation,last_modification_date,expiration_date,last_update_date,creation_date,stats_malicious,stats_suspicious,stats_undetected,stats_harmless,stats_timeout,votes_harmless,votes_malicious,malicious,suspicious,malware,phishing,not recommended,spam
count,4148.0,4143.0,1913.0,1972.0,1973.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0
mean,0.178158,1754257000.0,1799962000.0,1725423000.0,1136875000.0,2.516635,0.344503,33.069672,58.828351,0.0,0.60704,0.327387,1.455159,0.34161,0.217936,0.843539,0.001929,0.000964
std,17.031916,13317870.0,67197280.0,36448150.0,283182100.0,3.41156,0.525846,9.357473,9.974452,0.0,8.717293,2.168866,1.76239,0.522219,0.476348,1.772268,0.043879,0.038025
min,-131.0,1555788000.0,1501509000.0,1442020000.0,511592400.0,0.0,0.0,26.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1755401000.0,1768100000.0,1722565000.0,892785600.0,0.0,0.0,30.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1756756000.0,1778620000.0,1736504000.0,1122551000.0,0.0,0.0,32.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1757070000.0,1801268000.0,1745757000.0,1362772000.0,4.0,1.0,33.0,64.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0
max,642.0,1757432000.0,2350237000.0,1756837000.0,1756512000.0,15.0,3.0,95.0,69.0,0.0,447.0,74.0,7.0,3.0,4.0,8.0,1.0,2.0


In [23]:
df_final.head()

Unnamed: 0,root_id,whois,reputation,tld,registrar,last_modification_date,expiration_date,last_update_date,creation_date,stats_malicious,...,votes_malicious,tags,threat_status,malicious,suspicious,malware,phishing,not recommended,spam,ranks
0,imagebam.com,Admin City: Praha 4\nAdmin Country: CZ\nAdmin ...,0,com,GRANSY S.R.O D/B/A SUBREG.CZ,1757077000.0,1758527000.0,1726741000.0,1158911000.0,1,...,0,,whitelist,1,0,0,0,0,0,"[1152, 20000, 176451, 4872, 8212]"
1,images-amazon.com,Creation Date: 2004-07-20T23:52:20+0000\nCreat...,0,com,MarkMonitor Inc.,1757080000.0,1776312000.0,1738346000.0,1090368000.0,0,...,0,,whitelist,0,0,0,0,0,0,"[5000, 20354, 273404, 3166]"
2,images.dmca.com,Creation Date: 2000-05-18T12:11:02Z\nDNSSEC: u...,0,com,"GoDaddy.com, LLC",1757082000.0,2031567000.0,1732726000.0,958651900.0,0,...,0,,whitelist,0,0,0,0,0,0,[17899]
3,images.dwell.com,Creation Date: 1999-03-18T05:00:00Z\nDNSSEC: u...,0,com,"NameCheap, Inc.",1757001000.0,1981432000.0,1665813000.0,921733200.0,0,...,0,,whitelist,0,0,0,0,0,0,"[491648, 770756]"
4,images.netdirector.co.uk,Expiry date: 25-Nov-2026\nLast updated: 07-Oct...,0,co.uk,,1757068000.0,,,,0,...,0,,whitelist,0,0,0,0,0,0,[240790]


# Save dataframe
Save silver dataset for VirusTotal

In [24]:
df_final.to_csv('silver/virus_total/virus_total.csv', sep=';', index=False)