# **Initialisation**
Initialisation de la session Spark

In [None]:
import os
import findspark

# findspark.init() has to run before the PySpark import: it locates the local
# Spark installation and makes the `pyspark` package importable.
findspark.init()

# The package to import is `pyspark`; PySpark does not ship a top-level `spark`
# module, so `import spark` fails on a fresh kernel.
import pyspark


In [None]:
import configparser

# Create an empty parser; no configuration file is read in this cell.
# NOTE(review): `config` is not referenced by any other visible cell — confirm it is still needed.
config = configparser.ConfigParser()

In [None]:
from pyspark.sql import SparkSession
# Explicit imports instead of `import *`: the wildcard brings in Spark column
# functions named `sum`, `min`, `max`, `round`, ... which silently shadow the
# Python builtins in every later cell. Only `col` and `explode` are used.
from pyspark.sql.functions import col, explode

# Single local Spark session used by the rest of the notebook; getOrCreate()
# reuses an already-running session when the cell is executed again.
spark = SparkSession.builder.master("local").appName("Import").getOrCreate()

# **Extraction**
Premier regard sur la base de données 

In [None]:
# Locations of the NVD JSON feeds (unchanged from the original cell).
MAIN_JSON_PATH = "/home/marins/3A-ENSTA/Big data project/nvdcve-1.1-2022.json"
SRC_DIR = "/home/marins/3A-ENSTA/Big data project/CVE_Analysis/src/"
MAIN_JSON_NAME = "nvdcve-1.1-2022.json"


def load_cve_items(path):
    """Read one NVD JSON feed and return one row per CVE item.

    Parameters
    ----------
    path : str
        Path of the JSON feed to read.

    Returns
    -------
    pyspark.sql.DataFrame
        The fields of `cve.CVE_data_meta` and `impact.baseMetricV3.cvssV3`
        for every entry of `CVE_Items`, without the `version` and
        `vectorString` columns.
    """
    raw = spark.read.option("multiline", "true").json(path)
    # explode() yields one row per element of CVE_Items, in a column named "col".
    items = raw.select(explode(col("CVE_Items")))
    return (
        items.select("col.cve.CVE_data_meta.*", "col.impact.baseMetricV3.cvssV3.*")
        .drop("version")
        .drop("vectorString")
    )


df = load_cve_items(MAIN_JSON_PATH)

# sorted() makes the import order (and therefore the row order) reproducible;
# os.listdir() returns entries in arbitrary order.
for file_name in sorted(os.listdir(SRC_DIR)):
    # Skip the 2022 feed, which is already loaded above. The original test
    # compared the whole list to the file name, so it never skipped anything.
    if file_name == MAIN_JSON_NAME:
        continue
    print("Importing", file_name, "...")
    # unionByName matches columns by name instead of by position, so a feed
    # whose inferred column order differs cannot be silently misaligned.
    df = df.unionByName(load_cve_items(os.path.join(SRC_DIR, file_name)))


print("Schema initial : ")
df.printSchema()


Extraction des données pertinentes via select(). 

In [None]:
# Show the names of the columns kept in the Spark DataFrame.
column_names = df.columns
print("Columns : ", column_names)

Affichage des premières lignes et du nombre de samples à disposition pour se faire une idée.

In [None]:
# Fetch the last row of the Spark DataFrame and print it.
last_rows = df.tail(1)
print(last_rows)

# Machine Learning Prediction
Tentative de prédiction du baseScore d'une CVE via les données cvssV3 ingurgitées.


In [None]:
import pandas as pd

# Convert the Spark DataFrame into a pandas DataFrame.
pandasDF = df.toPandas()
print(pandasDF.columns)

# Number of rows before filtering.
print(len(pandasDF))

# Keep only the rows whose baseScore is present.
has_base_score = pandasDF["baseScore"].notna()
pandasDF = pandasDF.loc[has_base_score]

# Number of rows after filtering.
print(len(pandasDF))

pandasDF.head(3)

In [None]:
# Name of the column the model has to predict.
target_column = "baseScore"

# Columns used as categorical input features.
categorical_columns = [
    "attackComplexity",
    "attackVector",
    "availabilityImpact",
    "confidentialityImpact",
    "integrityImpact",
    "privilegesRequired",
    "scope",
    "userInteraction",
]

# Split the pandas DataFrame into the target series and the feature frame.
target = pandasDF[target_column]
data = pandasDF[categorical_columns]


Affichage des types des différentes colonnes

In [None]:
# Display the dtype of the target series (baseScore).
target.dtypes

In [None]:
# Display the dtype of each feature column.
data.dtypes

In [None]:
import numpy as np
# Uncomment to show every row of the distribution:
# pd.set_option("display.max_rows", None)

# A notebook cell only renders its last expression, so the distribution must be
# printed explicitly; as a bare statement it was computed and then discarded.
print(target.value_counts().sort_index())

# Number of missing values remaining in the target.
int(target.isna().sum())


In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor # very good
from sklearn.linear_model import LinearRegression # very good
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.pipeline import make_pipeline

# Rule of thumb: one-hot encoding for linear models,
# ordinal encoding for tree-based models.

# The `sparse` argument was renamed `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the new name first and fall back to the old one so the
# notebook runs on both old and recent versions.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Dense one-hot encoding of the categorical features followed by a linear regression.
model = make_pipeline(encoder, LinearRegression())
model

In [None]:
from sklearn.model_selection import cross_validate

# Evaluate the pipeline with 5-fold cross-validation.
cv_results = cross_validate(estimator=model, X=data, y=target, cv=5)


In [None]:
# Average of the per-fold test scores returned by cross_validate.
# NOTE(review): no `scoring` argument was passed, so this is presumably the
# estimator's default score (R² for a regressor) — confirm.
cv_results["test_score"].mean()

# Plotting 

Une bonne doc pour expliquer les paramètres du score v3 : https://www.first.org/cvss/specification-document

https://nvd.nist.gov/vuln-metrics/cvss/v3-calculator

In [None]:
import numpy as np
import plotly.express as px

# Columns kept for the plot, in the order they are passed to plotly.
plot_columns = [
    "attackComplexity",
    "attackVector",
    "privilegesRequired",
    "scope",
    "userInteraction",
    "availabilityImpact",
    "integrityImpact",
    "confidentialityImpact",
    "baseScore",
]

# Build the plotting frame in one chain. assign() returns a new DataFrame, so
# the rounded score is never written into a selection of pandasDF (the original
# column assignment triggered pandas' SettingWithCopyWarning).
plotready_PandasDF = (
    pandasDF[categorical_columns + [target_column]]
    # Round each score up to the next integer.
    .assign(baseScore=lambda frame: np.ceil(frame["baseScore"]))
    # Highest scores first.
    .sort_values(by=["baseScore"], ascending=False)
    # drop=True: the old index is not needed as a column.
    .reset_index(drop=True)
    .loc[:, plot_columns]
)



In [None]:
# Display labels (in French) keyed by DataFrame column name.
axis_labels = {
    "baseScore": "Score CVSSV3",
    "attackComplexity": "Complexité",
    "attackVector": "Vecteur d'attaque",
    "availabilityImpact": "Impact : Disponibilité",
    "confidentialityImpact": "Impact : Confidentialité",
    "integrityImpact": "Impact : Intégrité",
    "privilegesRequired": "Privilèges nécessaires",
    "userInteraction": "Interaction utilisateur",
    "baseSeverity": "Sévérité",
    "scope": "Etendue",
}

# Parallel-categories diagram of the plotting frame, coloured by the rounded base score.
fig = px.parallel_categories(
    plotready_PandasDF,
    labels=axis_labels,
    color="baseScore",
    color_continuous_scale=px.colors.sequential.Viridis,
    height=700,
    width=1700,
)
fig.show()