# **Initialisation**
Initialisation de la session Spark

In [19]:
# findspark locates the local Spark installation and makes pyspark importable;
# it is called here before the pyspark imports in the following cells.
import findspark

findspark.init()

# NOTE(review): the name `spark` is rebound to a SparkSession when the session
# is created further down, so this module import is shadowed from then on —
# confirm whether it is still needed.
import spark


In [20]:
import configparser

# Empty parser: no configuration file is read into it in this cell.
# NOTE(review): no later use of `config` is visible in this notebook — confirm
# whether it is still needed.
config = configparser.ConfigParser()

In [21]:
from pyspark.sql import SparkSession
# NOTE(review): the wildcard import puts every pyspark.sql.functions name into
# the notebook namespace (later cells use `col` and `explode`); some of those
# names coincide with Python builtins such as `sum` and `max`.
from pyspark.sql.functions import *

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Import")
    .getOrCreate()
)

# **Extraction**
Premier regard sur la base de données 

In [22]:
import os

# Location of the NVD CVE JSON feed. The original machine-specific path stays
# the default so existing runs are unchanged; set the NVD_CVE_JSON environment
# variable to run the notebook against a file stored elsewhere.
CVE_JSON_PATH = os.environ.get(
    "NVD_CVE_JSON",
    "/home/marins/3A-ENSTA/Big data project/CVE_Analysis/src/nvdcve-1.1-2016.json",
)

# The feed is a single JSON document spread over many lines, hence "multiline".
raw_df = spark.read.option("multiline", "true").json(CVE_JSON_PATH)

print("Schema initial : ")
raw_df.printSchema()

# One row per element of the CVE_Items array. explode() names the resulting
# column "col", which is the prefix used by the select() in the next cell.
df = raw_df.select(explode(col("CVE_Items")))


Schema initial : 
root
 |-- CVE_Items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- configurations: struct (nullable = true)
 |    |    |    |-- CVE_data_version: string (nullable = true)
 |    |    |    |-- nodes: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- children: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- children: array (nullable = true)
 |    |    |    |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |    |    |    |-- cpe_match: array (nullable = true)
 |    |    |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |    |    |-- cpe23Uri: string (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- cpe_name: array (nullable = true)
 |    |    |    |    |    |    |   

Extraction des données pertinentes via select().

In [23]:
# Flatten the CVE metadata struct and the CVSS v3 metrics struct into top-level
# columns, then discard the two CVSS fields that are not used afterwards.
# Note: this rebinds `df`, so the exploded "col" column is no longer available
# once the cell has run.
df = (
    df.select(
        "col.cve.CVE_data_meta.*",
        "col.impact.baseMetricV3.cvssV3.*",
    )
    .drop("version", "vectorString")
)

print("Columns : ", df.columns)

Columns :  ['ASSIGNER', 'ID', 'attackComplexity', 'attackVector', 'availabilityImpact', 'baseScore', 'baseSeverity', 'confidentialityImpact', 'integrityImpact', 'privilegesRequired', 'scope', 'userInteraction']


Affichage des premières lignes et du nombre de samples à disposition pour se faire une idée.

In [24]:
# Total number of CVE entries, followed by a preview of the first five rows.
row_count = df.count()
print("Number of available rows : ", row_count)
df.show(n=5)

                                                                                

Number of available rows :  10510
+--------------------+-------------+----------------+------------+------------------+---------+------------+---------------------+---------------+------------------+---------+---------------+
|            ASSIGNER|           ID|attackComplexity|attackVector|availabilityImpact|baseScore|baseSeverity|confidentialityImpact|integrityImpact|privilegesRequired|    scope|userInteraction|
+--------------------+-------------+----------------+------------+------------------+---------+------------+---------------------+---------------+------------------+---------+---------------+
|       cve@mitre.org|CVE-2016-0001|            null|        null|              null|     null|        null|                 null|           null|              null|     null|           null|
|secure@microsoft.com|CVE-2016-0002|            HIGH|     NETWORK|              HIGH|      7.5|        HIGH|                 HIGH|           HIGH|              NONE|UNCHANGED|       REQUIRED|
|secur

Essai de la fonction tail pour récupérer des données. Pas idéal. La fonction "collect()" semble bien plus adaptée.

In [25]:
# tail(1) returns the last row of the DataFrame as a list of Row objects.
last_rows = df.tail(1)
print(last_rows)

[Row(ASSIGNER='cve@mitre.org', ID='CVE-2016-9998', attackComplexity='LOW', attackVector='NETWORK', availabilityImpact='NONE', baseScore=6.1, baseSeverity='MEDIUM', confidentialityImpact='LOW', integrityImpact='LOW', privilegesRequired='NONE', scope='CHANGED', userInteraction='REQUIRED')]


                                                                                

# Machine Learning Prediction
Tentative de prédiction du baseScore d'une CVE via les données cvssV3 ingurgitées.


In [26]:
import pandas as pd

# Convert the Spark DataFrame into a pandas DataFrame for use with scikit-learn.
pandasDF = df.toPandas()
print(pandasDF.columns)

# Row count before filtering.
print(len(pandasDF))

# Keep only the CVEs that have a CVSS v3 base score; the original index labels
# are preserved.
pandasDF = pandasDF.dropna(subset=["baseScore"])

# Row count after filtering.
print(len(pandasDF))

pandasDF.head(3)

                                                                                

Index(['ASSIGNER', 'ID', 'attackComplexity', 'attackVector',
       'availabilityImpact', 'baseScore', 'baseSeverity',
       'confidentialityImpact', 'integrityImpact', 'privilegesRequired',
       'scope', 'userInteraction'],
      dtype='object')
10510
9028


Unnamed: 0,ASSIGNER,ID,attackComplexity,attackVector,availabilityImpact,baseScore,baseSeverity,confidentialityImpact,integrityImpact,privilegesRequired,scope,userInteraction
1,secure@microsoft.com,CVE-2016-0002,HIGH,NETWORK,HIGH,7.5,HIGH,HIGH,HIGH,NONE,UNCHANGED,REQUIRED
2,secure@microsoft.com,CVE-2016-0003,LOW,NETWORK,HIGH,9.6,CRITICAL,HIGH,HIGH,NONE,CHANGED,REQUIRED
4,secure@microsoft.com,CVE-2016-0005,LOW,NETWORK,NONE,4.3,MEDIUM,NONE,LOW,NONE,UNCHANGED,REQUIRED


In [27]:
# Value to predict: the CVSS v3 base score.
target_column = "baseScore"
target = pandasDF[target_column]

# Input features: the eight categorical CVSS v3 metrics.
categorical_columns = [
    "attackComplexity",
    "attackVector",
    "availabilityImpact",
    "confidentialityImpact",
    "integrityImpact",
    "privilegesRequired",
    "scope",
    "userInteraction",
]

data = pandasDF[categorical_columns]


Affichage des types des différentes colonnes

In [28]:
# dtype of the target Series (the same value that `.dtypes` reports for a Series).
target.dtype

dtype('float64')

In [29]:
# dtype of each feature column in `data`.
data.dtypes

attackComplexity         object
attackVector             object
availabilityImpact       object
confidentialityImpact    object
integrityImpact          object
privilegesRequired       object
scope                    object
userInteraction          object
dtype: object

In [30]:
import numpy as np

# Distribution of the scores, ordered by score. The result is not displayed
# because it is not the last expression of the cell.
target.value_counts().sort_index()

# Number of missing values left in the target; rows without a base score were
# filtered out earlier, so 0 is expected.
int(np.isnan(target).sum())

0

In [31]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Encoding choice: one-hot encoding for linear models, ordinal encoding for
# tree-based models.
#
# handle_unknown="ignore": during cross-validation a category may appear in a
# test fold without being present in the matching training fold. The default
# ("error") makes that fold fail; with "ignore" the unseen category is encoded
# as an all-zero vector and the fold is still scored.
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
    LinearRegression(),
)
model

In [32]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation of the pipeline on the categorical features.
cv_results = cross_validate(model, X=data, y=target, cv=10)


In [34]:
# Average of the per-fold test scores. No `scoring` argument was passed to
# cross_validate, so this is the estimator's default score (R² for a
# LinearRegression pipeline).
test_scores = cv_results["test_score"]
test_scores.mean()

0.9554594699294275