In [None]:
# Import pandas for reading the data, scikit-learn for machine learning, and joblib for saving the model
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import joblib

# Train a RandomForestRegressor pipeline that predicts a CVE's BaseScore and
# save the best grid-searched pipeline to 'final_model_CVE.joblib'.

# Load the CSV file used to train the model (NIST 2020 CVE export).
df = pd.read_csv('output_cve_2020.csv')

# Drop the vector string; it is not used as a feature for now.
df = df.drop(columns='VectorString')

# Coerce the target to numeric so any non-numeric entries become NaN, then fill
# the gaps with the column mean. The result is assigned back instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which warns on pandas 2.x and does not update df at all when
# Copy-on-Write is enabled (leaving NaNs in the target).
df['BaseScore'] = pd.to_numeric(df['BaseScore'], errors='coerce')
df['BaseScore'] = df['BaseScore'].fillna(df['BaseScore'].mean())

# Separate the target from the features.
y = df['BaseScore']
X = df.drop(columns='BaseScore')

# A per-row identifier would be picked up as a categorical feature below and
# one-hot encoded into one column per distinct value, which adds no predictive
# signal and slows the grid search considerably.
# Assumes the identifier column is named 'ID' (as in the prediction cell);
# errors='ignore' makes this a no-op if the CSV has no such column.
X = X.drop(columns='ID', errors='ignore')

# Pick the numeric and categorical feature columns by dtype.
# NOTE(review): columns whose dtype is neither int64/float64 nor object
# (e.g. bool) match neither selector and so are not handed to any transformer
# below -- confirm that is intended.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns: fill missing values with the column mean.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. Categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Full pipeline: preprocessing followed by the regressor. A fixed random_state
# makes the forest (and therefore the selected hyperparameters) reproducible.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(random_state=42))])

# Hyperparameter grid for the regressor (3 * 4 * 3 * 3 = 108 combinations).
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Run a 5-fold cross-validated grid search, scored by negative MSE, using all
# available cores.
print("Starting grid search")
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)
print("Ending grid search")

# Report what the search selected; best_score_ is negated MSE, so flip the sign.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validated MSE: {-grid_search.best_score_}")

# The best pipeline, refitted on the full data by GridSearchCV.
best_model = grid_search.best_estimator_

# Save the final model to a file.
joblib.dump(best_model, 'final_model_CVE.joblib')

print("Training completed.")


Starting grid search


In [None]:
import pandas as pd
import joblib

# Load the trained pipeline written by the training cell.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
final_model = joblib.load('final_model_CVE.joblib')

# Build a single-row frame of input features for one CVE.
# The target column 'BaseScore' is deliberately not included: the training
# cell removes it from the features before fitting, so a placeholder value here
# would only be an unused (and misleading) extra column.
# 'ID' is kept as an ordinary column rather than being used as the index.
# NOTE(review): the 0 values for 'ExploitabilityScore' and 'ImpactScore' look
# like placeholders; they are passed to the model as real inputs -- confirm
# that is intended.
new_data = pd.DataFrame({
    'ID': ['CVE-2020-10072'],
    'Version': [2.0],
    'AccessVector': ['LOCAL'],
    'AccessComplexity': ['LOW'],
    'Authentication': ['NONE'],
    'ConfidentialityImpact': ['PARTIAL'],
    'IntegrityImpact': ['PARTIAL'],
    'AvailabilityImpact': ['PARTIAL'],
    'Severity': ['HIGH'],
    'ExploitabilityScore': [0],
    'ImpactScore': [0],
    'ACInsufInfo': [False],
    'ObtainAllPrivilege': [False],
    'ObtainUserPrivilege': [True],
    'ObtainOtherPrivilege': [False],
    'UserInteractionRequired': [False]
})

# Predict the score for the single input row and print it.
vulnerability_score = final_model.predict(new_data)
print(f'Predicted Vulnerability Score: {vulnerability_score[0]}')
