In [56]:
import pandas as pd
import json
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error

In [2]:
with open('/Users/adam/phd/projects/certificates/certificate_data_complete_processed.json', 'r') as json_file:
    data = json.load(json_file)

### Load json features into pandas dataframe

In [3]:
# Create all features and fill a dataframe with them
feature_dict = {x: [] for x in ['category', 'scheme', 'n_updates', 'sec_level', 'cert_lab', 'n_pages',
                           'cert_date', 'archived_date', 'manufacturer']}
for key, val in data.items():
    cert = data[key]
    feature_dict['category'].append(cert['csv_scan'].get('cc_category', np.nan))
    feature_dict['scheme'].append(cert['csv_scan'].get('cc_scheme', np.nan))
    feature_dict['n_updates'].append(len(cert['csv_scan'].get('maintainance_updates', np.nan)))
    feature_dict['sec_level'].append(cert['csv_scan'].get('cc_security_level', np.nan))
    feature_dict['cert_lab'].append(cert['processed'].get('cert_lab', np.nan))
    feature_dict['cert_date'].append(cert['csv_scan'].get('cc_certification_date', np.nan))
    feature_dict['archived_date'].append(cert['csv_scan'].get('cc_archived_date', np.nan))
    feature_dict['manufacturer'].append(cert['processed'].get('cc_manufacturer_simple', np.nan))

    meta_scan = cert.get('pdfmeta_scan') or {}
    n_pages = meta_scan.get('pdf_number_of_pages', -1)
    feature_dict['n_pages'].append(n_pages)
df = pd.DataFrame(data=feature_dict)

### Pre-process features

In [27]:
# Convert date features to datetime data type
df.cert_date = pd.to_datetime(df.cert_date)
df.archived_date = pd.to_datetime(df.archived_date)
df['cert_year'] = df.cert_date.dt.year

# Drop rows that didn't yet been archived
df = df.loc[~df.archived_date.isnull(), :]

# Select timedelta in months between cert date and archived date
df['n_days_valid'] = df.archived_date - df.cert_date

df.sec_level = df.sec_level.map(lambda x: x.split(',')[0])

### Prepare training and test set

In [39]:
features = df.loc[:, ['category', 'scheme', 'n_updates', 'sec_level', 'n_pages', 'cert_year', 'n_days_valid']]

features.category = features.category.astype('category').cat.codes
features.scheme = features.scheme.astype('category').cat.codes
features.sec_level = features.sec_level.astype('category').cat.codes
features.n_days_valid = features.n_days_valid.apply(lambda x: float(x.days))

labels = np.array(features['n_days_valid']) # we can also predict sec_level for instance
features.drop('sec_level', axis=1, inplace=True)
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (2679, 6)
Training Labels Shape: (2679,)
Testing Features Shape: (670, 6)
Testing Labels Shape: (670,)


### Build RandomForest classifier / regressor

In [37]:
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
acc = accuracy_score(predictions, test_labels, normalize=True)
print(acc)

0.491044776119403


In [58]:
rf = RandomForestRegressor(n_estimators=70)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
mae = mean_absolute_error(predictions, test_labels)
mse = mean_squared_error(predictions, test_labels)
print(f'MAE: {mae}\nMSE: {mse}')

MAE: 2.2754797441364496
MSE: 80.11295278708458


In [62]:
list(rf.feature_importances_)

[7.189824593529989e-06,
 4.314906116538629e-06,
 4.602625812677683e-07,
 1.359120774200665e-05,
 9.330495569624062e-05,
 0.9998811388432708]