In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
import json
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, plot_confusion_matrix, confusion_matrix
from sklearn.inspection import permutation_importance
import os

# The bundled seaborn style sheets were renamed with a 'seaborn-v0_8' prefix in
# matplotlib 3.6 and the legacy names were later removed. Prefer the new name
# when this matplotlib ships it; otherwise fall back to the legacy 'seaborn'.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

def plot_feature_importances(model, x_test, y_test, n_repeats=20,
                             random_state=42, n_jobs=8,
                             title="Permutation Importances (test set)"):
    """Show a horizontal box plot of permutation importances for ``model``.

    The importances are computed with ``permutation_importance`` on the given
    data and the features are drawn ordered by ascending mean importance.

    Parameters
    ----------
    model
        Fitted estimator, passed straight to ``permutation_importance``.
    x_test
        Feature data. Must expose ``.columns`` indexable by an integer array
        (e.g. a pandas DataFrame); the column names become the box labels.
    y_test
        Target values matching ``x_test``.
    n_repeats : int, default 20
        Number of times each feature is permuted.
    random_state : int, default 42
        Seed forwarded to ``permutation_importance``.
    n_jobs : int, default 8
        Number of parallel jobs forwarded to ``permutation_importance``.
    title : str, default "Permutation Importances (test set)"
        Title of the figure; change it when plotting data other than the
        test set.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    result = permutation_importance(model, x_test, y_test,
                                    n_repeats=n_repeats,
                                    random_state=random_state,
                                    n_jobs=n_jobs)
    # Order features by their mean importance (ascending).
    sorted_idx = result.importances_mean.argsort()

    fig, ax = plt.subplots(figsize=(12, 8))
    # importances is indexed by feature first; transpose so that each
    # feature becomes one box.
    # NOTE(review): newer matplotlib releases rename `labels` to `tick_labels`;
    # switch once the minimum supported matplotlib allows it.
    ax.boxplot(result.importances[sorted_idx].T,
               vert=False, labels=x_test.columns[sorted_idx])
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

# Workflow

## Optional steps done once per new dataset version

1. Load the dataset from JSON into a DataFrame, extracting features along the way
2. Preprocess: fill in missing values and categorize numerical features
3. Split the dataset into train and test parts and save them into their respective files

## Exploratory data analysis

1. Load both the train and test datasets from the files
2. Explore freely. When looking at the target variable, use only the train dataset, so that nothing learned from the test set influences the analysis

In [None]:
# Run the pre-processing step (creates dataset) -- optional, left disabled
#import import_ipynb
#import data_preprocessing

# Directory holding train.csv and test.csv. Set the CERTIFICATES_DATASET_DIR
# environment variable to point somewhere else; the default is the path this
# notebook originally used, so existing setups keep working unchanged.
DATASET_DIR = os.environ.get('CERTIFICATES_DATASET_DIR',
                             '/Users/adam/phd/projects/certificates/dataset')

# Load the dataset, using the 'index' column of each file as the row index
train = pd.read_csv(os.path.join(DATASET_DIR, 'train.csv'), index_col='index')
test = pd.read_csv(os.path.join(DATASET_DIR, 'test.csv'), index_col='index')


In [None]:
# Preview the first rows of the training set
train.head()

### Basic visualisations

In [None]:
# Show histogram
%matplotlib inline
train.hist(bins=20, figsize=(20, 15))
plt.show()

In [None]:
# Show correlations between numerical features and the target feature.
# The numeric/bool columns are selected explicitly: from pandas 2.0 on,
# DataFrame.corr() no longer silently drops non-numeric columns and raises on
# them instead, whereas this selection works on old and new pandas alike.
train.select_dtypes(include=['number', 'bool']).corr()['sec_level_cat'].sort_values(ascending=False)

In [None]:
# List the column labels of the training set
train.columns