### Parsing and Baseline

**Importing Libraries**

In [1]:
!pip install scikit-multilearn



In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy import sparse
import funcs
from skmultilearn.adapt import BRkNNaClassifier
from skmultilearn.adapt import MLkNN
import matplotlib.pyplot as plt

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
import time

from sklearn.metrics import label_ranking_average_precision_score, make_scorer

from itertools import product

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


%load_ext autoreload
%autoreload 2

In [3]:
# Read the training split and the dev (validation) split, in that order.
data, val = map(pd.read_csv, ('train.csv', 'dev.csv'))

In [4]:
# Training split: run the project helper `funcs.data_format` on the raw frame
# to obtain the feature/label pair (x_train, y_train).
x_train, y_train = funcs.data_format(data)
# Convert both into sparse matrices for the multilabel classifiers below.
# NOTE(review): per the original comment, data_format returns dicts; the helper
# is not visible from this notebook, so confirm against funcs.py.
x_train_s, y_train_s = funcs.sparsify(x_train, y_train)

In [5]:
# Validation split (loaded from dev.csv): same formatting and sparsification
# steps as applied to the training split.
x_val, y_val = funcs.data_format(val)
x_val_s, y_val_s = funcs.sparsify(x_val, y_val)

In [7]:
# Manual grid search over random-forest hyperparameters, wrapped in
# BinaryRelevance (one forest per label) and scored on the validation split.
md = [150, 100, 50, 10]  # max_depth candidates
ms = [4, 8, 16, 32]      # min_samples_split candidates
ml = [1, 2, 3]           # min_samples_leaf candidates

# Fixed seed for every forest: without it, repeated runs can produce different
# LRAP values for the same setting, so the comparison is not reproducible.
RANDOM_STATE = 42

# One row per evaluated setting:
#   [max_depth, min_samples_split, min_samples_leaf, LRAP]
# Rows are appended as soon as a setting finishes, so an interrupted run still
# leaves the completed settings available in paramSet.
paramSet = []

for max_depth, min_samples_split, min_samples_leaf in product(md, ms, ml):

    print('Model Parameters: ')
    print('  max_depth = ', max_depth,
          ', min_samples_split = ', min_samples_split,
          ', min_samples_leaf = ', min_samples_leaf, sep='')

    classifier = BinaryRelevance(
        classifier=RandomForestClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=RANDOM_STATE,
        ),
        # [features, labels]: features may stay sparse, labels are densified
        # before being handed to the base classifier.
        require_dense=[False, True],
    )

    classifier.fit(x_train_s, y_train_s)
    y_hat = classifier.predict(x_val_s)

    # NOTE(review): y_hat holds hard label predictions, so LRAP is computed on
    # heavily tied scores. Scoring per-label probabilities instead would give a
    # finer ranking - confirm before changing the metric input.
    metric = label_ranking_average_precision_score(y_val_s.toarray(), y_hat.toarray())

    print('  LRAP = ', metric, sep='')

    paramSet.append([max_depth, min_samples_split, min_samples_leaf, metric])

Model Parameters: 
 max_depth = 10, min_samples_split = 4, min_samples_leaf = 1
Model Parameters: 
 max_depth = 10, min_samples_split = 4, min_samples_leaf = 2
Model Parameters: 
 max_depth = 10, min_samples_split = 4, min_samples_leaf = 3
Model Parameters: 
 max_depth = 10, min_samples_split = 8, min_samples_leaf = 1
Model Parameters: 
 max_depth = 10, min_samples_split = 8, min_samples_leaf = 2
Model Parameters: 
 max_depth = 10, min_samples_split = 8, min_samples_leaf = 3
Model Parameters: 
 max_depth = 10, min_samples_split = 16, min_samples_leaf = 1
Model Parameters: 
 max_depth = 10, min_samples_split = 16, min_samples_leaf = 2
Model Parameters: 
 max_depth = 10, min_samples_split = 16, min_samples_leaf = 3
Model Parameters: 
 max_depth = 10, min_samples_split = 32, min_samples_leaf = 1


KeyboardInterrupt: 

In [None]:
# Select the grid row with the highest validation LRAP.
# Each paramSet row is [max_depth, min_samples_split, min_samples_leaf, LRAP],
# so the metric is at index 3 (index 0 is max_depth, not the score).
if not paramSet:
    raise ValueError('paramSet is empty: run the grid-search cell before selecting the best parameters.')

metricVals = [item[3] for item in paramSet]
# Look the best score up in metricVals (a flat list of scores) and use that
# position to fetch the full row; paramSet itself holds lists, not scores.
bestParams = paramSet[metricVals.index(max(metricVals))]
print('Best Parameters: \n')
print('> max_depth =', bestParams[0])
print('> min_samples_split =', bestParams[1])
print('> min_samples_leaf =', bestParams[2])
print('> LRAP =', bestParams[3])