
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/GlennChia/50_038_cds_project/blob/master/notebooks/models/colab_gridsearcher.ipynb)

In [1]:
!pip install scikit-multilearn

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |███▊                            | 10kB 21.4MB/s eta 0:00:01[K     |███████▍                        | 20kB 4.1MB/s eta 0:00:01[K     |███████████                     | 30kB 5.8MB/s eta 0:00:01[K     |██████████████▊                 | 40kB 7.4MB/s eta 0:00:01[K     |██████████████████▍             | 51kB 4.9MB/s eta 0:00:01[K     |██████████████████████          | 61kB 5.7MB/s eta 0:00:01[K     |█████████████████████████▊      | 71kB 6.5MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81kB 7.3MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 4.7MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [0]:
from google.colab import files
import numpy as np
import pandas as pd
import glob
import joblib

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import f1_score, hamming_loss, make_scorer, accuracy_score

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from skmultilearn.model_selection import iterative_train_test_split


# IMPORTANT

First, press F12 or Ctrl + Shift + I to open the browser's developer console.

Paste the following code into the console to prevent the Colab session from timing out.

```
// Logs a heartbeat message and clicks the page element matching
// "colab-toolbar-button#connect"; intended to keep the session from timing out.
function ClickConnect() {
  console.log("Working"); 
  document.querySelector("colab-toolbar-button#connect").click() 
}
// Call ClickConnect every 60000 ms (once a minute).
setInterval(ClickConnect,60000)
```

Then run the cell below and click "Allow" when the browser asks whether to allow multiple downloads, to ensure that all models get downloaded.

These temp files can be deleted once the cell has run.

In [3]:
!touch temp1.txt temp2.txt temp3.txt
temp_files = glob.glob("*.txt")
for file in temp_files:
    print(f'Downloading {file} ...')
    files.download(file)

Downloading temp3.txt ...
Downloading temp2.txt ...
Downloading temp1.txt ...


In [4]:
# Location of the processed dataset. DATA_DIR is empty here because
# INPUT_FILE_NAME already holds a complete URL.
DATA_DIR = ""
INPUT_FILE_NAME = "https://github.com/GlennChia/50_038_cds_project/blob/master/data/processed/cleaned_squashed3.parquet?raw=true"

# Read the parquet file into a DataFrame and preview the first rows.
df = pd.read_parquet(f"{DATA_DIR}{INPUT_FILE_NAME}")
df.head()

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,squash_tags,squash2_tags,squash3_tags
0,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0,"[thank, chris, truly, great, honor, opportunit...",thank chris truly great honor opportunity come...,"culture,politics,science,climate change,enviro...","culture,politics,science,global issues,environ...","culture,politics,science,global issues,environ..."
1,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0,"[term, invention, like, tell, tale, favorite, ...",term invention like tell tale favorite project...,"invention,engineering,design,global issues","invention,engineering,design,global issues","invention,design,global issues"
2,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0,"[public, dewey, long, ago, observe, constitute...",public dewey long ago observe constitute discu...,"poverty,economics,culture,politics,policy,glob...","inequality,economics,culture,politics,governme...","inequality,economics,culture,politics,global i..."
3,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0,"[want, start, say, houston, problem, enter, se...",want start say houston problem enter second ge...,"invention,engineering,entrepreneur,design,busi...","invention,engineering,entrepreneur,design,busi...","invention,design,business"
4,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0,"[want, talk, background, idea, car, art, actua...",want talk background idea car art actually mea...,"invention,design,technology,business,art","invention,design,technology,business,art","invention,design,technology,business,art"


In [5]:
# squash3_tags is the label column used further down: drop rows where it is
# missing and renumber the remaining rows from 0.
df = df.dropna(subset=['squash3_tags']).reset_index(drop=True)

# Summarise the first ten columns (non-null counts and dtypes).
df.iloc[:, :10].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2328 entries, 0 to 2327
Data columns (total 10 columns):
speaker                    2328 non-null object
headline                   2328 non-null object
description                2328 non-null object
duration                   2328 non-null object
tags                       2328 non-null object
transcript                 2328 non-null object
WC                         2328 non-null float64
clean_transcript           2328 non-null object
clean_transcript_string    2328 non-null object
squash_tags                2328 non-null object
dtypes: float64(1), object(9)
memory usage: 182.0+ KB


In [0]:
# Features: the clean_transcript_string column as a Series.
X = df.loc[:, 'clean_transcript_string']
# Labels: the squash3_tags column, kept as a one-column DataFrame.
labels = df.loc[:, ['squash3_tags']]

In [0]:
# Turn each comma-separated tag string into a set of tags. Iterating over the
# column directly avoids DataFrame.iterrows(), which builds a Series per row.
y = [set(tag_string.split(',')) for tag_string in labels['squash3_tags']]

# Encode the tag sets as a binary indicator matrix (one column per tag).
mlb = MultiLabelBinarizer()
encoded_y = mlb.fit_transform(y)

In [8]:
print(encoded_y[0])
print(len(encoded_y[0]))
print(mlb.inverse_transform(encoded_y)[:10])

[0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1]
26
[('culture', 'environment', 'global issues', 'politics', 'science', 'technology'), ('design', 'global issues', 'invention'), ('business', 'culture', 'economics', 'global issues', 'inequality', 'politics'), ('business', 'design', 'invention'), ('art', 'business', 'design', 'invention', 'technology'), ('biodiversity', 'invention', 'science', 'technology'), ('entertainment', 'music', 'technology'), ('collaboration', 'culture', 'design'), ('business', 'culture', 'education', 'global issues', 'invention', 'science', 'technology'), ('culture', 'global issues', 'science')]


In [0]:
# iterative_train_test_split is already imported in the import cell at the top
# of the notebook, so it is not re-imported here.

# Seed NumPy's global RNG so the split can be reproduced on a re-run.
# NOTE(review): assumes the iterative stratification draws its randomness
# from NumPy's global RNG -- confirm for the installed scikit-multilearn.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# The transcripts are passed as a single (n_samples, 1) column; 20% of the
# rows go to the test split.
X_train, y_train, X_test, y_test = iterative_train_test_split(
    X.values.reshape(-1, 1), encoded_y, test_size=0.2
)

# Flatten the single feature column back into 1-D Series of transcripts.
X_train = pd.Series(X_train[:, 0])
X_test = pd.Series(X_test[:, 0])

In [10]:
# Number of positive examples per tag in the train split, then the test split.
for split_labels in (y_train, y_test):
    print(split_labels.sum(axis=0))

[108 209 171 115 263 105 130 142 105 868 320 123 164 228 115 174 543 101
 128  91 104 117  99 146 684 630]
[ 39  52  44  33  66  30  33  43  31 238  80  31  42  57  40  44 136  34
  36  37  32  39  27  37 184 157]


## Gridsearch for the best single model for all labels

### References 
http://scikit.ml/api/skmultilearn.problem_transform.br.html

https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

http://scikit.ml/stratification.html

https://stackoverflow.com/questions/12632992/gridsearch-for-an-estimator-inside-a-onevsrestclassifier/12637528#12637528

### Binary Relevance

In [0]:
# Open TODOs:
# 1. Check whether TfidfTransformer(use_idf=False) is the same as plain
#    CountVectorizer output, or whether other settings must be suppressed too.
# 2. Get the scoring function to work (hamming?) -- partly done.
# 3. Balance the class labels.
# 4. Set better param ranges.
# 5. Once the better vectorizer is decided, remove the vectorizer step and use
#    sparse CSR input, which hopefully trains faster.

# Value ranges shared by the classifier grids below.
param_range = list(range(1, 11))
# Shorter alternative: param_range = [1, 2, 3, 4, 5]
param_range_lr = [1.0, 0.5, 0.1]

# Parameter grids -- comment entries out as needed.

# Text-vectorisation settings, combined with every classifier grid.
vectorizer_params = {
    'vectorizer__min_df': np.linspace(0.005, 0.05, 5),
    # Adding bigrams, i.e. (1, 2), uses a very large amount of memory.
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__use_idf': [True, False],
}

# Each grid sets the base classifier of the BinaryRelevance step
# ('clf__classifier') together with that classifier's own parameters.
lr_params = {
    'clf__classifier': [LogisticRegression()],
    'clf__classifier__penalty': ['l1', 'l2'],
    'clf__classifier__C': param_range_lr,
    'clf__classifier__solver': ['liblinear'],
    'clf__classifier__class_weight': ['balanced'],
}

svc_params = {
    'clf__classifier': [SVC()],
    'clf__classifier__kernel': ['linear', 'rbf'],
    # Alternative range for C: np.logspace(-1, 2, 10)
    'clf__classifier__C': param_range,
    # Alternative range for gamma: np.logspace(-1, 1, 10)
    'clf__classifier__gamma': ['auto'],
    'clf__classifier__probability': [True],
    'clf__classifier__class_weight': ['balanced'],
}

rf_params = {
    'clf__classifier': [RandomForestClassifier()],
    'clf__classifier__criterion': ['gini', 'entropy'],
    'clf__classifier__min_samples_leaf': param_range,
    'clf__classifier__max_depth': param_range,
    # Skips the first entry (1) of param_range.
    'clf__classifier__min_samples_split': param_range[1:],
    'clf__classifier__n_estimators': [10],
    'clf__classifier__class_weight': ['balanced'],
}

mnb_params = {
    'clf__classifier': [MultinomialNB()],
    'clf__classifier__alpha': [0.7, 1.0],
}

# One grid per classifier, each merged with the shared vectorizer settings.
parameters = [
    {**vectorizer_params, **classifier_grid}
    for classifier_grid in (lr_params, svc_params, rf_params, mnb_params)
]

# Token counts -> TF-IDF weighting -> binary-relevance classifier.
br_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', BinaryRelevance()),
])

# Grid-search settings.
# Alternative scorers: make_scorer(f1_score, average='micro'), 'f1_micro',
# 'f1_macro', make_scorer(hamming_loss), 'neg_log_loss'.
scoring = 'f1_samples'
folds = 5
njobs = -1

br_model = GridSearchCV(
    estimator=br_pipeline,
    param_grid=parameters,
    scoring=scoring,
    cv=folds,
    n_jobs=njobs,
)

In [0]:
%%time
br_model.fit(X_train,y_train)
print(br_model.best_params_, br_model.best_score_)
pd.DataFrame(br_model.cv_results_)



{'clf__classifier': LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__classifier__C': 1.0, 'clf__classifier__class_weight': 'balanced', 'clf__classifier__penalty': 'l2', 'clf__classifier__solver': 'liblinear', 'tfidf__use_idf': True} 0.5135059678386944
CPU times: user 18.4 s, sys: 4.69 s, total: 23.1 s
Wall time: 5min 45s


In [0]:
# A tag is predicted when its probability is at least t.
t = 0.2 # threshold value

# Threshold the predicted probabilities on the held-out split and score them
# with micro-averaged F1.
y_pred_prob = br_model.predict_proba(X_test)
y_pred_new = (y_pred_prob >= t).astype(int)
score = f1_score(y_true=y_test, y_pred=y_pred_new, average="micro")
print(f"Binary relevance best model's f1-score {score}")

Binary relevance best model's f1-score 0.2394831730769231


In [0]:
# Serialise the fitted grid-search object to disk, then download the file
# through the Colab files helper.
filename = 'best_br_model.joblib'
joblib.dump(value=br_model, filename=filename)
files.download(filename)

### OneVsRest

In [0]:
# Open TODOs:
# 1. Check whether TfidfTransformer(use_idf=False) is the same as plain
#    CountVectorizer output, or whether other settings must be suppressed too.
# 2. Use a proper scoring function -- ideally, predicting relevant labels
#    should matter more than predicting irrelevant ones.
# 3. Balance the class labels.
# 4. Set better param ranges.

# Value ranges shared by the classifier grids below.
param_range = list(range(1, 11))
# Shorter alternative: param_range = [1, 2, 3, 4, 5]
param_range_lr = [1.0, 0.5, 0.1]

# Parameter grids -- comment entries out as needed.

# Text-vectorisation settings, combined with every classifier grid.
vectorizer_params = {
    'vectorizer__min_df': np.linspace(0.005, 0.05, 5),
    # Adding bigrams, i.e. (1, 2), uses a very large amount of memory.
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__use_idf': [True, False],
}

# Each grid sets the wrapped estimator of the OneVsRestClassifier step
# ('clf__estimator') together with that estimator's own parameters.
lr_params = {
    'clf__estimator': [LogisticRegression()],
    'clf__estimator__penalty': ['l1', 'l2'],
    'clf__estimator__C': param_range_lr,
    'clf__estimator__solver': ['liblinear'],
    'clf__estimator__class_weight': ['balanced'],
}

svc_params = {
    'clf__estimator': [SVC()],
    'clf__estimator__kernel': ['linear', 'rbf'],
    # Alternative range for C: np.logspace(-1, 2, 10)
    'clf__estimator__C': param_range,
    # Alternative range for gamma: np.logspace(-1, 1, 10)
    'clf__estimator__gamma': ['auto'],
    'clf__estimator__probability': [True],
    'clf__estimator__class_weight': ['balanced'],
}

rf_params = {
    'clf__estimator': [RandomForestClassifier()],
    'clf__estimator__criterion': ['gini', 'entropy'],
    'clf__estimator__min_samples_leaf': param_range,
    'clf__estimator__max_depth': param_range,
    # Skips the first entry (1) of param_range.
    'clf__estimator__min_samples_split': param_range[1:],
    'clf__estimator__n_estimators': [10],
    'clf__estimator__class_weight': ['balanced'],
}

mnb_params = {
    'clf__estimator': [MultinomialNB()],
    'clf__estimator__alpha': [0.7, 1.0],
}

# One grid per classifier, each merged with the shared vectorizer settings.
parameters = [
    {**vectorizer_params, **classifier_grid}
    for classifier_grid in (lr_params, svc_params, rf_params, mnb_params)
]

# Token counts -> TF-IDF weighting -> one-vs-rest classifier. The
# LogisticRegression passed here is only the initial estimator; the grids
# above replace it.
ovr_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

# Grid-search settings.
# Alternative scorers: make_scorer(f1_score, average='micro'), 'f1_macro',
# make_scorer(hamming_loss) -- does hamming weight relevant and irrelevant
# labels equally? Maybe use precision somewhere.
scoring = 'f1_micro'
folds = 5
njobs = -1

ovr_model = GridSearchCV(
    estimator=ovr_pipeline,
    param_grid=parameters,
    scoring=scoring,
    cv=folds,
    n_jobs=njobs,
)

In [0]:
%%time
ovr_model.fit(X_train,y_train)
print(ovr_model.best_params_, ovr_model.best_score_)
pd.DataFrame(ovr_model.cv_results_)

{'clf__estimator': MultinomialNB(alpha=0.7, class_prior=None, fit_prior=True), 'clf__estimator__alpha': 0.7, 'tfidf__use_idf': True} 0.15590712409736715
CPU times: user 1.52 s, sys: 143 ms, total: 1.66 s
Wall time: 14.2 s


In [0]:
# A tag is predicted when its probability is at least t.
t = 0.1 # threshold value

# Threshold the predicted probabilities on the held-out split and score them
# with micro-averaged F1.
y_pred_prob = ovr_model.predict_proba(X_test)
y_pred_new = (y_pred_prob >= t).astype(int)
score = f1_score(y_true=y_test, y_pred=y_pred_new, average="micro")
print(f"One vs Rest best model's f1-score {score}")

One vs Rest best model's f1-score 0.35597592433361996


In [0]:
# Serialise the fitted grid-search object to disk, then download the file
# through the Colab files helper.
filename = 'best_ovr_model.joblib'
joblib.dump(value=ovr_model, filename=filename)
files.download(filename)

## Gridsearch best model for each tag

https://stackoverflow.com/questions/38555650/try-multiple-estimator-in-one-grid-search



In [0]:
# Value ranges shared by the classifier grids below.
param_range = list(range(1, 11))
# Shorter alternative: param_range = [1, 2, 3, 4, 5]
param_range_lr = [1.0, 0.5, 0.1]

# Parameter grids -- comment entries out as needed.

# Text-vectorisation settings, combined with every classifier grid.
vectorizer_params = {
    'vectorizer__min_df': np.linspace(0.005, 0.05, 5),
    # Adding bigrams, i.e. (1, 2), uses a very large amount of memory.
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__use_idf': [True, False],
}

# Binary-classifier grids: each one replaces the whole 'clf' pipeline step.
# To try a new model, add its grid here and also list it in `parameters`.
lr_params = {
    'clf': [LogisticRegression()],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': param_range_lr,
    'clf__solver': ['liblinear'],
    'clf__class_weight': ['balanced'],
}

svc_params = {
    'clf': [SVC()],
    'clf__kernel': ['linear', 'rbf'],
    # Alternative range for C: np.logspace(-1, 2, 10)
    'clf__C': param_range,
    # Alternative range for gamma: np.logspace(-1, 1, 10)
    'clf__gamma': ['auto'],
    'clf__probability': [True],
    'clf__class_weight': ['balanced'],
}

rf_params = {
    'clf': [RandomForestClassifier()],
    'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_leaf': param_range,
    'clf__max_depth': param_range,
    # Skips the first entry (1) of param_range.
    'clf__min_samples_split': param_range[1:],
    'clf__n_estimators': [15],
    'clf__class_weight': ['balanced'],
}

mnb_params = {
    'clf': [MultinomialNB()],
    'clf__alpha': [0.7, 1.0],
}

# One grid per classifier, each merged with the shared vectorizer settings.
parameters = [
    {**vectorizer_params, **classifier_grid}
    for classifier_grid in (lr_params, svc_params, rf_params, mnb_params)
]

# Token counts -> TF-IDF weighting -> binary classifier. The
# LogisticRegression here is only the initial step; the grids replace it.
per_tag_pipe = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

# Grid-search settings. Alternative scorers: 'balanced_accuracy', 'precision'.
scoring = 'f1'
folds = 10
njobs = -1

per_tag_model = GridSearchCV(
    estimator=per_tag_pipe,
    param_grid=parameters,
    scoring=scoring,
    cv=folds,
    n_jobs=njobs,
)

In [0]:
# Tag names in the column order of the encoded label matrix. Reading them from
# the fitted binarizer avoids hard-coding the number of tags (previously 26)
# and decoding an all-ones row just to recover the names.
tags = mlb.classes_.tolist()
print(tags)
# Column index of one tag, shown as the cell's output.
tags.index('technology')

['activism', 'art', 'biodiversity', 'brain', 'business', 'children', 'collaboration', 'communication', 'community', 'culture', 'design', 'economics', 'education', 'entertainment', 'environment', 'future', 'global issues', 'history', 'humanity', 'inequality', 'invention', 'life', 'music', 'politics', 'science', 'technology']


25

In [0]:
%%time

for index in range(len(tags)):
    print(f"Processing {tags[index]}")
    per_tag_model.fit(X_train, y_train[:, index])
#     display(pd.DataFrame(per_tag_model.cv_results_))
    t = 0.2 #threshold value
    prediction_prob = per_tag_model.predict_proba(X_test)
    prediction = (prediction_prob[:, 1] >= t).astype(int)
    # save model or model params somewhere
    print(f'tag {index}: {tags[index]} best model {per_tag_model.best_params_}')
    print(f'tag {index}: {tags[index]} counts - predicted: {sum(prediction)}, actual: {sum(y_test[:, index])}')
    print(f'tag {index}: {tags[index]} test f1-score is {f1_score(y_test[:, index], prediction, average="binary")}')
    print(f'tag {index}: {tags[index]} test accuracy is {accuracy_score(y_test[:, index], prediction)}')
    filename = f'best_{tags[index]}_model.joblib'
    joblib.dump(per_tag_model, filename)
    print('--------------------------')

Processing activism




tag 0: activism best model {'clf': RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=15, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False), 'clf__class_weight': 'balanced', 'clf__criterion': 'gini', 'clf__min_samples_split': 2, 'clf__n_estimators': 15, 'tfidf__use_idf': True}
tag 0: activism counts - predicted: 59, actual: 38
tag 0: activism test f1-score is 0.20618556701030927
tag 0: activism test accuracy is 0.8415637860082305
--------------------------
Processing art
tag 1: art best model {'clf': RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entro

In [0]:
# Collect every .joblib file in the working directory and show the list.
rf_models = glob.glob(pathname="*.joblib")
print(rf_models)

['best_rf_economics.joblib', 'best_rf_art.joblib', 'best_rf_life.joblib', 'best_rf_biodiversity.joblib', 'best_rf_future.joblib', 'best_rf_environment.joblib', 'best_rf_activism.joblib', 'best_rf_education.joblib', 'best_rf_culture.joblib', 'best_rf_business.joblib', 'best_rf_collaboration.joblib', 'best_rf_music.joblib', 'best_rf_global issues.joblib', 'best_rf_science.joblib', 'best_rf_community.joblib', 'best_rf_brain.joblib', 'best_rf_inequality.joblib', 'best_rf_invention.joblib', 'best_rf_children.joblib', 'best_rf_humanity.joblib', 'best_rf_entertainment.joblib', 'best_rf_politics.joblib', 'best_rf_communication.joblib', 'best_rf_design.joblib', 'best_rf_technology.joblib', 'best_rf_history.joblib']


In [0]:
# Download each saved model file in turn, announcing its 1-based position.
for position, model_path in enumerate(rf_models, start=1):
    print(f'Downloading model {position}: {model_path} ...')
    files.download(model_path)