In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import f1_score, hamming_loss, make_scorer, accuracy_score

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from skmultilearn.model_selection import iterative_train_test_split


In [2]:
DATA_DIR = "../../data/processed/"
INPUT_FILE_NAME = 'cleaned_squashed1.parquet'
df = pd.read_parquet(DATA_DIR + INPUT_FILE_NAME)
df.head()

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,squash_tags
0,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0,"[thank, chris, truly, great, honor, opportunit...",thank chris truly great honor opportunity come...,"culture,politics,science,climate change,enviro..."
1,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0,"[term, invention, like, tell, tale, favorite, ...",term invention like tell tale favorite project...,"invention,engineering,design,global issues"
2,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0,"[public, dewey, long, ago, observe, constitute...",public dewey long ago observe constitute discu...,"poverty,economics,culture,politics,policy,glob..."
3,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0,"[want, start, say, houston, problem, enter, se...",want start say houston problem enter second ge...,"invention,engineering,entrepreneur,design,busi..."
4,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0,"[want, talk, background, idea, car, art, actua...",want talk background idea car art actually mea...,"invention,design,technology,business,art"


In [3]:
df = df.dropna(subset=['squash_tags'])
df = df.reset_index(drop=True)
df.iloc[:,:10].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2378 entries, 0 to 2377
Data columns (total 10 columns):
speaker                    2378 non-null object
headline                   2378 non-null object
description                2378 non-null object
duration                   2378 non-null object
tags                       2378 non-null object
transcript                 2378 non-null object
WC                         2378 non-null float64
clean_transcript           2378 non-null object
clean_transcript_string    2378 non-null object
squash_tags                2378 non-null object
dtypes: float64(1), object(9)
memory usage: 185.9+ KB


In [4]:
X = df['clean_transcript_string']
labels = df[['squash_tags']]

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

y = []
for index, row in labels.iterrows():
    y.append(set(row['squash_tags'].split(',')))
    
mlb = MultiLabelBinarizer()
encoded_y = mlb.fit_transform(y)

In [6]:
print(encoded_y[0])
print(len(encoded_y[0]))
print(mlb.inverse_transform(encoded_y)[:10])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0]
100
[('climate change', 'culture', 'environment', 'global issues', 'politics', 'science', 'sustainability', 'technology'), ('design', 'engineering', 'global issues', 'invention'), ('business', 'culture', 'economics', 'entrepreneur', 'global development', 'global issues', 'policy', 'politics', 'poverty'), ('business', 'design', 'engineering', 'entrepreneur', 'invention'), ('art', 'business', 'design', 'invention', 'technology'), ('biodiversity', 'biology', 'biotech', 'ecology', 'entrepreneur', 'genetics', 'invention', 'oceans', 'science', 'technology'), ('computers', 'entertainment', 'media', 'music', 'performance', 'technology'), ('architecture', 'cities', 'collaboration', 'culture', 'design'), ('business', 'education', 'innovation', 'invention', 'robots', 'science', 'social change', '

In [7]:
from collections import Counter
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from skmultilearn.model_selection import iterative_train_test_split

X_train, y_train, X_test, y_test = iterative_train_test_split(X.values.reshape(len(X.values), 1), encoded_y, test_size = 0.5)
X_train = pd.DataFrame(X_train)[0]
X_test = pd.DataFrame(X_test)[0]

## Gridsearch for the best single model for all labels

### References 
http://scikit.ml/api/skmultilearn.problem_transform.br.html

https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

http://scikit.ml/stratification.html

https://stackoverflow.com/questions/12632992/gridsearch-for-an-estimator-inside-a-onevsrestclassifier/12637528#12637528

### Binary Relevance

In [8]:
# TODO: 
# 1. Check if TfidfTransformer use_idf=False is the same as Countvectorizer? or there are other metrics to suppress
# 2. Get scoring function to work, hamming? -- kinda done
# 3. Balanced class labels
# 4. Set better param ranges
# 5. Remove vectorizer step once we decide on which is better, then use sparse csr and hopefully it trains faster

# param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range = [1, 2, 3, 4, 5]
param_range_lr = [1.0, 0.5, 0.1]

# Set params, comment out as see fit

vectorizer_params = {
#     'vectorizer__min_df': np.linspace(0.005, 0.05, 5),
#     'vectorizer__ngram_range': [(1, 1), (1, 2)], # This shit blows up your memory
#     'tfidf__norm': ('l1', 'l2'),
    'tfidf__use_idf': [True, False],
}

lr_params = {
    'clf__classifier': [LogisticRegression()],
        'clf__classifier__penalty': ['l1', 'l2'],
        'clf__classifier__C': param_range_lr,
        'clf__classifier__solver': ['liblinear']
}

svc_params = {
    'clf__classifier': [SVC()],
        'clf__classifier__kernel': ['linear', 'rbf'],
        'clf__classifier__C': param_range, # np.logspace(-1, 2, 10),
        'clf__classifier__gamma': ['auto'], # np.logspace(-1, 1, 10)
        'clf__classifier__probability': [True],
}

rf_params = {
    'clf__classifier': [RandomForestClassifier()],
        'clf__classifier__criterion': ['gini', 'entropy'],
        'clf__classifier__min_samples_leaf': param_range,
        'clf__classifier__max_depth': param_range,
        'clf__classifier__min_samples_split': param_range[1:],
        'clf__classifier__n_estimators': [10],
}

mnb_params = {
    'clf__classifier': [MultinomialNB()],
        'clf__classifier__alpha': [0.7, 1.0],
}

## Stack params
parameters = [
#     {**vectorizer_params, **lr_params},
#     {**vectorizer_params, **svc_params},
#     {**vectorizer_params, **rf_params},
    {**vectorizer_params, **mnb_params}
]

br_pipeline = Pipeline([('vectorizer', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', BinaryRelevance()),
                       ]
                      )

# Gridsearch settings
# scoring = make_scorer(f1_score, average='micro') # possible scorings 'f1_micro' 'f1_macro'
# scoring = 'f1_micro'
# scoring = make_scorer(hamming_loss)
# scoring = 'neg_log_loss'
scoring = 'f1_samples'
folds = 3
njobs = -1

br_model = GridSearchCV(br_pipeline, parameters, scoring=scoring, cv=folds, n_jobs=njobs)

In [9]:
%%time
br_model.fit(X_train,y_train)
print(br_model.best_params_, br_model.best_score_)
pd.DataFrame(br_model.cv_results_)

{'clf__classifier': MultinomialNB(alpha=0.7, class_prior=None, fit_prior=True), 'clf__classifier__alpha': 0.7, 'tfidf__use_idf': True} 0.0
Wall time: 2min 33s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__classifier,param_clf__classifier__alpha,param_tfidf__use_idf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,48.060127,4.288809,18.222782,2.329696,"MultinomialNB(alpha=0.7, class_prior=None, fit...",0.7,True,"{'clf__classifier': MultinomialNB(alpha=0.7, c...",0.0,0.0,0.0,0.0,0.0,1
1,58.619017,3.21935,14.699179,1.495346,"MultinomialNB(alpha=0.7, class_prior=None, fit...",0.7,False,"{'clf__classifier': MultinomialNB(alpha=0.7, c...",0.0,0.0,0.0,0.0,0.0,1
2,47.445024,10.882486,12.85093,0.220582,"MultinomialNB(alpha=0.7, class_prior=None, fit...",1.0,True,"{'clf__classifier': MultinomialNB(alpha=0.7, c...",0.0,0.0,0.0,0.0,0.0,1
3,30.636636,0.610798,11.925782,0.590671,"MultinomialNB(alpha=0.7, class_prior=None, fit...",1.0,False,"{'clf__classifier': MultinomialNB(alpha=0.7, c...",0.0,0.0,0.0,0.0,0.0,1


In [10]:
y_pred_prob = br_model.predict_proba(X_test)
t = 0.1 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)
score = f1_score(y_test, y_pred_new, average="micro")
print(f"Binary relevance best model's f1-score {score}")

Binary relevance best model's f1-score 0.01589319771137953


### OneVsRest

In [11]:
# TODO: 
# 1. Check if TfidfTransformer use_idf=False is the same as Countvectorizer? or there are other metrics to suppress
# 2. Use proper scoring function - ideally, predicting relevant labels should be more important than predicting irrelevant ones
# 3. Balanced class labels
# 4. Set better param ranges

# param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range = [1, 2, 3, 4, 5]
param_range_lr = [1.0, 0.5, 0.1]

# Set params, comment out as see fit

vectorizer_params = {
#     'vectorizer__min_df': np.linspace(0.005, 0.05, 5),
#     'vectorizer__ngram_range': [(1, 1), (1, 2)], # This shit blows up your memory
#     'tfidf__norm': ('l1', 'l2'),
    'tfidf__use_idf': [True, False],
}

lr_params = {
    'clf__estimator': [LogisticRegression()],
        'clf__estimator__penalty': ['l1', 'l2'],
        'clf__estimator__C': param_range_lr,
        'clf__estimator__solver': ['liblinear']
}

svc_params = {
    'clf__estimator': [SVC()],
        'clf__estimator__kernel': ['linear', 'rbf'],
        'clf__estimator__C': param_range, # np.logspace(-1, 2, 10),
        'clf__estimator__gamma': ['auto'], # np.logspace(-1, 1, 10)
        'clf__estimator__probability': [True],
}

rf_params = {
    'clf__estimator': [RandomForestClassifier()],
        'clf__estimator__criterion': ['gini', 'entropy'],
        'clf__estimator__min_samples_leaf': param_range,
        'clf__estimator__max_depth': param_range,
        'clf__estimator__min_samples_split': param_range[1:],
        'clf__estimator__n_estimators': [10],
}

mnb_params = {
    'clf__estimator': [MultinomialNB()],
        'clf__estimator__alpha': [0.7, 1.0],
}

## Stack params
parameters = [
#     {**vectorizer_params, **lr_params},
#     {**vectorizer_params, **svc_params},
#     {**vectorizer_params, **rf_params},
    {**vectorizer_params, **mnb_params}
]

ovr_pipeline = Pipeline([('vectorizer', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', OneVsRestClassifier(LogisticRegression())),
                        ]
                       )

# Gridsearch settings
# scoring = make_scorer(f1_score, average='micro') # possible scorings 'f1_micro' 'f1_macro'
scoring = 'f1_micro'
# scoring = make_scorer(hamming_loss) # hamming gives equal weighting to both relevant and irrelevant?
# maybe use precision somewhere
folds = 3
njobs = -1

ovr_model = GridSearchCV(ovr_pipeline, parameters, scoring=scoring, cv=folds, n_jobs=njobs)

In [12]:
%%time
ovr_model.fit(X_train,y_train)
print(ovr_model.best_params_, ovr_model.best_score_)
pd.DataFrame(ovr_model.cv_results_)

{'clf__estimator': MultinomialNB(alpha=0.7, class_prior=None, fit_prior=True), 'clf__estimator__alpha': 0.7, 'tfidf__use_idf': True} 0.0
Wall time: 15.3 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__estimator,param_clf__estimator__alpha,param_tfidf__use_idf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,2.917587,0.115933,1.192995,0.058628,"MultinomialNB(alpha=0.7, class_prior=None, fit...",0.7,True,"{'clf__estimator': MultinomialNB(alpha=0.7, cl...",0.0,0.0,0.0,0.0,0.0,1
1,3.578013,0.586987,1.206009,0.099401,"MultinomialNB(alpha=0.7, class_prior=None, fit...",0.7,False,"{'clf__estimator': MultinomialNB(alpha=0.7, cl...",0.0,0.0,0.0,0.0,0.0,1
2,3.442348,0.663231,1.074932,0.306867,"MultinomialNB(alpha=0.7, class_prior=None, fit...",1.0,True,"{'clf__estimator': MultinomialNB(alpha=0.7, cl...",0.0,0.0,0.0,0.0,0.0,1
3,2.476177,0.138633,0.726943,0.097849,"MultinomialNB(alpha=0.7, class_prior=None, fit...",1.0,False,"{'clf__estimator': MultinomialNB(alpha=0.7, cl...",0.0,0.0,0.0,0.0,0.0,1


In [13]:
y_pred_prob = ovr_model.predict_proba(X_test)
t = 0.1 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)
score = f1_score(y_test, y_pred_new, average="micro")
print(f"One vs Rest best model's f1-score {score}")

One vs Rest best model's f1-score 0.01589319771137953


## Gridsearch best model for each tag

https://stackoverflow.com/questions/38555650/try-multiple-estimator-in-one-grid-search



In [14]:
# param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range = [1, 2, 3, 4, 5]
param_range_lr = [1.0, 0.5, 0.1]

# Set params, comment out as see fit

vectorizer_params = {
#     'vectorizer__min_df': np.linspace(0.005, 0.05, 5),
#     'vectorizer__ngram_range': [(1, 1), (1, 2)], # This shit blows up your memory
#     'tfidf__norm': ('l1', 'l2'),
    'tfidf__use_idf': [True, False],
}

# Add any Binary classification model setting here.
# Also add to general parameters to be passed into pipeline below if want to use new model.

lr_params = {
    'clf': [LogisticRegression()],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': param_range_lr,
        'clf__solver': ['liblinear'],
}

svc_params = {
    'clf': [SVC()],
        'clf__kernel': ['linear', 'rbf'],
        'clf__C': param_range, # np.logspace(-1, 2, 10),
        'clf__gamma': ['auto'], # np.logspace(-1, 1, 10)
        'clf__probability': [True],
}

rf_params = {
    'clf': [RandomForestClassifier()],
        'clf__criterion': ['gini', 'entropy'],
#         'clf__min_samples_leaf': param_range,
#         'clf__max_depth': param_range,
        'clf__min_samples_split': param_range[1:],
        'clf__n_estimators': [15],
}

mnb_params = {
    'clf': [MultinomialNB()],
        'clf__alpha': [0.7, 1.0],
}

## Stack params
parameters = [
#     {**vectorizer_params, **lr_params},
#     {**vectorizer_params, **svc_params},
    {**vectorizer_params, **rf_params},
#     {**vectorizer_params, **mnb_params}
]

per_tag_pipe = Pipeline([('vectorizer', CountVectorizer()), 
                  ('tfidf', TfidfTransformer()), 
                  ('clf', LogisticRegression())])

# scoring = make_scorer(hamming_loss)
scoring = 'f1'
# scoring = 'f1_micro'
# scoring = 'balanced_accuracy'
# scoring = 'precision'
folds = 3
njobs = -1

per_tag_model = GridSearchCV(per_tag_pipe, parameters, scoring=scoring, cv=folds, n_jobs=njobs)

In [16]:
tags = [tag for tag in mlb.inverse_transform(np.ones(shape=(1, 100)))[0]]
print(tags)
tags.index('technology')

['activism', 'adventure', 'africa', 'animals', 'architecture', 'art', 'beauty', 'big problems', 'biodiversity', 'biology', 'biotech', 'brain', 'business', 'children', 'cities', 'climate change', 'cognitive science', 'collaboration', 'communication', 'community', 'computers', 'creativity', 'culture', 'data', 'demo', 'design', 'disease', 'ecology', 'economics', 'education', 'energy', 'engineering', 'entertainment', 'entrepreneur', 'environment', 'evolution', 'exploration', 'family', 'film', 'food', 'future', 'genetics', 'global development', 'global issues', 'government', 'green', 'happiness', 'health', 'health care', 'history', 'humanity', 'humor', 'identity', 'illness', 'inequality', 'innovation', 'internet', 'invention', 'language', 'leadership', 'life', 'live music', 'math', 'media', 'medical research', 'medicine', 'mental health', 'mind', 'motivation', 'music', 'nature', 'neuroscience', 'oceans', 'parenting', 'peace', 'performance', 'personal growth', 'philosophy', 'photography', 'p

93

In [17]:

for index in range(91, 95): #range(len(tags))
    print(f"Processing {tags[index]}")
    per_tag_model.fit(X_train, y_train[:, index])
#     display(pd.DataFrame(per_tag_model.cv_results_))
    t = 0.2 #threshold value
    prediction_prob = per_tag_model.predict_proba(X_test)
    prediction = (prediction_prob[:, 1] >= t).astype(int)
    # save model or model params somewhere
    print(f'tag {index}: {tags[index]} best model {per_tag_model.best_params_}')
    print(f'tag {index}: {tags[index]} counts - predicted: {sum(prediction)}, actual: {sum(y_test[:, index])}')
    print(f'tag {index}: {tags[index]} test f1-score is {f1_score(y_test[:, index], prediction, average="binary")}')
    print(f'tag {index}: {tags[index]} test accuracy is {accuracy_score(y_test[:, index], prediction)}')
    print('--------------------------')

Processing storytelling
tag 91: storytelling best model {'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=15,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), 'clf__criterion': 'gini', 'clf__min_samples_split': 2, 'clf__n_estimators': 15, 'tfidf__use_idf': True}
tag 91: storytelling counts - predicted: 22, actual: 59
tag 91: storytelling test f1-score is 0.07407407407407408
tag 91: storytelling test accuracy is 0.9362244897959183
--------------------------
Processing sustainability
tag 92: sustainability best model {'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',