# Initial Model Building

Date: 01/12/2019

## Loading and Splitting the Data

In [1]:
import pandas as pd
import numpy as np
import os
# Change working directory for locating dataset
os.chdir("..")
# to make this notebook's output stable across runs
np.random.seed(42)

# Libraries for producing images
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Configuration for saving images
PROJECT_ROOT_DIR = "."
FOLDER = "notebooks"
IMAGE_FOLDER = "images"

# Dataset location
from genres_from_plot_outline.config import dataset_loc

### Loading Data

In [2]:
def get_main_dataset(loc):
    """
    IMDb ID's sometimes contain a '0' before the ID, thus 
    we must ensure we load it as a string and not a number.
    
    """
    data = pd.read_csv(loc, usecols = ['imdbId', 'title', 'plot_outline', 'genres'], dtype = {'imdbId': str})
    
    return data

In [3]:
data = get_main_dataset(dataset_loc)

### Splitting Data

To avoid snooping bias - will be done later instead

In [4]:
# from sklearn.model_selection import train_test_split

# train_set, test_set = train_test_split(data_full, test_size=0.2, 
#                                        random_state=42)

# print(len(train_set), "train +", len(test_set), "test")

In [5]:
# Make a copy so as not to harm the original
# Don't wan't to keep indexes
#data = train_set.copy(deep = False)

## Data Preprocessing

### Extracting Genres

In [6]:
def genre_list_extractor(dataset):
    '''
    Creates a column containing the genres as a list
    '''
    data = dataset.copy()
    genres_list = []
    
    for i in data.index:
        genre_string = str(data['genres'][i])
        genre_list = genre_string.replace(',', '').split(" ")
        genres_list.append(genre_list)
        
    data['genre_list'] = genres_list
    return data

In [7]:
data = genre_list_extractor(data)

### Removing Rows without any Genres

In [8]:
def delete_nan_genres(data):
    '''
    Deletes rows where the genre is listed as nan
    '''
    rows_with_nan = []
    for i in data.index:
        if 'nan' in data['genre_list'][i]:
            rows_with_nan.append(i)
    
    data_clean = data.drop(rows_with_nan)
    
    print(len(data)-len(data_clean), 
          "rows with missing genres have been removed")
    
    return data_clean

In [9]:
data = delete_nan_genres(data)

20 rows with missing genres have been removed


### Removing Rows without Plot Outlines

In [10]:
def delete_rows_with_missing_values(data):
    '''
    Deletes rows where any column contains a missing value
    '''
    data_clean = data.dropna()
    fraction_removed = (len(data)-len(data_clean))/len(data)
    
    print(round(fraction_removed*100, 2), 
          "percent of rows have been removed")
    
    return data_clean    

In [11]:
data = delete_rows_with_missing_values(data)

15.37 percent of rows have been removed


### Get Number of Sentences

In [12]:
from nltk.tokenize import sent_tokenize

In [13]:
def sentence_extractor(data):
    '''
    Creates a column with number of sentences
    Don't need specific sentences - too noisy
    '''
    data = data.copy()
    num_sentences_list = []
    
    for i in data.index:
        sentences = sent_tokenize(str(data['plot_outline'][i]).lower())
        num_sentences = len(sentences)
        num_sentences_list.append(num_sentences)
        
    data['plot_outline_num_sentences'] = num_sentences_list
    
    return data    

In [14]:
data = sentence_extractor(data)

### Tokenization

In [15]:
from nltk import wordpunct_tokenize

# Stemming for feature reduction
from nltk.stem import SnowballStemmer

# Remove punctuation
import string

# Remove stopwords
from nltk.corpus import stopwords

In [16]:
def word_tokenize(data):
    '''
    Creates a column with cleaned tokens and the number of tokens
    with and without stemming
    '''
    
    stop_words = set(stopwords.words('english'))
    stem = SnowballStemmer('english')
    
    data = data.copy()
    
    tokens_list = []
    tokens_stemmed_list = []
    num_tokens_list = []
    
    for i in data.index:
        
        # Convert to lowercase and tokenize text
        outline = str(data['plot_outline'][i]).lower() 
        outline_list = wordpunct_tokenize(outline)
        
        # Using list comprehensions to remove stopwords and punctuation
        outline_list_no_stopwords = [word for word in outline_list if not word in stop_words]
        tokens = [word for word in outline_list_no_stopwords if not word in list(string.punctuation)]
        
        # Stemmed tokens 
        tokens_stem = [stem.stem(word) for word in tokens]
        
        num_tokens = len(tokens)
        
        tokens_list.append(tokens)
        tokens_stemmed_list.append(tokens_stem)
        num_tokens_list.append(num_tokens)
        
    data['plot_outline_words'] = tokens_list
    data['plot_outline_stemmed_words'] = tokens_stemmed_list
    data['plot_outline_num_words'] = num_tokens_list
    
    return data 

In [17]:
data = word_tokenize(data)

In [18]:
data.head()

Unnamed: 0,imdbId,title,plot_outline,genres,genre_list,plot_outline_num_sentences,plot_outline_words,plot_outline_stemmed_words,plot_outline_num_words
0,114709,Toy Story,A little boy named Andy loves to be in his roo...,"Animation, Adventure, Comedy, Family, Fantasy","[Animation, Adventure, Comedy, Family, Fantasy]",8,"[little, boy, named, andy, loves, room, playin...","[littl, boy, name, andi, love, room, play, toy...",75
1,113497,Jumanji,After being trapped in a jungle board game for...,"Adventure, Comedy, Family, Fantasy","[Adventure, Comedy, Family, Fantasy]",3,"[trapped, jungle, board, game, 26, years, man,...","[trap, jungl, board, game, 26, year, man, chil...",22
2,113228,Grumpier Old Men,Things don't seem to change much in Wabasha Co...,"Comedy, Romance","[Comedy, Romance]",3,"[things, seem, change, much, wabasha, county, ...","[thing, seem, chang, much, wabasha, counti, ma...",63
3,114885,Waiting to Exhale,This story based on the best selling novel by ...,"Comedy, Drama, Romance","[Comedy, Drama, Romance]",3,"[story, based, best, selling, novel, terry, mc...","[stori, base, best, sell, novel, terri, mcmill...",39
4,113041,Father of the Bride Part II,"In this sequel to ""Father of the Bride"", Georg...","Comedy, Family, Romance","[Comedy, Family, Romance]",3,"[sequel, father, bride, "",, george, banks, mus...","[sequel, father, bride, "",, georg, bank, must,...",46


In [19]:
def words_per_sentence(data):
    '''
    Function to get number of words (after cleaning) per
    sentence
    '''
    data['words_per_sentence'] = data['plot_outline_num_words']/data['plot_outline_num_sentences']
    
    return data

In [20]:
data = words_per_sentence(data)

## Preparing the Data for ML Algorithms

In [27]:
# Vecotrising is typically applied to strings of text, 
# but since the tokenization is already done
# we will join the lists of strings together to be later used
def cleaned_plot_outline(data):
    """
    Function to join cleaned tokenized words into 
    full string again
    """
    
    data = data.copy()
    
    cleaned_outlines = []
    cleaned_outlines_stemmed = []
    
    for i in data.index:
        
        outline_clean = ' '.join(data['plot_outline_words'][i])
        outline_clean_stemmed = ' '.join(data['plot_outline_stemmed_words'][i])
        
        cleaned_outlines.append(outline_clean)
        cleaned_outlines_stemmed.append(outline_clean_stemmed)
        
    data['plot_outline_clean'] = cleaned_outlines
    data['plot_outline_clean_stemmed'] = cleaned_outlines_stemmed 
    
    return data

In [28]:
data = cleaned_plot_outline(data)
data.head()

Unnamed: 0,imdbId,title,plot_outline,genres,genre_list,plot_outline_num_sentences,plot_outline_words,plot_outline_stemmed_words,plot_outline_num_words,words_per_sentence,plot_outline_clean,plot_outline_clean_stemmed
0,114709,Toy Story,A little boy named Andy loves to be in his roo...,"Animation, Adventure, Comedy, Family, Fantasy","[Animation, Adventure, Comedy, Family, Fantasy]",8,"[little, boy, named, andy, loves, room, playin...","[littl, boy, name, andi, love, room, play, toy...",75,9.375,little boy named andy loves room playing toys ...,littl boy name andi love room play toy especi ...
1,113497,Jumanji,After being trapped in a jungle board game for...,"Adventure, Comedy, Family, Fantasy","[Adventure, Comedy, Family, Fantasy]",3,"[trapped, jungle, board, game, 26, years, man,...","[trap, jungl, board, game, 26, year, man, chil...",22,7.333333,trapped jungle board game 26 years man child w...,trap jungl board game 26 year man child win re...
2,113228,Grumpier Old Men,Things don't seem to change much in Wabasha Co...,"Comedy, Romance","[Comedy, Romance]",3,"[things, seem, change, much, wabasha, county, ...","[thing, seem, chang, much, wabasha, counti, ma...",63,21.0,things seem change much wabasha county max joh...,thing seem chang much wabasha counti max john ...
3,114885,Waiting to Exhale,This story based on the best selling novel by ...,"Comedy, Drama, Romance","[Comedy, Drama, Romance]",3,"[story, based, best, selling, novel, terry, mc...","[stori, base, best, sell, novel, terri, mcmill...",39,13.0,story based best selling novel terry mcmillan ...,stori base best sell novel terri mcmillan foll...
4,113041,Father of the Bride Part II,"In this sequel to ""Father of the Bride"", Georg...","Comedy, Family, Romance","[Comedy, Family, Romance]",3,"[sequel, father, bride, "",, george, banks, mus...","[sequel, father, bride, "",, georg, bank, must,...",46,15.333333,"sequel father bride "", george banks must accep...","sequel father bride "", georg bank must accept ..."


In [29]:
# Separate features from target variable
df_x = data[['title', 'plot_outline', 'plot_outline_num_sentences', 
             'plot_outline_words', 'plot_outline_stemmed_words',
            'plot_outline_num_words', 'words_per_sentence', 'plot_outline_clean',
            'plot_outline_clean_stemmed']]
df_y = data[['genre_list']]

> After all data exploration, let’s concentrate now on building the actual model. As it is a multi-label classification, we need to convert our target label into a binarised vector with multiple bits set as 1.
‘MultiLabelBinarizer’ of ‘scikit-learn’ can do that

https://medium.com/towards-artificial-intelligence/multi-label-text-classification-using-scikit-multilearn-case-study-with-stackoverflow-questions-768cb487ad12

Since we want to ensure all genres are encorporated, we will apply MultiLabelBinarizer to the full dataset

## Applying Algorithms

### Example 1

https://www.kaggle.com/roccoli/multi-label-classification-with-sklearn

OneVsRest with different classifiers
use OneVsRest strategy to have one classifier for each class/label

In [24]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
y_encoded = multilabel_binarizer.fit_transform(df_y['genre_list'])

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(df_x['plot_outline_clean'])

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

Define our model training pipeline stuff here now

Best try and use gensim

In [32]:
from sklearn.model_selection import train_test_split

# just plot outline - no other features
train_x, test_x, train_y, test_y = train_test_split(X_tfidf, y_encoded)

In [35]:
# From https://www.kaggle.com/roccoli/multi-label-classification-with-sklearn

# For people reading in 2019, now you can use sklearn.metrics.hamming_loss:

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

def print_score(y_pred, clf):
    print("Clf: ", clf.__class__.__name__)
    print("Hamming loss: {}".format(hamming_loss(y_pred, test_y)))
    print("Hamming score: {}".format(hamming_score(y_pred, test_y)))
    print("---")    

In [36]:
nb_clf = MultinomialNB()
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None)
lr = LogisticRegression()
mn = MultinomialNB()

for classifier in [nb_clf, sgd, lr, mn]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    print_score(y_pred, classifier)

Clf:  MultinomialNB
Hamming loss: 0.08981512480005951
Hamming score: 0.24793181467938746
---
Clf:  SGDClassifier
Hamming loss: 0.09387593150566033
Hamming score: 0.2503737093542918
---
Clf:  LogisticRegression
Hamming loss: 0.07603318082059295
Hamming score: 0.36297733299323526
---
Clf:  MultinomialNB
Hamming loss: 0.08981512480005951
Hamming score: 0.24793181467938746
---


### Example 2

https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments

In [60]:
# Separate features from target variable
df_x = data[['title', 'plot_outline', 'plot_outline_num_sentences', 
             'plot_outline_words', 'plot_outline_stemmed_words',
            'plot_outline_num_words', 'words_per_sentence', 'plot_outline_clean',
            'plot_outline_clean_stemmed']]
df_y = data[['genre_list']]

In [61]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
y_encoded = multilabel_binarizer.fit_transform(df_y['genre_list'])
y = pd.DataFrame(y_encoded, columns=multilabel_binarizer.classes_)

In [62]:
# split dataset into training and validation set
train_x, test_x, train_y, test_y = train_test_split(df_x['plot_outline_clean'], y, test_size=0.2)

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Instantiate the vectorizer
word_vectorizer = TfidfVectorizer(
    #stop_words='english',
    #sublinear_tf=True,
    #strip_accents='unicode',
    #analyzer='word',
    #token_pattern=r'\w{2,}',  #vectorize 2-character words or more
    #ngram_range=(1, 1),
    ##max_features=30000
)

# fit and transform on it the training features
word_vectorizer.fit(train_x)
train_x_word_features = word_vectorizer.transform(train_x)

#transform the test features to sparse matrix
# Dont fit to test - just transform!!
test_features = word_vectorizer.transform(test_x)

# transform the holdout text for submission at the end
# holdout_text = holdout['comment_text']
# holdout_word_features = word_vectorizer.transform(holdout_text)

**Classify**

Run a Logistic regression on each label separately

In [66]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

losses = []
auc = []

for class_name in class_names:
    #call the labels one column at a time so we can run the classifier on them
    train_target = train_y[class_name]
    test_target = test_y[class_name]
    classifier = LogisticRegression(solver='sag', C=10)

    cv_loss = np.mean(cross_val_score(classifier, train_x_word_features, train_target, cv=5, scoring='neg_log_loss'))
    losses.append(cv_loss)
    print('CV Log_loss score for class {} is {}'.format(class_name, cv_loss))

    cv_score = np.mean(cross_val_score(classifier, train_x_word_features, train_target, cv=5, scoring='accuracy'))
    print('CV Accuracy score for class {} is {}'.format(class_name, cv_score))
    
    classifier.fit(train_x_word_features, train_target)
    y_pred = classifier.predict(test_features)
    y_pred_prob = classifier.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred_prob)
    auc.append(auc_score)
    print("CV ROC_AUC score {}\n".format(auc_score))
    
    print(confusion_matrix(test_target, y_pred))
    print(classification_report(test_target, y_pred))

print('Total average CV Log_loss score is {}'.format(np.mean(losses)))
print('Total average CV ROC_AUC score is {}'.format(np.mean(auc)))

CV Log_loss score for class Action is -0.28310804593243993
CV Accuracy score for class Action is 0.8828611610809493
CV ROC_AUC score 0.886959743146456

[[3915  134]
 [ 393  337]]
             precision    recall  f1-score   support

          0       0.91      0.97      0.94      4049
          1       0.72      0.46      0.56       730

avg / total       0.88      0.89      0.88      4779





CV Log_loss score for class Adult is -0.0023963046799023243




CV Accuracy score for class Adult is 0.9997384116325859
CV ROC_AUC score 0.9497697781498535

[[4778    0]
 [   1    0]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      4778
          1       0.00      0.00      0.00         1

avg / total       1.00      1.00      1.00      4779



  'precision', 'predicted', average, warn_for)


CV Log_loss score for class Adventure is -0.2572911639922875
CV Accuracy score for class Adventure is 0.8999689422842518
CV ROC_AUC score 0.8716391182909599

[[4111  120]
 [ 359  189]]
             precision    recall  f1-score   support

          0       0.92      0.97      0.94      4231
          1       0.61      0.34      0.44       548

avg / total       0.88      0.90      0.89      4779

CV Log_loss score for class Animation is -0.11513584259079881
CV Accuracy score for class Animation is 0.961703590609635
CV ROC_AUC score 0.9116233340616124

[[4565   12]
 [ 171   31]]
             precision    recall  f1-score   support

          0       0.96      1.00      0.98      4577
          1       0.72      0.15      0.25       202

avg / total       0.95      0.96      0.95      4779

CV Log_loss score for class Biography is -0.16121956735678583
CV Accuracy score for class Biography is 0.9474208202768419
CV ROC_AUC score 0.8621735706713582

[[4486   16]
 [ 247   30]]
             p

  'precision', 'predicted', average, warn_for)


CV Log_loss score for class History is -0.14533650257688682
CV Accuracy score for class History is 0.9521292740118643
CV ROC_AUC score 0.8944772228320528

[[4536   19]
 [ 193   31]]
             precision    recall  f1-score   support

          0       0.96      1.00      0.98      4555
          1       0.62      0.14      0.23       224

avg / total       0.94      0.96      0.94      4779

CV Log_loss score for class Horror is -0.1866458912693896
CV Accuracy score for class Horror is 0.9281154591456897
CV ROC_AUC score 0.9235865382019409

[[4186   51]
 [ 291  251]]
             precision    recall  f1-score   support

          0       0.94      0.99      0.96      4237
          1       0.83      0.46      0.59       542

avg / total       0.92      0.93      0.92      4779

CV Log_loss score for class Music is -0.10602490304593226
CV Accuracy score for class Music is 0.966830700860738
CV ROC_AUC score 0.8934720924061799

[[4572   25]
 [ 130   52]]
             precision    recall

  'precision', 'predicted', average, warn_for)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

### Example 3 - Use ML Book

In [67]:
# Separate features from target variable
df_x = data[['title', 'plot_outline', 'plot_outline_num_sentences', 
             'plot_outline_words', 'plot_outline_stemmed_words',
            'plot_outline_num_words', 'words_per_sentence', 'plot_outline_clean',
            'plot_outline_clean_stemmed']]
df_y = data[['genre_list']]

In [69]:
# Split data and shuffle for cross validation
from sklearn.model_selection import train_test_split
#import numpy as np

# split all features
train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size = 0.2)

# Shuffle training set
#shuffle_index = np.random.permutation(60000)
#train_x, y_train = train_x[shuffle_index], y_train[shuffle_index]

In [72]:
def variable_parser(data, stemmed = False):
    """
    Function to select and split certain attributes
    """
    data_outline_attributes = data[['plot_outline_num_sentences', 
                                  'plot_outline_num_words',
                                  'words_per_sentence']]
    if stemmed:
        data_outline = data['plot_outline_clean_stemmed']
    else:
        data_outline = data['plot_outline_clean']
        
    return data_outline_attributes, data_outline

In [76]:
df_x_attributes, df_x_outline = variable_parser(df_x)

train_x_attributes, train_x_outline = variable_parser(train_x)
test_x_attributes, test_x_outline = variable_parser(test_x)

In [75]:
# Transform target variable
from sklearn.preprocessing import MultiLabelBinarizer

# Fit on whole data set - so that genres aren't missing
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_y['genre_list'])

# transform on both training and test
train_y_encoded = multilabel_binarizer.transform(train_y['genre_list'])
test_y_encoded = multilabel_binarizer.transform(test_y['genre_list'])

Fitting the tfidf on the whole dataset may introduce bias towards the test set

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the vectorizer, use default for now
word_vectorizer = TfidfVectorizer()

# fit and transform on it the training features
word_vectorizer.fit(df_x_outline)
train_x_word_features = word_vectorizer.transform(train_x_outline)

#transform the test features to sparse matrix
test_x_word_features = word_vectorizer.transform(test_x_outline)

**Want hamming_loss in cross validation**

[Here](https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values) lists the scoring options. But there's also a section *Defining your scoring strategy from metric functions*

```python
>>> from sklearn.metrics import fbeta_score, make_scorer
>>> ftwo_scorer = make_scorer(fbeta_score, beta=2)
>>> from sklearn.model_selection import GridSearchCV
>>> from sklearn.svm import LinearSVC
>>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]},
...                     scoring=ftwo_scorer, cv=5)
```

This issue was raised [here](https://stackoverflow.com/questions/44081222/hamming-loss-not-support-in-cross-val-score)

>@iromise In case of losses, scores just return the negative of actual score. That is the intended behaviour, when we want to compare two scores. Because in cases of accuracy or f1_score, the higher score is better, but in case of losses, lower score is better. To handle them both in same way, it returns the negative. Read the section 3.3.1.1 carefully. Just multiply by -1 to return your actual hamming loss values. – Vivek Kumar May 20

Using OneVsRestClassifier classifier here
- Could I try OneVsOne here either

[On the other hand, LinearSVC implements “one-vs-the-rest” multi-class strategy, thus training n_class models. If there are only two classes, only one model is trained:](https://scikit-learn.org/stable/modules/svm.html)

After a promissing model is chosen, we will tune paramaters in tfidf and for stemmed/not stemmed

In [184]:
def apply_and_record_model(clf, train_x_word_features, train_x_attributes, 
                           train_y_encoded, include_text_attributes = False):
    """
    Function to train model and record results.
    No parameter tuning for tfidf and not using stemmed outline
    """
    
    # Empty list to store info
    model_info = []
    
    #---- Record name of model -----#
    
    print("Classifier: ", clf.__class__.__name__)
    model_info.append(str(clf.__class__.__name__))
    print("---") 
    
    #----- Include or not include additional text_attributes -----#
    
    if include_text_attributes:
        
        # Concatenating with sparse TFIDF matrix that is pre fit already
        train_x_final = hstack((train_x_word_features, train_x_attributes))
        
    else:
        
        train_x_final = train_x_word_features
    
    #----- Training classifier -----#
    
    print("Training...")
    start_time = time.time()
    clf = OneVsRestClassifier(clf)
    
    # Getting scores using cross validation with hamming_loss as scorer
    cv_scores = (-1) * cross_val_score(clf, train_x_final, train_y_encoded, cv = 5, 
                                       scoring = make_scorer(hamming_loss, greater_is_better = False))
    cv_scores = np.array(cv_scores)
    
    training_time = time.time() - start_time
    print("Training time: ", round(training_time/60, 2), "minutes.")
    model_info.append(round(training_time/60, 2))
    
    #---- Recording CV results -----#
    
    hamming_loss_avg_on_val = np.mean(cv_scores)
    hamming_loss_std_dev_on_val = np.std(cv_scores)
    
    model_info.append(hamming_loss_avg_on_val)
    model_info.append(hamming_loss_std_dev_on_val)
    
    print("Average Hamming Loss: ", hamming_loss_avg_on_val)
    print("----------------------")   
    
    return pd.Series(model_info, index = ['classifier', 'training_time_mins',
                                    'hamming_loss_avg_on_val', 
                                    'hamming_loss_std_dev_on_val'])

## Looping Over Combinations

In [185]:
def loop_and_record_model_combinations(df_x, train_x, train_y_encoded,
                                       vectorizer_list, classifier_list):
    """
    Function to take combinations of,
    
    parameters and apply models to each one
    """
    
    start_time = time.time()
    stemmed_record = []
    vectorizer_record = []
    text_attributes_used_record = []
    
    all_model_info = pd.DataFrame(columns=['classifier', 'training_time_mins',
                                    'hamming_loss_avg_on_val', 
                                    'hamming_loss_std_dev_on_val'])

    #----- Loop over stemmed/not stemmed outlines -----#
    for stemmed in [False, True]:

        print("Stemmed: ", stemmed)
        df_x_attributes, df_x_outline = variable_parser(df_x, stemmed = stemmed)
        train_x_attributes, train_x_outline = variable_parser(train_x, stemmed = stemmed)

        #---- Loop over vectorizer -----#
        for vectorizer in vectorizer_list:

            print("Vectorizer: ", vectorizer, ". Fitting...")
            # Fit on whole dataset but transform on training set
            vectorizer.fit(df_x_outline)
            train_x_word_features = vectorizer.transform(train_x_outline)

            #---- Loop over classifiers -----#
            for classifier in classifier_list:

                for include_text_attributes in [False, True]:

                    print("Including text attributes: ", include_text_attributes)
                    print("---")
                    model_info = apply_and_record_model(classifier, train_x_word_features, 
                                                        train_x_attributes, train_y_encoded,
                                                        include_text_attributes = include_text_attributes)
                    all_model_info = all_model_info.append(model_info, ignore_index = True)

                    stemmed_record.append(str(stemmed))
                    vectorizer_record.append(vectorizer)
                    text_attributes_used_record.append(str(include_text_attributes))


    all_model_info['vectorizer_record'] = vectorizer_record
    all_model_info['stemmed_record'] = stemmed_record
    all_model_info['text_attributes_used_record'] = text_attributes_used_record
    
    run_time = time.time() - start_time
    print("Run time: ", round(run_time/60, 2), "minutes.")
    
    return all_model_info

In [186]:
import time
import numpy as np
from scipy.sparse import hstack

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

# from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import make_scorer, hamming_loss

# Use default parameters
nb = MultinomialNB()
sgd = SGDClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
# rf = RandomForestClassifier(random_state=42)
svc = LinearSVC(random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer

word_vectorizer = TfidfVectorizer()
word_vectorizer_uni_bi = TfidfVectorizer(ngram_range = (1, 2))
word_vectorizer_bi = TfidfVectorizer(ngram_range = (2, 2))

In [None]:
vectorizer_list = [word_vectorizer, word_vectorizer_uni_bi, word_vectorizer_bi]  
classifier_list = [nb, sgd, lr, svc]

model_and_combination_df = loop_and_record_model_combinations(df_x, train_x, train_y_encoded, 
                                                              vectorizer_list, classifier_list)

Stemmed:  False
Vectorizer:  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) . Fitting...
Including text attributes:  False
---
Classifier:  MultinomialNB
---
Training...


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Training time:  0.14 minutes.
Average Hamming Loss:  0.09034275936705813
----------------------
Including text attributes:  True
---
Classifier:  MultinomialNB
---
Training...


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Training time:  0.15 minutes.
Average Hamming Loss:  0.09299668206953914
----------------------
Including text attributes:  False
---
Classifier:  SGDClassifier
---
Training...




  str(classes[c]))


  str(classes[c]))
  str(classes[c]))




  str(classes[c]))


  str(classes[c]))




  str(classes[c]))


Training time:  0.28 minutes.
Average Hamming Loss:  0.07355947459398215
----------------------
Including text attributes:  True
---
Classifier:  SGDClassifier
---
Training...




  str(classes[c]))


  str(classes[c]))
  str(classes[c]))




  str(classes[c]))


  str(classes[c]))




  str(classes[c]))


Training time:  0.34 minutes.
Average Hamming Loss:  0.1580337697757414
----------------------
Including text attributes:  False
---
Classifier:  LogisticRegression
---
Training...


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Training time:  1.49 minutes.
Average Hamming Loss:  0.07794752698017682
----------------------
Including text attributes:  True
---
Classifier:  LogisticRegression
---
Training...


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Training time:  6.17 minutes.
Average Hamming Loss:  0.07790825376243933
----------------------
Including text attributes:  False
---
Classifier:  LinearSVC
---
Training...


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Training time:  1.77 minutes.
Average Hamming Loss:  0.07245986372209597
----------------------
Including text attributes:  True
---
Classifier:  LinearSVC
---
Training...


In [179]:
#model_and_combination_df.to_csv(r"C:\Users\User\Documents\ITB Year 2\Text Analytics and Web Content Mining\Assignments\Assignment 2\versioned_code\Genres-from-Plot-Outline\data\model_and_combination.csv")

In [136]:
for x in [False, True]:
    if x:
        print("show second")
    else:
        print("show first")

show first
show second


In [177]:
model_and_combination_df

Unnamed: 0,classifier,training_time_mins,hamming_loss_avg_on_val,hamming_loss_std_dev_on_val,vectorizer_record,stemmed_record,text_attributes_used_record
0,MultinomialNB,0.93,0.092838,0.000709,"TfidfVectorizer(analyzer='word', binary=False,...",False,False
1,MultinomialNB,1.15,0.094553,0.000673,"TfidfVectorizer(analyzer='word', binary=False,...",False,True
2,SGDClassifier,1.11,0.080031,0.00074,"TfidfVectorizer(analyzer='word', binary=False,...",False,False
3,SGDClassifier,1.1,0.15475,0.025076,"TfidfVectorizer(analyzer='word', binary=False,...",False,True
4,LogisticRegression,11.27,0.085056,0.000706,"TfidfVectorizer(analyzer='word', binary=False,...",False,False
5,LogisticRegression,44.52,0.085103,0.000744,"TfidfVectorizer(analyzer='word', binary=False,...",False,True
6,LinearSVC,3.51,0.074798,0.000977,"TfidfVectorizer(analyzer='word', binary=False,...",False,False
7,LinearSVC,139.35,0.097151,0.005942,"TfidfVectorizer(analyzer='word', binary=False,...",False,True
8,MultinomialNB,0.8,0.092838,0.000709,"TfidfVectorizer(analyzer='word', binary=False,...",False,False
9,MultinomialNB,0.82,0.094553,0.000673,"TfidfVectorizer(analyzer='word', binary=False,...",False,True
