# Bring data

In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV , cross_val_score

In [166]:
# Load the transcription/target table; the CSV's 'Unnamed: 0' column becomes the index.
path = '../../../transcriptions/targets/transcriptions_targets.csv'
data = pd.read_csv(path).set_index('Unnamed: 0')
data.head()



Unnamed: 0_level_0,PHQ8_Binary,text
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
302.0,0.0,i'm fine how about yourself . i'm from los an...
307.0,0.0,<laughter> . um moscow . um my family moved to...
331.0,0.0,yes . okay . connecticut . um . to be an ac...
335.0,1.0,yes . i'm okay . uh i'm from here originally ...
346.0,1.0,yes . i'm okay . here in los angeles . ther...


In [167]:
# Non-null count per column among the rows where PHQ8_Binary == 1.
data.loc[data['PHQ8_Binary'] == 1].count()

PHQ8_Binary    42
text           42
dtype: int64

In [168]:
# Column names are hard-coded here; replace the literals with input(...) calls
# to choose the text and target columns interactively.
text_col = 'text'
target_col = 'PHQ8_Binary'

X = data[text_col]    # transcript text used as the model input
y = data[target_col]  # PHQ8_Binary column used as the target

# preprocessing

In [169]:

import pandas as pd
import re
import string
import nltk

In [170]:
# Contraction -> expansion table. Keys and values are lower-cased once, at
# definition time, instead of on every call.
_CONTRACTIONS = {
    contraction.lower(): expansion.lower()
    for contraction, expansion in {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "I'd": "I would",
        "I'd've": "I would have",
        "I'll": "I will",
        "I'll've": "I will have",
        "I'm": "I am",
        "I've": "I have",
        "isn't": "is not",
        "it'd": "it had",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there had",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we had",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'alls": "you alls",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you had",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have",
    }.items()
}

# Regex alternation picks the FIRST alternative that matches, not the longest.
# Listing the longest keys first keeps "can't" from shadowing "can't've"
# (which previously produced "cannot've"). Keys are escaped so they are always
# matched literally.
_CONTRACTIONS_RE = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True)
    )
)


def expand_contractions(text):
    """Replace English contractions in ``text`` with their expanded phrase.

    Matching is case-sensitive against lower-cased keys, so the text should be
    lower-cased first (``clean_text`` applies ``to_lower`` before this step).
    Expansions are lower-case as well.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        ``text`` with every known contraction replaced; compound contractions
        such as "can't've" are expanded as a whole ("cannot have").
    """
    return _CONTRACTIONS_RE.sub(
        lambda match: _CONTRACTIONS[match.group(0)], text
    )

def remove_numbers(text):
    """Return ``text`` without the characters for which ``str.isdigit`` is true."""
    kept_chars = []
    for char in text:
        if char.isdigit():
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

def to_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Replace each character of ``string.punctuation`` in ``text`` with a space."""
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return text.translate(punctuation_to_space)

# Extra stop words added on top of NLTK's English list: web/Twitter tokens,
# transcript fillers ('uh', 'um', ...) and a long general-purpose list.
# Duplicates are harmless because everything ends up in a set.
_EXTRA_STOP_WORDS = ['https', 'com', 'http', 'twitter', 'm', 'www',
'uh' ,'um',  'mm', 'mhm' , 'mmm', 'er' , 'oh',"0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "A", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "after", "afterwards", "ag", "again", "against", "ah", "ain", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appreciate", "approximately", "ar", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "B", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "been", "before", "beforehand", "beginnings", "behind", "below", "beside", "besides", "best", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "C", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "ci", "cit", "cj", "cl", "clearly", "cm", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "could", "couldn", "couldnt", "course", "cp", "cq", "cr", "cry", "cs", "ct", "cu", "cv", "cx", "cy", "cz", "d", "D", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "dj", "dk", "dl", "do", "does", "doesn", "doing", "don", "done", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "E", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "eg", "ei", "eight", "eighty", "either", "ej", "el",
"eleven", "else", "elsewhere", "em", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "F", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "G", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "H", "h2", "h3", "had", "hadn", "happens", "hardly", "has", "hasn", "hasnt", "have", "haven", "having", "he", "hed", "hello", "help", "hence", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hes", "hh", "hi", "hid", "hither", "hj", "ho", "hopefully", "how", "howbeit", "however", "hr", "hs", "http", "hu", "hundred", "hy", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "im", "immediately", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "inward", "io", "ip", "iq", "ir", "is", "isn", "it", "itd", "its", "iv", "ix", "iy", "iz", "j", "J", "jj", "jr", "js", "jt", "ju", "just", "k", "K", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "ko", "l", "L", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "M", "m2", "ma", "made", "mainly", "make", "makes", "many", "may",
"maybe", "me", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "my", "n", "N", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "neither", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "O", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "otherwise", "ou", "ought", "our", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "P", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "pp", "pq", "pr", "predominantly", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "Q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "R", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "S", "s2", "sa", "said", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "seem", "seemed", "seeming", "seems", "seen", "sent", "seven", "several", "sf", "shall", "shan", "shed", "shes", "show",
"showed", "shown", "showns", "shows", "si", "side", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somehow", "somethan", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "sz", "t", "T", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "thereof", "therere", "theres", "thereto", "thereupon", "these", "they", "theyd", "theyre", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "U", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "used", "useful", "usefully", "usefulness", "using", "usually", "ut", "v", "V", "va", "various", "vd", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "W", "wa", "was", "wasn", "wasnt", "way", "we", "wed", "welcome", "well", "well-b", "went", "were", "weren", "werent", "what", "whatever", "whats", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "whom", "whomever", "whos",
"whose", "why", "wi", "widely", "with", "within", "without", "wo", "won", "wonder", "wont", "would", "wouldn", "wouldnt", "www", "x", "X", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "Y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "your", "youre", "yours", "yr", "ys", "yt", "z", "Z", "zero", "zi", "zz"]

# Combined stop-word set, built lazily on first use so that defining this cell
# does not touch the NLTK corpus.
_STOP_WORDS_CACHE = None


def _stop_word_set():
    """Return the NLTK English stop words plus ``_EXTRA_STOP_WORDS`` as a frozenset."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(
            nltk.corpus.stopwords.words('english')
        ).union(_EXTRA_STOP_WORDS)
    return _STOP_WORDS_CACHE


def remove_stop_words(text):
    """Remove stop words from ``text``.

    The text is tokenized with ``nltk.word_tokenize``; tokens found in the
    combined stop-word set are dropped and the rest are joined with single
    spaces. Matching is case-sensitive.

    The stop-word set is built once and reused. Previously the NLTK list was
    re-read and the whole set rebuilt for every document passed through
    ``Series.apply``.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The remaining tokens separated by single spaces.
    """
    stop_words = _stop_word_set()
    return ' '.join(
        token for token in nltk.word_tokenize(text) if token not in stop_words
    )


def remove_context_symbol(text):
    """Delete every ``<...>`` annotation (for example ``<laughter>``) from ``text``."""
    tag_pattern = re.compile('<[^>]+>')
    return tag_pattern.sub('', text)








In [171]:
def clean_text(texts_sequence):
    """Run the preprocessing steps over a sequence of texts and return the result.

    Each step is applied element-wise with ``.apply``, in this order:
    lower-casing, contraction expansion, punctuation removal, digit removal,
    stop-word removal.
    """
    steps = (
        to_lower,
        expand_contractions,
        remove_punctuation,
        remove_numbers,
        remove_stop_words,
    )
    cleaned = texts_sequence
    for step in steps:
        cleaned = cleaned.apply(step)
    return cleaned


In [172]:
# Strip <...> annotations before cleaning: remove_punctuation would otherwise
# only replace the angle brackets and leave the annotation word in the text.
X = clean_text(X.apply(remove_context_symbol))
X


Unnamed: 0
302.0    fine angeles california family friends mixture...
307.0    moscow family moved moved eventually college l...
331.0    connecticut actor moved san francisco moved at...
335.0    originally angeles weather family moved lot cu...
346.0    angeles lot things love beach love love sunny ...
                               ...                        
485.0    bad tired saint louis missouri yep months ago ...
486.0    feel great saint louis missouri born raised co...
487.0    fine detroit michigan moved family years ago y...
488.0    fine san fernando valley culture love museums ...
491.0    huh overwhelmed funeral attend tomorrow doctor...
Name: text, Length: 142, dtype: object

In [173]:
# Raw transcripts, for comparison with the cleaned X shown above.
data['text']

Unnamed: 0
302.0    i'm fine how about yourself  . i'm from los an...
307.0    <laughter> . um moscow . um my family moved to...
331.0    yes  . okay  . connecticut . um  . to be an ac...
335.0    yes . i'm okay  . uh i'm from here originally ...
346.0    yes  . i'm okay  . here in los angeles  . ther...
                               ...                        
485.0    <synch> . yes . i'm not bad i'm a little tired...
486.0    <synch> . yes . i'm feel great . i am from sai...
487.0    <synch> . yes . i'm fine thank you . detroit m...
488.0    <synch> . yes . fine . oh san fernando valley ...
491.0    <synch> . yes . huh overwhelmed . i have a fun...
Name: text, Length: 142, dtype: object

In [174]:
# Keep the cleaned text next to the raw text in the same frame (assignment aligns on the index).
data['clean_text'] = X

In [177]:
"""data.to_csv('../../../transcriptions/targets/transcriptions_targets_clean.csv')"""

# naive bayes model

## grid search

In [186]:
# Default-configured steps; the grid search overrides ngram_range and alpha.
tfid, nb = TfidfVectorizer(), MultinomialNB()

# Cast every document to str (presumably a guard against non-string entries).
X = X.apply(str)
X.values

array(['fine angeles california family friends mixture people lot things early childhood education love love working kids seeing smile guess kid happy playful guess transferred adult life working kids school teacher capacity yeah passion fun close close sisters brother mom dad brothers sisters intertwined close lot friends interact different things curse couple days ago nephew kinda mad set time spend blew yeah know mother passed years day better time easier lot lot things daily basis anything driving restaurant likes watching different anything trigger memory mom yesterday good times stay place long ahead officially depressed mean mom passed kinda rough job situation stable california lot yeah kinda sort kinda reserved pretty easy grumpy irritable fidgety big ball nerves mean guess yeah travel meeting people different locations scenery different opportunities something location memphis weeks ago see family friends pretty nice chance see something change change scenery wow stick plan v

In [120]:
# Step names are the prefixes of the parameter-grid keys
# ('TfidfVectorizer__...', 'MultinomialNB()__...'), so they must stay in sync.
pipe = Pipeline(steps=[
    ('TfidfVectorizer', tfid),
    ('MultinomialNB()', nb),
])

In [94]:
# List every tunable parameter name of the pipeline (these are the keys usable in the grid).
pipe.get_params()

{'memory': None,
 'steps': [('TfidfVectorizer',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=1.0, max_features=None,
                   min_df=1, ngram_range=(4, 5), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('MultinomialNB()',
   MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'verbose': False,
 'TfidfVectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=None,
                 min_df=1, ngram_range=(4, 5), norm='l2', preprocessor=None,
              

In [121]:
# 6 n-gram ranges x 3 smoothing values = 18 candidates per search.
pipe_grid = {
    'TfidfVectorizer__ngram_range': [
        (1, 2),
        (1, 1),
        (2, 2),
        (2, 3),
        (3, 4),
        (4, 5),
    ],
    'MultinomialNB()__alpha': [0.1, 0.5, 1.0],
}

In [122]:
# Grid search over pipe_grid, selecting the candidate with the best recall (5-fold CV).
search_recall = GridSearchCV(
    pipe,
    pipe_grid,
    scoring='recall',
    n_jobs=-1,
    cv=5,
    verbose=1,
)

In [123]:
search_recall.fit(X, y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    5.8s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('TfidfVectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                               

In [124]:
# Report the winning parameters and their mean cross-validated recall.
print('the best params for recall: ', search_recall.best_params_)
print('the best recall score: ', search_recall.best_score_)

the best params for recall:  {'MultinomialNB()__alpha': 0.1, 'TfidfVectorizer__ngram_range': (1, 2)}
the best recall score:  0.0


In [125]:
# Same grid, selecting the candidate with the best accuracy (5-fold CV).
search_accuracy = GridSearchCV(
    pipe,
    pipe_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=1,
)

In [126]:
search_accuracy.fit(X, y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  79 out of  90 | elapsed:    4.5s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    5.1s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('TfidfVectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                               

In [127]:
# Report the winning parameters and their mean cross-validated accuracy.
print('the best params for accuracy: ', search_accuracy.best_params_)
print('the best accuracy score: ', search_accuracy.best_score_)

the best params for accuracy:  {'MultinomialNB()__alpha': 0.1, 'TfidfVectorizer__ngram_range': (1, 2)}
the best accuracy score:  0.704433497536946


In [128]:
# Same grid, selecting the candidate with the best precision.
# NOTE(review): cv=3 here, while the recall and accuracy searches use cv=5 -
# confirm the difference is intentional.
search_precision = GridSearchCV(
    pipe,
    pipe_grid,
    scoring='precision',
    n_jobs=-1,
    cv=3,
    verbose=1,
)

In [129]:
search_precision.fit(X, y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    2.8s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('TfidfVectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                               

In [130]:
# Report the winning parameters and their mean cross-validated precision.
print('the best params for precision: ', search_precision.best_params_)
print('the best precision score: ', search_precision.best_score_)

the best params for precision:  {'MultinomialNB()__alpha': 0.1, 'TfidfVectorizer__ngram_range': (1, 2)}
the best precision score:  0.0


In [131]:
# Same grid, selecting the candidate with the best F1.
# NOTE(review): cv=3 here, while the recall and accuracy searches use cv=5 -
# confirm the difference is intentional.
search_f1 = GridSearchCV(
    pipe,
    pipe_grid,
    scoring='f1',
    n_jobs=-1,
    cv=3,
    verbose=1,
)

In [132]:
search_f1.fit(X, y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    2.7s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('TfidfVectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                               

In [133]:
# Report the winning parameters and their mean cross-validated F1.
print('the best params for f1: ', search_f1.best_params_)
print('the best f1 score: ', search_f1.best_score_)

the best params for f1:  {'MultinomialNB()__alpha': 0.1, 'TfidfVectorizer__ngram_range': (1, 2)}
the best f1 score:  0.0


Para precision cambio los best params: hago una validación cruzada (`cross_val_score`) de precision usando los best params obtenidos con los otros scores.

In [134]:
# Take the pipeline refit with the accuracy search's best parameters and
# cross-validate it on precision instead (5 folds).
best_accuracy = search_accuracy.best_estimator_

precision_cv = cross_val_score(
    best_accuracy,
    X,
    y,
    scoring='precision',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


In [135]:
# Mean precision over the cross-validation folds for the accuracy-selected parameters.
print(
    'precision for params ',
    search_accuracy.best_params_,
    ' is: ',
    np.mean(precision_cv),
)

precision for params  {'MultinomialNB()__alpha': 0.1, 'TfidfVectorizer__ngram_range': (1, 2)}  is:  0.0


# testeo y analisis del modelo

In [None]:
# Parameter note kept as a bare string literal; it has no effect on the model.
# NOTE(review): it says alpha=1.0, while the searches above report alpha=0.1
# and the next cell uses 0.1 - confirm which value is intended.
'''{'MultinomialNB()__alpha': 1.0, 'TfidfVectorizer__ngram_range': (1, 2)}'''

In [178]:
# Standalone vectorizer and classifier with the parameters the grid searches
# reported as best (ngram_range=(1, 2), alpha=0.1), fitted outside the pipeline
# so the vocabulary and coefficients can be inspected directly.
tfid3 = TfidfVectorizer(ngram_range=(1, 2))
nb3 = MultinomialNB(alpha=0.1)

In [179]:
# Learn the vocabulary and IDF weights from all cleaned documents (no train/test split here).
tfid3.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [180]:
# TF-IDF matrix for the same documents the vectorizer was fitted on.
vector3 = tfid3.transform(X)

In [181]:
# Full learned vocabulary (unigrams and bigrams); the output is very long.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
tfid3.get_feature_names()

['abandoned',
 'abandoned buildings',
 'abandoned pretty',
 'abandoning',
 'abandoning family',
 'abbey',
 'abbey premiere',
 'abilities',
 'abilities hard',
 'abilities music',
 'abilities people',
 'ability',
 'ability deal',
 'ability decision',
 'ability knack',
 'ability multitask',
 'ability situations',
 'ability survive',
 'ability work',
 'abnormal',
 'abnormal lotta',
 'abort',
 'abort choosing',
 'abort chose',
 'abortion',
 'abortion point',
 'abroad',
 'abroad brazil',
 'abroad madrid',
 'abroad program',
 'absent',
 'absent minded',
 'absolute',
 'absolute hardest',
 'absolutely',
 'absolutely amazing',
 'absolutely awful',
 'absolutely breathtakingly',
 'absolutely fantastic',
 'absolutely fell',
 'absolutely know',
 'absolutely love',
 'absolutely real',
 'absolutely yeah',
 'absorb',
 'absorb something',
 'absorbed',
 'absorbed small',
 'absorption',
 'absorption seemingly',
 'absurd',
 'absurd mistake',
 'absurd thing',
 'abundance',
 'abundance life',
 'abuse',
 'abu

In [182]:
# Fit the classifier on the full TF-IDF matrix; this model is for inspection, not evaluation.
nb3.fit(vector3, y)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [183]:
# Per-feature log-probabilities for the second class (classes_[1]).
# For a two-class MultinomialNB this is the array `coef_` used to return;
# `coef_` is deprecated in newer scikit-learn, while `feature_log_prob_`
# works on old and new versions alike.
# Assumes y has exactly two classes - TODO confirm.
nb3.feature_log_prob_[1:]

array([[-10.94353554, -10.94353554, -10.94353554, ..., -10.94353554,
        -10.94353554, -10.94353554]])

In [184]:
# Feature -> log-probability under the second class (classes_[1]).
# `feature_log_prob_[1]` equals the former `coef_[0]` for a two-class model
# and avoids the deprecated `coef_` attribute.
# Assumes y has exactly two classes - TODO confirm.
coefs = pd.Series(nb3.feature_log_prob_[1], index=tfid3.get_feature_names())


In [185]:
# The 30 features with the highest log-probability.
coefs.sort_values(ascending=False).iloc[:30]

# Notes:
# - add words to the stop words, e.g. 'twitter', 'com'
# - add precision, accuracy, recall, f1, etc.
# - min, max
# - max_features: overfitting? ~ regularization
#
# TODO: translation?
# - deep-learning network to look for a better score
# - tests

know        -7.151806
yeah        -7.573594
people      -7.693193
lot         -7.773834
things      -7.829781
good        -8.027408
time        -8.091605
mean        -8.173863
feel        -8.241994
guess       -8.312596
pretty      -8.399882
life        -8.438185
kind        -8.456745
something   -8.490391
hard        -8.524653
see         -8.541009
different   -8.576550
sleep       -8.579050
school      -8.588936
years       -8.622163
better      -8.625952
kinda       -8.629440
person      -8.671943
love        -8.692099
ago         -8.719000
anything    -8.724365
day         -8.739806
job         -8.746987
never       -8.772392
basically   -8.773210
dtype: float64

# save model

In [75]:
import joblib

In [76]:
# Wrapped in a string literal, so nothing is written when this cell runs;
# unwrap it to save the fitted classifier to 'nb3_model.sav'.
'''joblib.dump(nb3, 'nb3_model.sav')'''

['nb3_model.sav']

In [78]:
# Wrapped in a string literal, so nothing is written when this cell runs;
# unwrap it to save the fitted vectorizer to 'tfid3_vectorizer.sav'.
'''joblib.dump(tfid3 , 'tfid3_vectorizer.sav')'''

['tfid3_vectorizer.sav']

In [None]:
# Template for reloading a saved model, kept in a string literal so it is not executed.
# NOTE(review): the snippet lacks the closing parenthesis on print(result, and
# filename / X_test / Y_test are not defined in the visible cells.
'''loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, Y_test)
print(result'''