# Vectorizer Tuning

In [1]:
import pandas as pd

data = pd.read_csv("reviews.csv")

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [2]:
import string 

def remove_punct(df_to_treat):
    dataf = df_to_treat.copy()
    for col in dataf:
        if dataf[col].dtype == 'O':
            for punct in string.punctuation:
                dataf[col] = [text.replace(punct, '') for text in dataf[col]]
    return dataf

def lower_func(df_to_treat):
    dataf = df_to_treat.copy()
    for col in dataf:
        if dataf[col].dtype == 'O':
            for punct in string.punctuation:
                dataf[col] = [text.lower() for text in dataf[col]]
    return dataf

clean_data = lower_func(remove_punct(data))
clean_data['target'] = [0 if k=='neg' else 1 for k in clean_data['target']]
clean_data

Unnamed: 0,target,reviews
0,0,plot two teen couples go to a church party d...
1,0,the happy bastards quick movie review \ndamn t...
2,0,it is movies like these that make a jaded movi...
3,0,quest for camelot is warner bros first fe...
4,0,synopsis a mentally unstable man undergoing p...
...,...,...
1995,1,wow what a movie \nits everything a movie ca...
1996,1,richard gere can be a commanding actor but he...
1997,1,glorystarring matthew broderick denzel washin...
1998,1,steven spielbergs second epic film on world wa...


## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [14]:
len(X), len(y)

(2000, 2000)

In [38]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

X = clean_data.reviews
y = clean_data.target

vectorizer_tfidf = TfidfVectorizer()
mnb = MultinomialNB()

# Create Pipeline
#pipe = make_pipeline([vectorizer_tfidf, mnb])

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search (model and vectorizer)
grid = {'tfidf__ngram_range':[(1,1),(1,2),(2,2)],
       'tfidf__stop_words':[None, {'english'}],
       'tfidf__max_df':[0.2,0.5,0.8,0.9],}

# Perform grid search on pipeline
search = GridSearchCV(pipe, 
                      param_grid = grid, 
                      n_jobs = -1,
                     refit=True)
search.fit(X,y)

GridSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'tfidf__max_df': [0.2, 0.5, 0.8, 0.9],
                         'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'tfidf__stop_words': [None, {'english'}]})

In [42]:
search.best_score_, search.best_estimator_

(0.842,
 Pipeline(steps=[('tfidf',
                  TfidfVectorizer(max_df=0.8, ngram_range=(1, 2),
                                  stop_words={'english'})),
                 ('nb', MultinomialNB())]))

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

X = clean_data.reviews
y = clean_data.target

vectorizer_tfidf = TfidfVectorizer()
mnb = MultinomialNB()

# Create Pipeline
#pipe = make_pipeline([vectorizer_tfidf, mnb])

pipe2 = Pipeline([
    ('tfidf', CountVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search (model and vectorizer)
grid = {'tfidf__ngram_range':[(1,1),(1,2),(2,2)],
       'tfidf__stop_words':[None, {'english'}],
       'tfidf__max_df':[0.2,0.5,0.8,0.9],}

# Perform grid search on pipeline
search = GridSearchCV(pipe2, 
                      param_grid = grid, 
                      n_jobs = -1,
                     refit=True)
search.fit(X,y)

GridSearchCV(estimator=Pipeline(steps=[('tfidf', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'tfidf__max_df': [0.2, 0.5, 0.8, 0.9],
                         'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'tfidf__stop_words': [None, {'english'}]})

In [44]:
search.best_score_, search.best_estimator_

(0.8375,
 Pipeline(steps=[('tfidf',
                  CountVectorizer(max_df=0.2, ngram_range=(2, 2),
                                  stop_words={'english'})),
                 ('nb', MultinomialNB())]))

⚠️ Please push the exercise once you are done 🙃

## 🏁 