In [None]:
import hashlib # for grading

# Standard imports
import numpy as np
from numpy.testing import assert_allclose
import pandas as pd
import re
import string
import math
import warnings; warnings.simplefilter('ignore')

# NLTK imports
import nltk
nltk.download('stopwords')

from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# SKLearn related imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Q1. Cars Price

For the first question, you will be making use of regex to extract some information from the dataset `cars.txt` in the data folder. In this dataset, you'll find a list of cars that have been sold, as well as their brand, model and selling price.

Start by loading the data into a list of sentences. Each sentence is separated by a new line in the `.txt` file:


In [None]:
# Load the sold-car records: one record per line, with surrounding whitespace trimmed.
path = "data/cars.txt"
cars = []
with open(path, 'r', encoding='utf-8') as f:
    cars = [line.strip() for line in f]

In [None]:
cars[:10]

In the first item, for example, `FORD` is the brand name, `Focus` is the model, and `19757` is the price.

#### Q1.a)

First, we want to see which `TOYOTA` cars have been sold. Return the full strings corresponding to cars that belong to this brand in a list assigned to a variable `ans`.

In [None]:
# ans = [ ... ]

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert len(ans) == 111
assert hashlib.sha256(' '.join(sorted(ans)).encode()).hexdigest() == \
    'e187e5b04c2a7cb863905c1abe8d5a791782dd80a1e3ae08430978e0bc5623e1'

### Q1.b)

Next, find all cars whose model is a set of numbers instead of characters. For example, `'BMW -- 535 -- 23521'`.

Return the full strings corresponding to these cars in the variable `ans_models`.

In [None]:
# ans_models = ...

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert len(ans_models) == 73
assert hashlib.sha256(' '.join(sorted(ans_models)).encode()).hexdigest() == \
    '5f08454f40915f9cd36de32ed145322f618d01051db706c1f6f81c5e8877c8e7'

### Q1.c)

Finally, get the car brands and models whose selling price was below 1000.

Save the results in the list `ans_price`. Each element on this list should be in the format `BRAND -- MODEL`

In [None]:
# ans_price = ...

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert len(ans_price) == 77
assert hashlib.sha256(' '.join(sorted(ans_price)).encode()).hexdigest() == \
    '7b0ee55a70f3aab0435a140162df3f89aff25ba1b3b1c7b9c2b7d43e1660044d'

# Q2. Job postings (preprocessing)

The challenge of this exercise notebook is to classify job postings as 'Fake' or 'Real' using the posts' text. To do that, we first need to preprocess the data.

Let's start by briefly analyzing it:

In [None]:
df = pd.read_csv('data/job_postings.csv', index_col=0)

X = df['description']
y = df['fraudulent']

Let's look at an example of a job description and its corresponding label:

In [None]:
df.iloc[10]['description']

In [None]:
df.iloc[10]['fraudulent']

Let's check the data size and distribution of classes:

In [None]:
def get_data_stats(X, y):
    """
    Print the number of samples and how many samples fall into each class.

    Args:
    X - collection of samples (only its length is used)
    y - labels corresponding to the samples in X
    """
    print(f"Size of dataset: {len(X)}")
    labels, label_counts = np.unique(y, return_counts=True)
    class_distribution = {label: count for label, count in zip(labels, label_counts)}
    print(f"Distribution of classes: {class_distribution}")

get_data_stats(X, y)

The classes are evenly distributed. We'll use dev and test sets to be able to identify overfitting and check the performance on unseen data.

**Note**: So far you've used the `train`/`val`/`test` nomenclature for naming variables related to training, validation and test sets, respectively. `dev` is short for "development" and is just another typical identifier for the validation set, and we'll use it throughout this notebook instead of `val`

In [None]:
# Train / dev / test split, stratified on the label at both stages.
# Stage 1 keeps 80% of the data for training and holds out 20%.
# Stage 2 divides the held-out part 70/30 into dev and test
# (i.e. roughly 14% and 6% of the full dataset).
# random_state is fixed so the split is reproducible across runs.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42, stratify=y_temp)
print(f"Train size: {len(X_train)}\nDev size: {len(X_dev)}\nTest size: {len(X_test)}")

Since the goal is to turn the strings in X into useful features, now we will be performing common preprocessing operations on the texts.

#### Q2.a)

First, tokenize the data. Implement the function to receive a list of strings and an NLTK-style tokenizer, and return the list but with tokenized strings.

In [None]:
def apply_tokenizer(data, tokenizer):
    """
    Returns a list of sentences that have been tokenized with the provided `tokenizer`
    
    E.g. for an input ["This is a test!", "No, it can't be"],
         it should return ["This is a test !", "No , it can ' t be"]
    
    Args:
    data - list of strings containing the text to tokenize
           (the notebook also passes a pandas Series of strings)
    tokenizer - nltk tokenizer

    Returns:
    A Python `list` of `str`, one per input text, where the tokens of each text
    are joined by a single space (the check cell asserts both the `list` type
    and the `str` element type, and counts tokens by splitting on " ").
    """
    
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
tokenizer = WordPunctTokenizer()
data_tok = apply_tokenizer(X_train, tokenizer)

assert len(data_tok) == 1530
assert isinstance (data_tok, list)
assert all(isinstance(sentence, str) for sentence in data_tok)

assert len([w for s in data_tok for w in s.split(" ")]) == 665508
assert hashlib.sha256(data_tok[100].encode()).hexdigest() == \
    '80a21ceb29cbdfda6f15e4c2def79894ca64f4231a8a9a27dbe9805d7c1a8c55'
assert hashlib.sha256(data_tok[875].encode()).hexdigest() == \
    '39c7f9fa57ecc57060c6db546fbdb4a2de201e41c6a6a466286c977039e94fdd'

#### Q2.b)

The second step you will implement is lowercasing the data.

In [None]:
def apply_lowercase(data):
    """
    Returns a list of strings, with all the tokens lowercased.
    
    Args:
    data - list of strings to be lowercased

    Returns:
    A Python `list` of `str` with the same length as `data`; the check cell
    asserts that no character in the result is uppercase.
    """

    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
data_tok_lc = apply_lowercase(data_tok)

assert len(data_tok_lc) == 1530
assert isinstance (data_tok_lc, list)
assert all(isinstance(sentence, str) for sentence in data_tok_lc)
assert not any(c.isupper() for s in data_tok_lc for w in s.split(" ") for c in w)

assert len([w for s in data_tok_lc for w in s.split(" ")]) == 665508
assert hashlib.sha256(data_tok_lc[56].encode()).hexdigest() == \
    '1f5cf4311e169eb2f53d8b06f556362d06358a3259f302fa56bdf06dcd06136a'
assert hashlib.sha256(data_tok_lc[785].encode()).hexdigest() == \
    '4cb3727db93c102e455124d672acc00c629bb65f71a73a1a48f3ceb31619f9ff'

#### Q2.c)

Now implement a function that filters the stopwords. We will use NLTK's built-in English stopword list.

In [None]:
stopword_list = stopwords.words('english')

In [None]:
def apply_filter_stopwords(data, stopword_list):
    """
    Returns a list of strings, where the strings do not contain any of
        the stopwords in the given list.
    
    Args:
    data - list of strings to filter stopwords from; tokens are expected to be
           separated by single spaces (the check cell splits on " ")
    stopword_list - list of stopwords to filter out

    Returns:
    A Python `list` of `str` with the same length as `data`, in which no
    space-separated token is a member of `stopword_list`.
    """
    
    # Filter the stopwords from the text
    # data_no_stopwords = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    return data_no_stopwords


In [None]:
data_tok_lc_nosw = apply_filter_stopwords(data_tok_lc, stopword_list)

assert len(data_tok_lc_nosw) == 1530
assert isinstance (data_tok_lc_nosw, list)
assert all(isinstance(sentence, str) for sentence in data_tok_lc_nosw)
assert not any(any(w in stopword_list for w in s.split(" ")) for s in data_tok_lc_nosw)

assert len([w for s in data_tok_lc_nosw for w in s.split(" ")]) == 472520
assert hashlib.sha256(data_tok_lc_nosw[325].encode()).hexdigest() == \
    '4b114a2e69b98612a8d5e6b47fc41411d3414f3a9f21740cc75eb9451787cf01'
assert hashlib.sha256(data_tok_lc_nosw[4].encode()).hexdigest() == \
    'ed242f390891d3f8dd5ef8b27f8f5e7031d8b3338c2790a7989c8bb0928fed9c'

#### Q2.d)

After filtering stopwords, we want to remove punctuation from the text as well. Consider only the tokens in `string.punctuation` to be single punctuation characters. Make sure to remove all punctuation and not only tokens that are single punctuation characters. 

_Hint: check the note on punctuation in Part 2 of the learning notebooks_

In [None]:
def apply_filter_punct(data):
    """
    Returns a list of tokenized sentences with no punctuation.

    Per the exercise statement, punctuation means the characters in
    `string.punctuation`, and it must be removed everywhere, not only
    when a token is a single punctuation character.
    Any extra whitespace left behind is cleaned up afterwards by
    `normalize_whitespace` before the results are checked.
    
    Args:
    data - list of tokenized sentences from which to remove punctuation

    Returns:
    A list of strings, one per input sentence.
    """

    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
data_tok_lc_nosw_nopunct = apply_filter_punct(data_tok_lc_nosw)

#### Normalize whitespaces

Run the following function on `data_tok_lc_nosw_nopunct` before checking your answers, in case extra whitespaces cause the asserts to fail:

In [None]:
def normalize_whitespace(data):
    """
    Trim and collapse whitespace in every text of `data`.

    Leading and trailing whitespace is removed, and within a run of whitespace
    only the first whitespace character is kept.

    Args:
    data - iterable of strings to normalize

    Returns:
    A new list with one normalized string per input text.
    """
    extra_whitespace = re.compile(r"^\s+|\s+$|(?<=\s)\s*")
    normalized = []
    for text in data:
        normalized.append(extra_whitespace.sub("", text))
    return normalized

data_tok_lc_nosw_nopunct_norm = normalize_whitespace(data_tok_lc_nosw_nopunct)

In [None]:
assert len(data_tok_lc_nosw_nopunct_norm) == 1530
assert len([w for s in data_tok_lc_nosw_nopunct_norm for w in s.split(" ")]) == 379730
assert hashlib.sha256(data_tok_lc_nosw_nopunct_norm[74].encode()).hexdigest() == \
    '377fd8129ab959bb9b184093e6afe2a7812a579484bd63fdc28245736c37e442'
assert hashlib.sha256(data_tok_lc_nosw_nopunct_norm[965].encode()).hexdigest() == \
    '8ea25acf67aa0a5a36d5733f72cc7c3f345336c8044f390bee3b63ad31f8ac15'

#### Q2.e)

The last preprocessing step you are going to implement is stemming. Implement the function to receive an NLTK-style stemmer and return the text as a string with the stemmer applied.

In [None]:
def apply_stemmer(data, stemmer):
    """
    Returns a list of strings, with stemmed data.
    
    Args:
    data - list with text to stem; tokens are expected to be separated by
           single spaces
    stemmer - instance of stemmer to use

    Returns:
    A list of strings with the same length as `data`. The check cell expects
    the total number of space-separated tokens to be unchanged by stemming.
    """

    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
stemmer = SnowballStemmer("english")
data_tok_lc_nosw_nopunct_norm_stem = apply_stemmer(data_tok_lc_nosw_nopunct_norm, stemmer)

In [None]:
stemmer = SnowballStemmer("english")
data_tok_lc_nosw_nopunct_norm_stem = apply_stemmer(data_tok_lc_nosw_nopunct_norm, stemmer)

assert len(data_tok_lc_nosw_nopunct_norm_stem) == 1530
assert len([w for s in data_tok_lc_nosw_nopunct_norm_stem for w in s.split(" ")]) == 379730
assert hashlib.sha256(data_tok_lc_nosw_nopunct_norm_stem[854].encode()).hexdigest() == \
    '59b37c27fe05b0e3d20df96df763b82dca7ec28fe2bd0164c10d257db852a63d'
assert hashlib.sha256(data_tok_lc_nosw_nopunct_norm_stem[21].encode()).hexdigest() == \
    'c116aa8dbd9c3f4785aae448aad52c2e9b2d9fd5b0f3b28e451affd8f3e3e8f1'

#### Q2.f)

Finally, join everything in a function, that applies the steps in the following order:
* Tokenization
* Lowercasing
* Filtering stopwords
* Filtering punctuation
* Normalizing whitespace
* Stemming

Make use of the functions you designed above when filling the transformer below.


In [None]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    """
    Configurable text cleaner that chains the preprocessing steps of Q2 in a
    fixed order: tokenization, lowercasing, stopword filtering, punctuation
    filtering, whitespace normalization and stemming.

    Tokenization and whitespace normalization are always applied; the other
    steps are controlled by the constructor arguments.

    Args:
    tokenizer - nltk-style tokenizer used for the tokenization step
    lower - if True, lowercase the text
    remove_punct - if True, remove punctuation
    stopwords - list of stopwords to filter out; None or an empty list
                skips the stopword step
    stemmer - nltk-style stemmer instance; None skips the stemming step
    """

    def __init__(self, tokenizer, lower=True, remove_punct=True, stopwords=None, stemmer=None):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.lower = lower
        self.remove_punct = remove_punct
        # A `[]` default in the signature is created once and would be shared
        # (through this attribute) by every instance built without `stopwords`,
        # so mutating it on one instance would leak into all the others.
        # Use None as the sentinel and build a fresh list per instance.
        self.stopwords = [] if stopwords is None else stopwords

    def clean_sentences(self, data):
        """
        Apply the configured preprocessing steps to `data` and return the
        cleaned sentences.

        Args:
        data - texts to clean

        Returns:
        The preprocessed sentences (`sentences_preprocessed`).
        """

        # Tokenize so that each sentence becomes a string of space-separated tokens
        # sentences_preprocessed = ...
        # YOUR CODE HERE
        raise NotImplementedError()

        # Lowercase
        if self.lower:
            # sentences_preprocessed = ...
            # YOUR CODE HERE
            raise NotImplementedError()

        # Filter stopwords (skipped when the stopword list is empty)
        if self.stopwords:
            # sentences_preprocessed = ...
            # YOUR CODE HERE
            raise NotImplementedError()

        # Remove punctuation
        if self.remove_punct:
            # sentences_preprocessed = ...
            # YOUR CODE HERE
            raise NotImplementedError()

        # Normalize whitespace
        # sentences_preprocessed = ...
        # YOUR CODE HERE
        raise NotImplementedError()

        # Stem words
        if self.stemmer:
            # sentences_preprocessed = ...
            # YOUR CODE HERE
            raise NotImplementedError()

        return sentences_preprocessed


In [None]:
text_cleaner = TextCleanerTransformer(
    WordPunctTokenizer(),
    lower=True, 
    remove_punct=True, 
    stopwords=stopwords.words('english'),
    stemmer=SnowballStemmer("english"),
)

X_train_pre = text_cleaner.clean_sentences(X_train)

In [None]:
assert len(X_train_pre) == 1530
assert len([w for s in X_train_pre for w in s.split(" ")]) == 379730
assert X_train_pre[1352][:150] == ('traffic manag web mobil mx mex mexico nan companyadcash ® intern advertis network deliv billion ad unit month reach sever million peopl around globe s')
assert X_train_pre[342][:150] == ('physic therapist us ca orovill nan welcom interfac rehabinterfac rehab provid comprehens rehabilit physic occup amp speech therapi amp consult servic ')

# Q3. Text classification

We will now use what we've learned to try to classify the job postings as fake or real. Let's first load the preprocessed data and double-check the balance of the classes.

We are loading the preprocessed csv file here. This way you won't be penalized if you were not able to finish the preprocessing part.

In [None]:
def load_dataset(file_name):
    """
    Read a job-postings csv file and split it into texts and labels.

    The first column of the file is used as the dataframe index.

    Args:
    file_name: path to input file

    Returns:
    A tuple (texts, labels): the values of the 'description' column and the
    values of the 'fraudulent' column, each as a list in file order.
    """
    frame = pd.read_csv(file_name, index_col=0)
    texts = list(frame['description'])
    labels = list(frame['fraudulent'])
    return texts, labels

In [None]:
X_train_pre, y_train = load_dataset('data/job_postings_train_preprocessed.csv')
X_dev_pre, y_dev = load_dataset('data/job_postings_dev_preprocessed.csv')
X_test_pre, y_test = load_dataset('data/job_postings_test_preprocessed.csv')

In [None]:
get_data_stats(X_train_pre, y_train)

In [None]:
get_data_stats(X_dev_pre, y_dev)

So, we should be aiming for much better than 45% accuracy, which is what we would get if we naively predicted `1` (fake) for everything.

#### Q3.a)

First, we'll look at the top X ngrams in each category to see if anything is interesting. Write a function that returns the most common n-grams and their count for a specific label in our dataset.


In [None]:
def top_ngrams_for_category(text, labels, filter_label, top_n=10, ngram_size=1):
    """
    Filters the data to the desired label, constructs a counter of ngrams
    Returns the top n ngrams
    
    Args:
    text: list of text strings to get ngrams from
    labels: categories corresponding to text
    filter_label: the label to filter the data on before getting ngrams
    top_n: top n ngrams to return
    ngram_size: the "n" in ngram (e.g. if ngram_size=2, return only bigrams)

    Returns:
    A list of (ngram, count) pairs ordered from most to least frequent, where
    each ngram is a tuple of tokens, e.g. [(('full', 'time'), 745), ...]
    (format taken from the check cell below).
    """
    # First, filter text to desired category
    # text_filtered = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Create list of ngrams
    # ngram_list = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Count occurrences of each ngram
    # hint: use collections.Counter
    # NOTE: `collections` is not imported in the notebook's import cell, so import it first
    # ngram_counter = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # return top_n most common ngrams
    # hint: use a method of collections.Counter
    # return ...
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
top_10_unigrams_real = top_ngrams_for_category(X_train_pre, y_train, 0, top_n=10, ngram_size=1)
assert top_10_unigrams_real == [(('nan',), 3199),
                                    (('work',), 2689),
                                    (('experi',), 1966),
                                    (('manag',), 1854),
                                    (('team',), 1830),
                                    (('servic',), 1773),
                                    (('develop',), 1706),
                                    (('custom',), 1624),
                                    (('compani',), 1430),
                                    (('time',), 1428)]
top_6_unigrams_fake = top_ngrams_for_category(X_train_pre, y_train, 1, top_n=6, ngram_size=1)
assert top_6_unigrams_fake == [(('nan',), 3318),
                                  (('work',), 1828),
                                  (('manag',), 1303),
                                  (('experi',), 1287),
                                  (('time',), 1221),
                                  (('servic',), 1187)]
top_5_bigrams_real = top_ngrams_for_category(X_train_pre, y_train, 0, top_n=5, ngram_size=2)
assert top_5_bigrams_real == [(('nan', 'nan'), 1349),
                                  (('full', 'time'), 745),
                                  (('custom', 'servic'), 398),
                                  (('bachelor', 'degre'), 308),
                                  (('year', 'experi'), 211)]
top_10_trigrams_fake = top_ngrams_for_category(X_train_pre, y_train, 1, top_n=10, ngram_size=3)
assert top_10_trigrams_fake == [(('nan', 'nan', 'nan'), 851),
                                 (('nan', 'full', 'time'), 165),
                                 (('time', 'nan', 'nan'), 138),
                                 (('high', 'school', 'equival'), 134),
                                 (('full', 'time', 'nan'), 131),
                                 (('oil', 'gas', 'industri'), 123),
                                 (('time', 'entri', 'level'), 116),
                                 (('full', 'time', 'entri'), 104),
                                 (('level', 'high', 'school'), 92),
                                 (('mid', 'senior', 'level'), 89)]

Looking at the top ngrams for each category, it doesn't seem like a BoW model will be very interesting, but let's try anyway.

#### Q3.b)
To begin, let's streamline our pipeline in a nice function. We'll use sklearn's `CountVectorizer` instead of the function we wrote. We'll also use the `LogisticRegression` classifier to make predictions on the dev set.

In [None]:
def train_and_validate(X_train, X_dev, y_train, y_dev, ngram_range=(1,1), max_features=None):
    """
    Train a model using sklearn's Pipeline and return it along with the predictions and the
    current accuracy in the validation set. Print the classification report as well.
    Assume the documents are already preprocessed
    
    Args:
    X_train - preprocessed job postings in training data
    X_dev - preprocessed job postings in dev data
    y_train - labels of training data
    y_dev - labels of dev data
    ngram_range - ngram range to use in CountVectorizer (tuple)
    max_features - max number of features to use in CountVectorizer (int)

    Returns:
    (text_clf, y_dev_pred, acc) - the fitted pipeline, its predictions for X_dev
    and the accuracy of those predictions against y_dev
    """
    
    # Build the pipeline containing the CountVectorizer and the LogisticRegression classifier.
    # Later cells access the steps as clf['vect'] and clf['clf'], so name the steps
    # 'vect' (vectorizer) and 'clf' (classifier).
    # text_clf = Pipeline(...)
    
    # Train the classifier
    # (...)

    # Predict on the dev set
    # y_dev_pred = (...)
    
    # Accuracy on the dev set
    # acc = (...)
    
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # print the classification report
    print(classification_report(y_dev, y_dev_pred))

    # return text_clf, y_dev_pred, acc
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
clf, y_dev_pred, acc = train_and_validate(X_train_pre, X_dev_pre, y_train, y_dev)

# check same as before
assert_allclose(clf['clf'].intercept_, np.array([0.73381536]), rtol=1e-3)
assert ' '.join(str(i) for i in y_dev_pred[:20]) == "0 0 1 1 0 0 0 0 1 1 0 1 1 1 0 1 1 1 0 0"
assert hashlib.sha256(' '.join(str(i) for i in y_dev_pred).encode()).hexdigest() == \
    "7ed40ca221aaaf17d3647bd204045ae4799558b377e8ec37fbc464ee9731113a"
assert_allclose(acc, 0.90, rtol=1e-2)

In [None]:
# we should also look at some misclassified examples
for text, pred, true in zip(X_dev_pre[:70], y_dev_pred[:70], y_dev[:70]):
    if pred != true:
        print(f"Sentence: {text}")
        print(f"Predicted: {pred}, Actual: {true}\n")

So just with the simplest BoW model we already get an accuracy of 0.90! But let's see if we can do even better... In the misclassified examples, the last one even contains a strange url but was classified as 'fake'. That's suspicious...

#### Q3.c)
Run the pipeline for different ngram ranges and/or with different values for `max_features`. Try to achieve an accuracy higher than or equal to 0.90.

In [None]:
# clf,y_dev_pred, acc = train_and_validate(...)
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert(acc >= 0.90)

Now evaluate your model on the test set!

In [None]:
X_test_vec = clf['vect'].transform(X_test_pre)
y_test_pred = clf['clf'].predict(X_test_vec)
print(classification_report(y_test, y_test_pred))

Great! We were able to improve the model a little on the dev set and the performance on the test set is pretty good!

## Q4. TF-IDF

Similarly to how we found the top ngrams before we started working with BoW, we will now find the most important unigrams, inverse weighted by document frequency.

**Note**: Throughout this exercise, we'll use this term - **most important** - to refer to the unigrams of the highest weight, in particular within each class, since these will be transferred into vectors used for training. It is not 100% true these features will necessarily be "the most important", but in general they will be relevant, especially if your dataset is correctly processed, so we'll use this expression as a proxy.

#### Q4.a)

First, implement TF-IDF on a dataframe representing a Bag of Words model of the data. Here is a reminder of the TF-IDF formula:

$$ \mathrm{tfidf}_{t,d} = \log\left(1 + tf_{t,d}\right) \cdot \log\left(1 + \frac{N}{df_{t}}\right) $$


In [None]:
# We'll start you off with the BoW representation, in pandas dataframe format
vec = CountVectorizer()
BoW_train = vec.fit_transform(X_train_pre)
BoW_train_df = pd.DataFrame(BoW_train.todense())

In [None]:
def tfidf(BoW_df):
    """
    Returns pandas dataframe of a tfidf representation from a BoW representation dataframe.

    Implements tfidf(t, d) = log(1 + tf(t, d)) * log(1 + N / df(t)), using natural logs.
    Presumably N is the number of documents (rows of BoW_df) and df(t) is the number
    of documents in which term t has a non-zero count - confirm against the learning notebook.

    Args:
    BoW_df - dataframe with document word counts (Bag of Words);
             rows are documents, columns are vocabulary terms

    Returns:
    tf_idf - dataframe of tfidf weights, indexable with .iloc[row, column]
             like BoW_df (the check cell reads individual cells this way)
    """
    # remember that the BoW representation is raw counts, it is not normalized by the length of each text
    # first transform the df into term frequencies, where the counts are normalized
    # also double check the formula above for additional transformations applied to the tf expression
    # use np.log(x) for natural base log
    # tf = (...)
    # YOUR CODE HERE
    raise NotImplementedError()

    # now we need a function that computes the idf side of the expression
    # it operates over a column (a word in the vocab), where the column contains each doc's count of that word
    # def _idf(column):
    #   return (...)
    # YOUR CODE HERE
    raise NotImplementedError()

    # now weight the term frequencies by the idfs
    # tf_idf = (...)
    # YOUR CODE HERE
    raise NotImplementedError()

    return tf_idf

In [None]:
tfidf_df = tfidf(BoW_train_df)
assert math.isclose(tfidf_df.iloc[1525, 1], 0.0175562, abs_tol=0.0001)
assert math.isclose(tfidf_df.iloc[0, 2505], 0.003557, abs_tol=0.0001)
assert math.isclose(tfidf_df.iloc[344, 3405], 0.0194850, abs_tol=0.0001)

### Q4.b)

Now that we have our TF-IDF representation, we can proceed with getting the most important words per category. 

Let's write a small helper function first to get the vocabulary in the right format.

In [None]:
# get the vocab from CountVectorizer, which is of the format {"word": idx, ...}
vocab_word_2_idx = vec.vocabulary_

In [None]:
# write a function to convert this vocab to the format {idx: "word", ...}

def reverse_vocab(vocab_word_to_index):
    """
    Converts a vocabulary dictionary with words as keys and indices as values to a 
        new dictionary with indices as keys and words as values
    
    Args:
    vocab_word_to_index: vocabulary dict of the format {"word": 0, "hello": 1, ...}

    Returns:
    A new dict of the format {0: "word", 1: "hello", ...} with one entry per
    entry of the input (the check cell asserts the resulting length).
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
vocab_idx_2_word = reverse_vocab(vocab_word_2_idx)
assert len(vocab_idx_2_word) == 18647
assert vocab_idx_2_word[1231] == "amount"
assert vocab_idx_2_word[1639] == "assimil"
assert vocab_idx_2_word[9876] == "memor"

### Q4.c)

Finally, write a function to return the list of N most important words in each category according to TF-IDF.

In [None]:
def top_tfidf_words_for_category(tfidf_df, labels, filter_label, top_n=10):
    """
    Returns the top n most important words for the given label, according to the
    tfidf representation of some text data.

    NOTE(review): no vocabulary is passed as an argument, so mapping column indices
    back to words presumably relies on the notebook-level `vocab_idx_2_word`
    dictionary built in Q4.b - confirm.
    
    Args:
    tfidf_df: a dataframe of the tfidf representation of some text (columns=words, rows=documents)
    labels: categories corresponding to documents in tfidf_df
    filter_label: the label to filter the data on before getting top n words
    top_n: top n words to return

    Returns:
    A list of `top_n` words (strings), ordered from highest to lowest total
    tfidf weight, e.g. ['nan', 'work', 'time', ...] (format taken from the check cell).
    """
    # First, filter tfidf to desired category
    # tfidf_filt = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Get the top n words of the current label, according to tfidf
    # There are several ways to do this, but here are some hints
    # 1) Sum the filtered df to get the total value per word
    # 2) Sort
    # 3) Replace indices with words and return the top_n
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
top_15_fake = top_tfidf_words_for_category(tfidf_df, y_train, 1, top_n=15)
assert top_15_fake == ['nan',
                         'work',
                         'time',
                         'entri',
                         'servic',
                         'custom',
                         'home',
                         'posit',
                         'experi',
                         'skill',
                         'amp',
                         'manag',
                         'administr',
                         'requir',
                         'start']
top_20_real = top_tfidf_words_for_category(tfidf_df, y_train, 0, top_n=20)
assert top_20_real == ['nan',
                       'work',
                       'develop',
                       'experi',
                       'manag',
                       'sale',
                       'custom',
                       'team',
                       'servic',
                       'product',
                       'job',
                       'market',
                       'client',
                       'design',
                       'busi',
                       'compani',
                       'time',
                       'technolog',
                       'nbsp',
                       'web']

### Q4.d)

Now, we will put everything together. Rewrite the `train_and_validate` function from Q3.b but using sklearn's `TfidfTransformer`. Also, add kwargs for CountVectorizer's `max_df` and `min_df`.

In [None]:
def train_and_validate_with_tfidf(X_train, X_dev, y_train, y_dev, ngram_range=(1,1),
                                  max_features=None, max_df=1.0, min_df=1):
    """
    Train a model using sklearn's Pipeline and return it along with the predictions and the
    current accuracy in the validation set. Print the classification report as well.
    Assume the documents are already preprocessed
    
    Args:
    X_train - preprocessed job postings in training data
    X_dev - preprocessed job postings in dev data
    y_train - labels of training data
    y_dev - labels of dev data
    ngram_range - ngram range to use in CountVectorizer (tuple)
    max_features - max number of features to use in CountVectorizer (int)
    max_df - max_df for CountVectorizer (int or float)
    min_df - min_df for CountVectorizer (int or float)

    Returns:
    (text_clf, y_dev_pred, acc) - the fitted pipeline, its predictions for X_dev
    and the accuracy of those predictions against y_dev
    """
    
    # Build a pipeline containing the CountVectorizer, the TfidfTransformer and the
    # classifier (LogisticRegression, as in Q3.b).
    # Later cells access the steps as clf['vect'], clf['tfidf'] and clf['clf'],
    # so use those step names.
    # text_clf = Pipeline(...)
    
    # Train the classifier
    # (...)

    # Predict on the dev set
    # y_dev_pred = (...)
    # Accuracy on the dev set
    # acc = (...)
    
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # print the classification report
    print(classification_report(y_dev, y_dev_pred))

    # return text_clf, y_dev_pred, acc
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
clf, y_dev_pred, acc = train_and_validate_with_tfidf(X_train_pre, X_dev_pre, y_train, y_dev)

assert hashlib.sha256(" ".join([str(x) for x in list(y_dev_pred)]).encode()).hexdigest() == \
    "a63704cd531a94f5299f67f20b1e781e41f2f890825dee89fee6a4b536c05024"
assert_allclose(acc, 0.91, rtol=1e-3)

In [None]:
# As before, we should also look at some misclassified examples
for text, pred, true in zip(X_dev_pre[:70], y_dev_pred[:70], y_dev[:70]):
    if pred != true:
        print(f"Sentence: {text}")
        print(f"Predicted: {pred}, Actual: {true}\n")

Unfortunately, we're still not able to correctly classify those sentences. Let's keep going and see if we can do even better.

#### Q4.e)

Use the `train_and_validate_with_tfidf` function you created before to train with different hyperparameters and get an accuracy score above 92% on the validation dataset. (This threshold is the same as what we got for plain CountVectorizer)


In [None]:
# clf, _, acc = train_and_validate_with_tfidf(...)
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert(acc > 0.92)

Now evaluate your model on the test set!

In [None]:
X_test_vec = clf['tfidf'].transform(clf['vect'].transform(X_test_pre))
y_test_pred = clf['clf'].predict(X_test_vec)
print(classification_report(y_test, y_test_pred))

Great results! We were able to slightly improve the performance on the test set compared with using BoW with n-grams ranging from 1 to 3.