# Introducing the Naive Bayes Classifier

Now we will use annotated data to "learn" a sentiment classifier.

In [3]:
# We first install the new dependency: nlpia (03_dit_coli_naivebayes.ipynb)
! pip install nlpia

Collecting nlpia
  Using cached nlpia-0.5.2-py2.py3-none-any.whl (32.0 MB)
Collecting pypandoc
  Using cached pypandoc-1.7.4.tar.gz (30 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting pugnlp
  Using cached pugnlp-0.2.6-py2.py3-none-any.whl (706 kB)
Collecting keras
  Using cached keras-2.8.0-py2.py3-none-any.whl (1.4 MB)
Collecting lxml
  Downloading lxml-4.8.0-cp38-cp38-win32.whl (3.3 MB)
     ---------------------------------------- 3.3/3.3 MB 4.0 MB/s eta 0:00:00
Collecting pandas-datareader
  Using cached pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
Collecting html2text
  Using cached html2text-2020.1.16-py3-none-any.whl (32 kB)
Collecting plotly
  Using cached plotly-5.6.0-py2.py3-no

  error: subprocess-exited-with-error
  
  Building wheel for h5py (pyproject.toml) did not run successfully.
  exit code: 1
  
  [71 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win32-3.8
  creating build\lib.win32-3.8\h5py
  copying h5py\ipy_completer.py -> build\lib.win32-3.8\h5py
  copying h5py\version.py -> build\lib.win32-3.8\h5py
  copying h5py\__init__.py -> build\lib.win32-3.8\h5py
  creating build\lib.win32-3.8\h5py\_hl
  copying h5py\_hl\attrs.py -> build\lib.win32-3.8\h5py\_hl
  copying h5py\_hl\base.py -> build\lib.win32-3.8\h5py\_hl
  copying h5py\_hl\compat.py -> build\lib.win32-3.8\h5py\_hl
  copying h5py\_hl\dataset.py -> build\lib.win32-3.8\h5py\_hl
  copying h5py\_hl\datatype.py -> build\lib.win32-3.8\h5py\_hl
  copying h5py\_hl\dims.py -> build\lib.win32-3.8\h5py\_hl
  copying h5py\_hl\files.py -> build\lib.win32-3.8\h5py\_hl
  copying h5py\_hl\filters.py -> build\lib.win32-3.8\h5py\_hl
  copying h5p

Collecting nlpia
  Using cached nlpia-0.5.2-py2.py3-none-any.whl (32.0 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.0.2-cp38-cp38-win32.whl (6.4 MB)
Collecting matplotlib
  Downloading matplotlib-3.5.1-cp38-cp38-win32.whl (7.1 MB)
     ---------------------------------------- 7.1/7.1 MB 4.1 MB/s eta 0:00:00
Collecting plotly
  Using cached plotly-5.6.0-py2.py3-none-any.whl (27.7 MB)
Collecting html2text
  Using cached html2text-2020.1.16-py3-none-any.whl (32 kB)
Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
     ---------------------------------------- 50.5/50.5 KB 2.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pypandoc
  Using cached pypandoc-1.7.4.tar.gz (30 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: 

In [2]:
# Loading the dependencies
import pandas as pd
from collections import Counter

from nlpia.data.loaders import get_data 

# The casual tokenizer can handle emoticons, unusual punctuation and slang better than the TreeBank tokenizer
from nltk.tokenize import casual_tokenize

ModuleNotFoundError: No module named 'nlpia'

## Setting up the "corpus"

We load the `hutto_movies` corpus with nlpia's `get_data` loader.

In [15]:
# Fetch the 'hutto_movies' dataset through the nlpia data loader
movies = get_data('hutto_movies')

# Preview the first rows, with numeric values rounded to two decimals
movies.head().round(decimals=2)

NameError: name 'get_data' is not defined

### Getting a description of the data (look at the range)

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals
movies.describe().round(decimals=2)

In [None]:
# Limit the display width to 75 so that wide DataFrames
# are easier to read in the console
pd.set_option('display.width', 75)
movies['sentiment']

### Loading the data into a DataFrame through a list of dictionaries

In [None]:
# One bag of words (token -> number of occurrences) per movie text
bags_of_words = [Counter(casual_tokenize(text)) for text in movies.text]

# DataFrame.from_records() is a DataFrame constructor:
#   INPUT:  a sequence (here a list) of dictionaries
#   OUTPUT: a DataFrame with one column per key and the associated values
#           (keys missing from a dictionary become NaN!)
df_bows = pd.DataFrame.from_records(bags_of_words)
print(df_bows)

In [None]:
# Tokens that do not occur in a text are NaN at this point:
# replace them with a count of 0 and store all counts as integers
df_bows = df_bows.fillna(value=0).astype(dtype=int)
print(df_bows)

### Let us look at the shape

Spoiler: a BoW can explode in size, even more so when no normalisation is applied at all.


In [None]:
# (rows, columns) of the bag-of-words table: one row per text, one column per distinct token
df_bows.shape

Now, let us see the first instances (it is quite sparse)

In [None]:
# First rows of the bag-of-words table
df_bows.head()


**Homework**: Integrate the normalisation pipeline (lowercasing, stopword removal, and stemming or lemmatisation) and see how the DataFrame is affected.

In [None]:
# write your code here
# (the `None` below is only a placeholder expression; replace it with your solution)
None

In [None]:
# For each of the first two texts, show (for the first rows of the table)
# only the columns of the tokens that occur in that text
for doc_idx in (0, 1):
    print(df_bows.head()[list(bags_of_words[doc_idx].keys())])

### Build the Naive Bayes classifier

All the data is now ready. Let us build a Multinomial NB.

Multinomial NB is suitable for discrete features (e.g., word counts for text classification). 

In [None]:
# Multinomial Naive Bayes classifier from scikit-learn, created with its default settings
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# Turn the sentiment score into a binary class:
# True where the score is above zero, False otherwise
movies['sentiment'].gt(0)

Now we can train ("fit") our model

In [None]:
# The float sentiment score is converted into a Boolean label (score > 0),
# as this classifier only supports discrete labels
nb = nb.fit(X=df_bows, y=movies.sentiment > 0)

### We have a model and we can predict!

In [None]:
# predict_proba() gets continuous-valued predictions: one score per class
# for every row (two classes here, since the labels are Boolean).
# We multiply by 8 and subtract 4 to convert the output to the range [-4, 4].
#
# NOTE: there seems to be an error in the book code:
#   movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4
# predict_proba returns the scores for all the classes (2), and we aim at
# assigning only the one for the positive class, so a single column is kept.
predictions = nb.predict_proba(df_bows) * 8 - 4

# The columns of predict_proba follow the order of nb.classes_; look up the
# column of the positive (True) class instead of hard-coding index 1, and
# select it in one vectorised step.
positive_col = list(nb.classes_).index(True)
movies['predicted_sentiment'] = predictions[:, positive_col]

movies

Now, we compute the Mean Absolute Error (MAE), "a measure of difference between two continuous variables".

In [None]:
# Per-row absolute difference between the predicted and the gold sentiment
movies['error'] = movies['predicted_sentiment'].sub(movies['sentiment']).abs()
# Averaging these differences gives the mean absolute error (MAE)
round(movies['error'].mean(), 2)

In [None]:
# abs(n)

# abs(5) -> 5
# abs(-34) -> 34
# abs(0) -> 0

Now, let us see some gold and predicted sentiments, together with the binary classification

In [None]:
# 1 where the gold-standard sentiment is above zero, 0 otherwise
movies['sentiment_ispositive'] = movies['sentiment'].gt(0).astype(int)

# 1 where the predicted sentiment is above zero, 0 otherwise
movies['predicted_ispositive'] = movies['predicted_sentiment'].gt(0).astype(int)

# Gold standard next to the prediction, for the first 8 rows
overview_columns = 'sentiment predicted_sentiment sentiment_ispositive predicted_ispositive'.split()
movies[overview_columns].head(8)

In [None]:
# Fraction of rows whose predicted "thumbs up" flag matches the gold-standard flag
(movies['predicted_ispositive'] == movies['sentiment_ispositive']).mean()


## not bad at all!