# Naive Bayes Document Classification

### First we import everything we need

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
import sys
import pandas as pd
sys.path.append('/Users/brandonwatts/Desktop/VCU-VIP-Nanoinformatics/Tools')
from TextTools import build_data_frame, clean_text

### Next we load a mapping from each entity label to the directory that holds its documents.

In [3]:
# Load the label -> directory table; the relative path is resolved against
# the notebook's current working directory.
mapping_df = pd.read_csv('../../Data/mapping.csv')
# Bare expression so the notebook renders the table.
mapping_df

Unnamed: 0,Label,Directory
0,NANOINF,../../Data/NanoinformaticsTXT
1,ENV,../../Data/EnvironmentTXT
2,SYN,../../Data/SynthesisTXT
3,TOX,../../Data/ToxicologyTXT


### Now we can obtain the documents from their directories and place them in a DataFrame.

In [5]:
def getDocumentsFrom(mapping_df):
    """Build one DataFrame holding the documents of every mapped directory.

    For each row of ``mapping_df`` the ``Directory`` and ``Label`` values are
    handed to ``build_data_frame``; the resulting frames are concatenated in
    row order and returned.
    """
    per_label_frames = [
        build_data_frame(entry["Directory"], entry['Label'])
        for _, entry in mapping_df.iterrows()
    ]
    return pd.concat(per_label_frames)

# Assemble the corpus from every directory listed in the mapping table.
documents = getDocumentsFrom(mapping_df)
# Preview the first rows (must stay last so the notebook displays it).
documents.head()

Unnamed: 0,class,text
../../Data/NanoinformaticsTXT/Tang_International Journal of Nanomedicine_2013.txt,NANOINF,tang al publisher licensee dove medical press ...
../../Data/NanoinformaticsTXT/Kostoff_J.txt,NANOINF,data mining tomography office naval research n...
../../Data/NanoinformaticsTXT/Chiesa_Maojo_INVE_MEM_2008.txt,NANOINF,part berlin building index automatic approach ...
../../Data/NanoinformaticsTXT/Liu_Cohen_CompSciDisc_2013.txt,NANOINF,screening data analysis institute university c...
../../Data/NanoinformaticsTXT/Oksel_Particuology_2015.txt,NANOINF,article contents available jo ur home page loc...


### Our documents are grouped by class, so we need to shuffle them

In [None]:
def shuffle(docs):
    """Return a copy of ``docs`` with its rows in random order.

    Rows are permuted by position rather than by index label, so the
    function also works when the index contains duplicate labels (label-based
    ``reindex`` raises in that case). Each row keeps its original index
    label. Randomness comes from NumPy's global ``np.random`` state, so
    ``np.random.seed`` still controls reproducibility.

    Args:
        docs: DataFrame whose rows should be shuffled.

    Returns:
        A new DataFrame with the same rows and index labels in random order.
    """
    row_order = np.random.permutation(len(docs))
    return docs.iloc[row_order]

# Replace the corpus with a randomly reordered copy.
documents = shuffle(documents)
# Preview the first rows (must stay last so the notebook displays it).
documents.head()

Unnamed: 0,class,text
../../Data/SynthesisTXT/Dong_J Am Chem Soc_2007.txt,SYN,cellulose dong institute department wood scien...
../../Data/NanoinformaticsTXT/delaIglesia_Maojo_PLoS_2014.txt,NANOINF,machine learning approach identify clinical de...
../../Data/EnvironmentTXT/Zhu_Journal of Environmental Science and Health Part A_2008.txt,ENV,journal environmental science health part copy...
../../Data/ToxicologyTXT/Duffin_Inhalation Toxicology_2007.txt,TOX,inhalation toxicology copyright print effects ...
../../Data/SynthesisTXT/Imani_Nanoscale Res Lett_2015.txt,SYN,express open access growth novel urchin contro...


### The text is junk and needs to be cleaned.

In [None]:
# Run every document's text through the project's clean_text helper,
# overwriting the 'text' column with the result.
documents['text'] = documents['text'].map(clean_text)
# Preview the cleaned rows.
documents.head()

### Define our feature (X) and label (y) vectors

In [None]:
# Features: the document text. Selected by column name (as the cleaning
# step does) instead of by position, so a change in column order cannot
# silently swap features and labels.
X = documents['text'].values
X

In [None]:
# Labels: the class of each document. Selected by column name instead of
# by position, so a change in column order cannot silently swap features
# and labels.
y = documents['class'].values
y

### Define our Pipeline

In [None]:
# Two-stage text classifier: unigram + bigram counts feeding a multinomial
# Naive Bayes model.
ngram_counter = CountVectorizer(ngram_range=(1, 2))
naive_bayes = MultinomialNB()
pipeline = Pipeline(steps=[
    ('vectorizer', ngram_counter),
    ('classifier', naive_bayes),
])

### Perform Cross-Validation

In [None]:
# Evaluate the pipeline with k-fold cross-validation, scoring every fold on
# weighted-average F1, precision and recall.
NUMBER_OF_FOLDS = 10
scoreers = {
    metric_name: make_scorer(metric_fn, average='weighted')
    for metric_name, metric_fn in (
        ("f1_scores", f1_score),
        ("precision_scores", precision_score),
        ("recall_scores", recall_score),
    )
}
scores = cross_validate(
    pipeline,
    X,
    y,
    scoring=scoreers,
    cv=NUMBER_OF_FOLDS,
    n_jobs=-1,  # use all available cores
)

# Each scorer's per-fold results are stored under "test_<scorer key>".
f1_scores = scores['test_f1_scores']
precision_scores = scores['test_precision_scores']
recall_scores = scores['test_recall_scores']

# Per-fold report.
per_fold = zip(precision_scores, recall_scores, f1_scores)
for fold, (precision, recall, f1) in enumerate(per_fold):
    print("Fold number: ", fold)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)
    print("\n")

# Summary: mean of each metric over all folds.
print("Averages Across Folds")
print("Precision: ", np.mean(precision_scores))
print("Recall: ", np.mean(recall_scores))
print("F1 Score: ", np.mean(f1_scores))