# Document Classification with SVM

### First we import everything we need

In [21]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
import sys
import pandas as pd
# Extend the module search path before importing TextTools below
# (presumably TextTools lives in this directory -- confirm).
# NOTE(review): this is an absolute path on one developer's machine, so the
# notebook will not run elsewhere unchanged; consider a repo-relative path.
sys.path.append('/Users/brandonwatts/Desktop/VCU-VIP-Nanoinformatics/Tools')
from TextTools import build_data_frame, clean_text

### Next we load a mapping from each class label to the directory that holds its documents.

In [2]:
# Read the label/directory table from disk, then echo it as the cell output.
mapping_df = pd.read_csv(filepath_or_buffer='../../Data/mapping.csv')
mapping_df

Unnamed: 0,Label,Directory
0,NANOINF,../../Data/NanoinformaticsTXT
1,ENV,../../Data/EnvironmentTXT
2,SYN,../../Data/SynthesisTXT
3,TOX,../../Data/ToxicologyTXT


### Now we can obtain the documents from their directories and place them in a DataFrame.

In [8]:
def getDocumentsFrom(mapping_df):
    """Build one DataFrame from every (Directory, Label) row of ``mapping_df``.

    Each row's ``Directory`` and ``Label`` values are handed to
    ``build_data_frame``; the frames it returns are concatenated in row order.
    """
    frames = [
        build_data_frame(directory, label)
        for directory, label in zip(mapping_df["Directory"], mapping_df['Label'])
    ]
    return pd.concat(frames)

# Load every mapped directory into a single frame and preview five rows.
documents = getDocumentsFrom(mapping_df=mapping_df)
documents.head(n=5)

Unnamed: 0,class,text
../../Data/NanoinformaticsTXT/Tang_International Journal of Nanomedicine_2013.txt,NANOINF,tang al publisher licensee dove medical press ...
../../Data/NanoinformaticsTXT/Kostoff_J.txt,NANOINF,data mining tomography office naval research n...
../../Data/NanoinformaticsTXT/Chiesa_Maojo_INVE_MEM_2008.txt,NANOINF,part berlin building index automatic approach ...
../../Data/NanoinformaticsTXT/Liu_Cohen_CompSciDisc_2013.txt,NANOINF,screening data analysis institute university c...
../../Data/NanoinformaticsTXT/Oksel_Particuology_2015.txt,NANOINF,article content available jo ur home page loca...


### Our documents are in order so we need to shuffle them

In [9]:
def shuffle(docs, random_state=None):
    """Return a new frame holding the rows of ``docs`` in random order.

    Rows are permuted by position rather than looked up by index label, so a
    frame whose index contains repeated labels is shuffled correctly instead
    of raising from ``reindex``.

    Parameters
    ----------
    docs : pandas.DataFrame
        Frame whose rows should be reordered; it is not modified.
    random_state : int, optional
        Seed for a dedicated ``numpy.random.RandomState``, making the shuffle
        reproducible. When ``None`` (the default) the global NumPy random
        state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The same rows, index labels preserved, in shuffled order.
    """
    if random_state is None:
        order = np.random.permutation(len(docs))
    else:
        order = np.random.RandomState(random_state).permutation(len(docs))
    # Positional selection keeps each row intact even when labels repeat.
    return docs.iloc[order]

# Swap the ordered frame for a randomly reordered one and preview five rows.
documents = shuffle(docs=documents)
documents.head(n=5)

Unnamed: 0,class,text
../../Data/ToxicologyTXT/Takahashi_Nanotechnology_2006.txt,TOX,institute physic gold cell death department ph...
../../Data/SynthesisTXT/Li_J Am Chem Soc_2003.txt,SYN,synthesis nearly via successive ion layer adso...
../../Data/EnvironmentTXT/Du_RSC Adv.txt,ENV,paper pu ar ow de view article view journal vi...
../../Data/SynthesisTXT/Bakueva_Adv.txt,SYN,ie ie ie
../../Data/ToxicologyTXT/Lee_Small_2009.txt,TOX,toxicity testing cell culture toxicity toxicit...


### The text is junk and needs to be cleaned.

In [11]:
# Run clean_text over every document body, write the result back into the
# 'text' column, then preview five rows.
documents['text'] = documents['text'].map(clean_text)
documents.head(n=5)

Unnamed: 0,class,text
../../Data/ToxicologyTXT/Takahashi_Nanotechnology_2006.txt,TOX,institute physic gold cell death department ph...
../../Data/SynthesisTXT/Li_J Am Chem Soc_2003.txt,SYN,synthesis nearly via successive ion layer adso...
../../Data/EnvironmentTXT/Du_RSC Adv.txt,ENV,paper pu ar ow de view article view journal vi...
../../Data/SynthesisTXT/Bakueva_Adv.txt,SYN,ie ie ie
../../Data/ToxicologyTXT/Lee_Small_2009.txt,TOX,toxicity testing cell culture toxicity toxicit...


### Define our training vectors 

In [12]:
# Feature vector: the document bodies. Selected by column name (as the
# cleaning cell already does) instead of by position, so a change in column
# order cannot silently swap features and labels.
X = documents['text'].values
X

array([ 'institute physic gold cell death department physic chemistry graduate school engineering university japan department applied chemistry faculty engineering university japan center future chemistry university japan received june final form august abstract combined use gold irradiation cell death laser irradiation also induced spherical since absorption region successive laser irradiation affect advantageous unwanted cell damage following destruction target introduction gold gold unique optical absorption spectrum show two correspond transverse longitudinal surface transverse band visible region around longitudinal band region thus unusual intense absorption band region previously laser irradiation onto longitudinal induced release recently al release enhancement gene expression key factor laser induced spherical photosensitization major research topic photochemistry provide new technology treatment combination light induce selective cell death leaving normal intact particular li

In [13]:
# Target vector: the class label of each document. Selected by column name
# instead of by position, so a change in column order cannot silently swap
# features and labels.
y = documents['class'].values
y

array(['TOX', 'SYN', 'ENV', ..., 'ENV', 'TOX', 'TOX'], dtype=object)

### Define our Pipeline

In [14]:
# Two-stage model: unigram + bigram token counts feeding an RBF-kernel SVM.
# NOTE(review): the counts reach the RBF kernel unscaled; the warnings in the
# recorded cross-validation output may indicate classes that are never
# predicted -- worth confirming before trusting the scores.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', SVC(kernel='rbf')),
])


### Perform Cross-Validation

In [23]:
NUMBER_OF_FOLDS = 10

# One weighted-average scorer per metric; the result keys read below are these
# names prefixed with "test_".
scorers = {
    name: make_scorer(metric, average='weighted')
    for name, metric in (
        ("f1_scores", f1_score),
        ("precision_scores", precision_score),
        ("recall_scores", recall_score),
    )
}
scores = cross_validate(
    pipeline, X, y,
    cv=NUMBER_OF_FOLDS,
    scoring=scorers,
    n_jobs=-1,
)

f1_scores = scores['test_f1_scores']
precision_scores = scores['test_precision_scores']
recall_scores = scores['test_recall_scores']

# Per-fold report, one entry per cross-validation fold.
per_fold = zip(precision_scores, recall_scores, f1_scores)
for fold, (precision, recall, f1) in enumerate(per_fold):
    print("Fold number: ", fold)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)
    print("\n")

# Summary: arithmetic mean of each metric over all folds.
print("Averages Across Folds")
print("Precision: ", np.mean(precision_scores))
print("Recall: ", np.mean(recall_scores))
print("F1 Score: ", np.mean(f1_scores))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold number:  0
Precision:  0.627093065038
Recall:  0.595041322314
F1 Score:  0.483419072696


Fold number:  1
Precision:  0.56128309287
Recall:  0.570247933884
F1 Score:  0.436515820845


Fold number:  2
Precision:  0.561853448276
Recall:  0.575
F1 Score:  0.448089976322


Fold number:  3
Precision:  0.628244837758
Recall:  0.6
F1 Score:  0.493966368053


Fold number:  4
Precision:  0.620186781609
Recall:  0.575
F1 Score:  0.446281414513


Fold number:  5
Precision:  0.587971230159
Recall:  0.6
F1 Score:  0.493580020699


Fold number:  6
Precision:  0.625511695906
Recall:  0.591666666667
F1 Score:  0.479731791848


Fold number:  7
Precision:  0.497270114943
Recall:  0.566666666667
F1 Score:  0.43332807156


Fold number:  8
Precision:  0.638986905042
Recall:  0.623931623932
F1 Score:  0.526742354329


Fold number:  9
Precision:  0.575977611376
Recall:  0.589743589744
F1 Score:  0.466074646511


Averages Across Folds
Precision:  0.592437878298
Recall:  0.588729780321
F1 Score:  0.470772