Project: Build a Classifer to Determine Whether a Text Was Written by Scientists or Ethicists

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
#NOTE: Text has not been lemmatized

meta = pd.read_csv('metadata.csv')
meta['eth_or_sci'] = [0 if r=='eth' else 1 if r=='sci' else 2 for r in meta['sub-type']]
meta['raw'] = [open(x).read() for x in meta['file_name']]
meta['no_nums'] = [re.sub('\d+', ' ', y) for y in meta['raw']]
meta['clean'] = [re.sub('\W+', ' ', a).lower() for a in meta['no_nums']]
meta['tokens'] = [nltk.word_tokenize(b) for b in meta['clean']]

meta.head()

Unnamed: 0,type,sub-type,year,country,author,title,word_count,file_name,eth_or_sci,raw,no_nums,clean,tokens
0,gov,unclear,2015,international,international summit on human genome editing,statement of the first summit on human genome ...,957,First Summit Statement.txt,2,On Human Gene Editing:\nInternational Summit S...,On Human Gene Editing:\nInternational Summit S...,on human gene editing international summit sta...,"[on, human, gene, editing, international, summ..."
1,gov,eth,2015,europe,council of europe committee on bioethics,statement on genome editing technologies,626,Council of Europe Bioethics.txt,0,"\n\n \n\n \n\nStrasbourg, 2 December 2015 DH-...","\n\n \n\n \n\nStrasbourg, December DH-BIO...",strasbourg december dh bio inf coe logo fil b...,"[strasbourg, december, dh, bio, inf, coe, logo..."
2,gov,unclear,2015,germany,berlin-brandenburg academy of sciences and hum...,human genome surgery – towards a responsible e...,6502,Brandenburg.txt,2,HUMAN GENOME SURGERY � \nTOWARDS A RESPONSIBLE...,HUMAN GENOME SURGERY � \nTOWARDS A RESPONSIBLE...,human genome surgery towards a responsible eva...,"[human, genome, surgery, towards, a, responsib..."
3,gov,sci,2015,germany,german national academy of sciences,statement. The opportunities and limits of gen...,4761,Leopoldina.txt,1,The opportunities and limits of genome editinT...,The opportunities and limits of genome editinT...,the opportunities and limits of genome editint...,"[the, opportunities, and, limits, of, genome, ..."
4,gov,eth,2015,international,unesco international bioethics committee,report of the ibc on updating its reflection o...,17046,UNESCO.txt,0,\n\nDistribution: limited \n\nSHS/YES/IBC-22/...,\n\nDistribution: limited \n\nSHS/YES/IBC- / ...,distribution limited shs yes ibc rev paris oc...,"[distribution, limited, shs, yes, ibc, rev, pa..."


In [3]:
def build_sentences(type):
    sentences_list = []
    if type == 'eth': 
        eth_vals = meta[meta['eth_or_sci'] == 0].reset_index()
        for index in range(len(eth_vals)):
            sentences = nltk.sent_tokenize(eth_vals['no_nums'][index])
            for sentence in sentences: 
                sentence_clean = re.sub('\W+', ' ', sentence).lower()
                sentences_list.append(sentence_clean)
        return sentences_list
    if type == 'sci': 
        sci_vals = meta[meta['eth_or_sci'] == 1].reset_index()
        for index in range(len(sci_vals)):
            sentences = nltk.sent_tokenize(sci_vals['no_nums'][index])
            for sentence in sentences: 
                sentence_clean = re.sub('\W+', ' ', sentence).lower()
                sentences_list.append(sentence_clean)
        return sentences_list

eth_sentences = build_sentences('eth')
sci_sentences = build_sentences('sci')
X = eth_sentences + sci_sentences
Y = [0] * len(eth_sentences) + [1] * len(sci_sentences)
train_x, test_x, train_y, test_y = model_selection.train_test_split(X, Y, test_size=0.2)

print(len(train_x), len(test_x), len(train_y), len(test_y))

17580 4396 17580 4396


In [6]:
text_file = open("gov_eth_sentences.txt", "wt")
n = text_file.write('\n'.join(eth_sentences))
text_file.close()

text_file = open('gov_sci_sentences.txt', 'wt')
n = text_file.write('\n'.join(sci_sentences))
text_file.close()

In [29]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(meta['clean'])

Train_X_Tfidf = Tfidf_vect.transform(train_x)
Test_X_Tfidf = Tfidf_vect.transform(test_x)

print('\n\n\n\n\n\n')










In [32]:
#Using Naive Bayes Classifier

Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, train_y)
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, test_y)*100)


#Using Support Vector Machine

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, train_y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, test_y)*100)

Naive Bayes Accuracy Score ->  76.52411282984531
SVM Accuracy Score ->  77.88898999090081
