In [1]:
# Multi-class classification
# Feature extraction using TF-IDF
# I have followed a nouns-only approach, so that only the nouns from each article are used to form the TF-IDF vectors.
# The nouns-only approach tends to give better results than standard TF-IDF.
# The performance of various classifiers is compared.

In [21]:
import codecs
import re
from os import listdir

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

#feature extraction
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score

# sklearn.cross_validation was removed in scikit-learn 0.20; the same helpers
# live in sklearn.model_selection (available since 0.18). The old module is
# kept as a fallback so the notebook still runs on legacy installs.
try:
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import train_test_split

#classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [9]:
#data preprocessing
# Root folder with one sub-folder per category; every file inside a sub-folder
# is read as one article. Adjust this path to your local copy of the data.
loc='C:/Users/Ruchita/Desktop/document classification/train_data'

# Token after which an article's text is discarded. Two spellings are accepted:
# the real "[…]" and the form it takes when its UTF-8 bytes were decoded as
# cp1252 ("[â€¦]"), which is the spelling this notebook originally searched for.
TRUNCATION_MARKERS=(u'[\u2026]', u'[\xe2\u20ac\xa6]')

# Sorted so that the document order does not depend on the file system.
# 'non_protest' is left out; if the folder also holds stray files such as
# README.TXT, exclude them here as well.
categories=[c for c in sorted(listdir(loc)) if c!='non_protest']
print(categories)

wnl=WordNetLemmatizer()
stop=set(stopwords.words('english'))


def _strip_truncated_tail(tokens):
    """Return `tokens` cut just before the first truncation marker, if any."""
    cut_points=[tokens.index(m) for m in TRUNCATION_MARKERS if m in tokens]
    return tokens[:min(cut_points)] if cut_points else tokens


def _nouns_only_text(raw):
    """Reduce raw article text to a space-joined string of lemmatized nouns.

    Steps: whitespace tokenization, truncation at the "[…]" marker,
    POS tagging, keeping tokens whose tag starts with 'NN', lower-casing,
    dropping every character outside a-z/space, stopword removal and
    WordNet lemmatization.
    """
    tokens=_strip_truncated_tail(raw.strip().split())
    # only include nouns from sentences for making tfidf vectors
    nouns=[word for (word,tag) in pos_tag(tokens) if tag.startswith('NN')]
    # removal of punctuations (and anything else that is not a-z or a space)
    cleaned=re.sub(r'[^a-z ]','',' '.join(nouns).lower())
    # removal of stopwords; the surviving words are lemmatized
    return ' '.join(wnl.lemmatize(u) for u in cleaned.split() if u not in stop)


docs=[]
labels=[]
for ct in categories:
    floc=loc+"/"+ct
    for fname in sorted(listdir(floc)):
        # `with` closes the file even if reading fails. The explicit encoding
        # makes the result independent of the platform default, and undecodable
        # bytes are replaced (they are stripped by the a-z filter anyway).
        with codecs.open(floc+"/"+fname,'r',encoding='utf-8',errors='replace') as fh:
            raw_text=fh.read()
        docs.append(_nouns_only_text(raw_text))
        labels.append(ct)

# One row per article: cleaned noun string plus the folder name as its label.
all_data=pd.DataFrame({'Doc':docs,'label':labels},columns=['Doc','label'])
all_data.head()

['boycott', 'demonstrations', 'hunger_strike', 'lawsuit', 'non_protest1', 'riots']


Unnamed: 0,Doc,label
0,representative trondheim norway city good serv...,boycott
1,bd committee thursday evening news un human ri...,boycott
2,man israeli force thursday morning guard park ...,boycott
3,khader adnan prisoner hunger striker release d...,boycott
4,israel eu plan settlement product eu member st...,boycott


In [10]:
# Shuffle the rows with a fixed seed so the row order, and everything
# downstream that depends on it, is reproducible from a fresh kernel.
SHUFFLE_SEED=0
shuffled_order=np.random.RandomState(SHUFFLE_SEED).permutation(len(all_data))
all_data=all_data.iloc[shuffled_order].reset_index(drop=True)

# x: cleaned article text, y: category label of each article.
x=all_data['Doc']
y=all_data['label']

# Bag-of-words counts followed by TF-IDF weighting.
# NOTE(review): both transformers are fitted on every document, i.e. before any
# cross-validation split is made, so the vocabulary and IDF weights have seen
# the held-out folds. Fitting them inside a scikit-learn Pipeline together
# with the classifier would avoid that.
count_vect = CountVectorizer()
x_counts = count_vect.fit_transform(x)
tfidf_transformer = TfidfTransformer()
x_tfidf = tfidf_transformer.fit_transform(x_counts)

## Training with various classifiers (k-fold cross-validation)

### Multinomial Naive Bayes

In [13]:
# Multinomial Naive Bayes on the TF-IDF matrix, scored with 10-fold
# cross-validation (one score per fold).
clfMNB = MultinomialNB()
scoresMNB = cross_val_score(clfMNB, x_tfidf, y, cv=10)
# Bare last expression instead of print(): the notebook renders the array
# repr, the same way the other classifier cells display their scores.
scoresMNB

array([ 0.8028169 ,  0.73239437,  0.77142857,  0.81428571,  0.82857143,
        0.85507246,  0.76119403,  0.81818182,  0.78787879,  0.78461538])

### Decision Tree Classifier

In [30]:
# Decision tree on the TF-IDF matrix, scored with 10-fold cross-validation.
# random_state is fixed so that re-running the cell gives the same scores.
clfDT = DecisionTreeClassifier(random_state=0)
scoresDT = cross_val_score(clfDT, x_tfidf, y, cv=10)
scoresDT

array([ 0.85915493,  0.94366197,  0.9       ,  0.92857143,  0.97142857,
        0.91304348,  0.8358209 ,  0.89393939,  0.89393939,  0.93846154])

### Random Forest Classifier

In [31]:
# Random forest on the TF-IDF matrix, scored with 10-fold cross-validation.
# random_state is fixed so that re-running the cell gives the same scores.
clfRF = RandomForestClassifier(random_state=0)
scoresRF = cross_val_score(clfRF, x_tfidf, y, cv=10)
scoresRF

array([ 0.87323944,  0.91549296,  0.87142857,  0.92857143,  0.95714286,
        0.94202899,  0.85074627,  0.87878788,  0.87878788,  0.87692308])

### Gaussian Naive Bayes

In [27]:
# Gaussian Naive Bayes, scored with 10-fold cross-validation.
# This estimator is given a dense copy of the TF-IDF matrix (.toarray()),
# whereas the other classifier cells pass the matrix as it is.
clfGNB = GaussianNB()
scoresGNB = cross_val_score(
    estimator=clfGNB,
    X=x_tfidf.toarray(),
    y=y,
    cv=10,
)
scoresGNB

array([ 0.71830986,  0.71830986,  0.58571429,  0.77142857,  0.75714286,
        0.79710145,  0.76119403,  0.63636364,  0.77272727,  0.70769231])

### K-Nearest Neighbors Classifier

In [26]:
# k-nearest-neighbours classifier with its default settings, scored with
# 10-fold cross-validation on the TF-IDF matrix (one score per fold).
clfK = KNeighborsClassifier()
scoresK = cross_val_score(
    estimator=clfK,
    X=x_tfidf,
    y=y,
    cv=10,
)
scoresK

array([ 0.71830986,  0.69014085,  0.82857143,  0.82857143,  0.82857143,
        0.7826087 ,  0.74626866,  0.6969697 ,  0.81818182,  0.75384615])