# Text/Document Classification - Gender Identification

In [1]:
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Shrek')

{'last_letter': 'k'}

In [2]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])
print("Total number of records:",len(labeled_names))
import random
random.shuffle(labeled_names)

Total number of records: 7944


In [3]:
import nltk
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]

In [4]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Neo is a", classifier.classify(gender_features('Neo')))
print("Annie is a", classifier.classify(gender_features('Annie')))
print("\nThe accuracy is equal to: ", nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)

Neo is a male
Annie is a female

The accuracy is equal to:  0.76
Most Informative Features
             last_letter = 'a'            female : male   =     35.7 : 1.0
             last_letter = 'k'              male : female =     31.7 : 1.0
             last_letter = 'f'              male : female =     17.3 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0


In [5]:
from nltk.classify import apply_features
train_set_2 = apply_features(gender_features, labeled_names[2000:])
test_set_2 = apply_features(gender_features, labeled_names[:2000])
classifier_2 = nltk.NaiveBayesClassifier.train(train_set_2)
print("Neo is a", classifier_2.classify(gender_features('Neo')))
print("Annie is a", classifier_2.classify(gender_features('Annie')))
print("\nThe accuracy is equal to: ", nltk.classify.accuracy(classifier_2, test_set_2))
classifier_2.show_most_informative_features(5)

Neo is a male
Annie is a female

The accuracy is equal to:  0.752
Most Informative Features
             last_letter = 'a'            female : male   =     32.0 : 1.0
             last_letter = 'k'              male : female =     22.9 : 1.0
             last_letter = 'f'              male : female =     16.2 : 1.0
             last_letter = 'p'              male : female =     12.1 : 1.0
             last_letter = 'm'              male : female =     10.1 : 1.0


In [6]:
# Using Sklearn package to split the dataset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from sklearn.model_selection import train_test_split
train, test = train_test_split(featuresets, test_size=0.2, random_state=123)
len(train)
len(test)

# Naive Bayes with NLTK
import nltk
classifier_NB = nltk.NaiveBayesClassifier.train(train)
print("Neo is a", classifier_NB.classify(gender_features('Neo')))
print("Annie is a", classifier_NB.classify(gender_features('Annie')))
print("\nThe accuracy of NB is equal to: ", nltk.classify.accuracy(classifier_NB, test))
classifier_NB.show_most_informative_features(20)


Neo is a male
Annie is a female

The accuracy of NB is equal to:  0.7652611705475142
Most Informative Features
             last_letter = 'k'              male : female =     36.0 : 1.0
             last_letter = 'a'            female : male   =     34.1 : 1.0
             last_letter = 'f'              male : female =     15.1 : 1.0
             last_letter = 'u'              male : female =     12.9 : 1.0
             last_letter = 'p'              male : female =      9.8 : 1.0
             last_letter = 'v'              male : female =      9.8 : 1.0
             last_letter = 'd'              male : female =      9.3 : 1.0
             last_letter = 'o'              male : female =      9.3 : 1.0
             last_letter = 'w'              male : female =      8.4 : 1.0
             last_letter = 'm'              male : female =      7.7 : 1.0
             last_letter = 'r'              male : female =      6.5 : 1.0
             last_letter = 'z'              male : female =     

In [7]:
# Decision Tree with NLTK
classifier_DT = nltk.classify.DecisionTreeClassifier.train(train, entropy_cutoff=0, support_cutoff=0)
print("Neo is a", classifier_DT.classify(gender_features('Neo')))
print("Annie is a", classifier_DT.classify(gender_features('Annie')))
print("\nThe accuracy of DT is equal to: ", nltk.classify.accuracy(classifier_DT, test))
print(classifier_DT)

Neo is a male
Annie is a female

The accuracy of DT is equal to:  0.7652611705475142
last_letter= ? ........................................ female
last_letter=a? ........................................ female
last_letter=b? ........................................ male
last_letter=c? ........................................ male
last_letter=d? ........................................ male
last_letter=e? ........................................ female
last_letter=f? ........................................ male
last_letter=g? ........................................ male
last_letter=h? ........................................ female
last_letter=i? ........................................ female
last_letter=j? ........................................ male
last_letter=k? ........................................ male
last_letter=l? ........................................ male
last_letter=m? ........................................ male
last_letter=n? ....................................

In [8]:
# SklearnClassifier with NLTK
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
classifier_SKLC = SklearnClassifier(BernoulliNB()).train(train)
print("Neo is a", classifier_SKLC.classify(gender_features('Neo')))
print("Annie is a", classifier_SKLC.classify(gender_features('Annie')))
print("\nThe accuracy of SKLC is equal to: ", nltk.classify.accuracy(classifier_SKLC, test))


Neo is a male
Annie is a female

The accuracy of SKLC is equal to:  0.762114537444934


In [9]:
# SVM with NLTK
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
classifier_SVC = SklearnClassifier(SVC(), sparse=False).train(train)
print("Neo is a", classifier_SVC.classify(gender_features('Neo')))
print("Annie is a", classifier_SVC.classify(gender_features('Annie')))
print("\nThe accuracy of SVC is equal to: ", nltk.classify.accuracy(classifier_SVC, test))


Neo is a male
Annie is a female

The accuracy of SVC is equal to:  0.7652611705475142


In [10]:
# MAXENT with NLTK
from nltk.classify import maxent
encoding = maxent.TypedMaxentFeatureEncoding.train(train, count_cutoff=3, alwayson_features=True)
classifier_MAX = maxent.MaxentClassifier.train(train, bernoulli=False, encoding=encoding, trace=0)
print("Neo is a", classifier_MAX.classify(gender_features('Neo')))
print("Annie is a", classifier_MAX.classify(gender_features('Annie')))
print("\nThe accuracy of MAXENT is equal to: ", nltk.classify.accuracy(classifier_MAX, test))
classifier_MAX.show_most_informative_features(20)


Neo is a male
Annie is a female

The accuracy of MAXENT is equal to:  0.7652611705475142
   5.900 last_letter=='c' and label is 'male'
  -5.020 last_letter=='a' and label is 'male'
  -4.329 last_letter=='k' and label is 'female'
  -3.021 last_letter=='u' and label is 'female'
  -3.021 last_letter=='f' and label is 'female'
  -2.368 last_letter=='v' and label is 'female'
  -2.368 last_letter=='p' and label is 'female'
   2.274 last_letter=='z' and label is 'male'
  -2.152 last_letter=='w' and label is 'female'
  -2.057 last_letter=='d' and label is 'female'
  -2.056 last_letter=='o' and label is 'female'
  -1.825 last_letter=='m' and label is 'female'
  -1.798 last_letter=='i' and label is 'male'
  -1.577 last_letter=='r' and label is 'female'
  -1.466 last_letter=='b' and label is 'female'
  -1.274 last_letter=='j' and label is 'female'
  -1.238 last_letter=='g' and label is 'female'
  -1.008 last_letter=='t' and label is 'female'
  -1.004 last_letter=='s' and label is 'female'
  -0.92

In [11]:
# Naive Bayes from textblob
from textblob.classifiers import NaiveBayesClassifier
classifier_TB = NaiveBayesClassifier(train)
print("Neo is a", classifier_TB.classify(gender_features('Neo')))
print("Annie is a", classifier_TB.classify(gender_features('Annie')))
print("\nThe accuracy of TB_NB is equal to: ", classifier_TB.accuracy(test))


Neo is a female
Annie is a female

The accuracy of TB_NB is equal to:  0.6375078665827565


In [12]:
# Overall Accuracies - Using Sklearn package to split the dataset
print("~~~~~~~~~~~~~~~~~~ ACCURACY VALUES ~~~~~~~~~~~~~~~~~~~~")
print("\nThe accuracy of NB  is equal to: ", nltk.classify.accuracy(classifier_NB, test))
print("\nThe accuracy of DT is equal to: ", nltk.classify.accuracy(classifier_DT, test))
print("\nThe accuracy of SKLC is equal to: ", nltk.classify.accuracy(classifier_SKLC, test))
print("\nThe accuracy of SVC is equal to: ", nltk.classify.accuracy(classifier_SVC, test))
print("\nThe accuracy of MAXENT is equal to: ", nltk.classify.accuracy(classifier_MAX, test))
print("\nThe accuracy of TB_NB is equal to: ", classifier_TB.accuracy(test))


~~~~~~~~~~~~~~~~~~ ACCURACY VALUES ~~~~~~~~~~~~~~~~~~~~

The accuracy of NB  is equal to:  0.7652611705475142

The accuracy of DT is equal to:  0.7652611705475142

The accuracy of SKLC is equal to:  0.762114537444934

The accuracy of SVC is equal to:  0.7652611705475142

The accuracy of MAXENT is equal to:  0.7652611705475142

The accuracy of TB_NB is equal to:  0.6375078665827565


# Text/Document Classification (Sentiment Analysis - Movie Review)

In [13]:
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

len(documents)

2000

In [14]:
for row in range(0, len(documents)-1):
    print(documents[row][1])
    
random.shuffle(documents)
print(documents[1])

neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg


(['apocalypse', 'now', ',', 'based', 'on', 'the', 'novel', '"', 'hearts', 'of', 'darkness', '"', ',', 'is', 'an', 'extremely', 'striking', ',', 'gripping', ',', 'and', 'horrifying', 'depiction', 'of', 'the', 'vietnam', 'war', 'from', 'another', 'angle', '.', 'a', 'long', 'debate', 'has', 'surrounded', 'this', 'movie', 'as', 'if', 'it', 'is', 'actually', 'an', 'anti', '-', 'war', 'film', '.', 'in', 'many', 'ways', ',', 'this', 'debate', 'could', 'go', 'either', 'way', '.', 'apocalypse', 'now', 'is', 'probably', 'one', 'of', 'the', 'most', 'memorable', 'vietnam', 'war', 'films', 'ever', 'made', '.', 'in', 'addition', ',', 'now', 'film', 'has', 'gone', 'to', 'the', 'extremes', 'that', 'this', 'film', 'does', '-', 'a', 'disturbing', 'look', 'at', 'the', 'corruption', 'and', 'terrifying', 'effects', 'of', 'the', 'most', 'devastating', 'war', 'of', 'recent', 'years', '.', 'apocalypse', 'now', 'follows', 'the', 'mission', 'captain', 'benjamin', 'l', '.', 'willard', ',', 'played', 'by', 'marti

In [15]:
all_words = []
for w in movie_reviews.words():
    wl = w.lower()
    all_words.append(wl)

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [16]:
print(all_words["bad"])
print(all_words["good"])
print(all_words["excellent"])

1395
2411
184


In [17]:
word_features = list(all_words)[:2000]
print(word_features)

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', 'one', 'of', 'the', 'guys', 'dies', 'but', 'his', 'girlfriend', 'continues', 'see', 'him', 'in', 'her', 'life', 'has', 'nightmares', 'what', "'", 's', 'deal', '?', 'watch', 'movie', '"', 'sorta', 'find', 'out', 'critique', 'mind', '-', 'fuck', 'for', 'generation', 'that', 'touches', 'on', 'very', 'cool', 'idea', 'presents', 'it', 'bad', 'package', 'which', 'is', 'makes', 'this', 'review', 'even', 'harder', 'write', 'since', 'i', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'with', 'your', 'head', 'such', '(', 'lost', 'highway', '&', 'memento', ')', 'there', 'are', 'good', 'ways', 'making', 'all', 'types', 'these', 'folks', 'just', 'didn', 't', 'snag', 'correctly', 'seem', 'have', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'so', 'problems', 'well', 'its', 'main', 'problem', 'simply', 'to

In [18]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [19]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

{'plot': True, ':': True, 'two': True, 'teen': True, 'couples': True, 'go': True, 'to': True, 'a': True, 'church': True, 'party': True, ',': True, 'drink': True, 'and': True, 'then': True, 'drive': True, '.': True, 'they': True, 'get': True, 'into': True, 'an': True, 'accident': True, 'one': True, 'of': True, 'the': True, 'guys': True, 'dies': True, 'but': True, 'his': True, 'girlfriend': True, 'continues': True, 'see': True, 'him': True, 'in': True, 'her': True, 'life': True, 'has': True, 'nightmares': True, 'what': True, "'": True, 's': True, 'deal': True, '?': True, 'watch': True, 'movie': True, '"': True, 'sorta': True, 'find': True, 'out': True, 'critique': True, 'mind': True, '-': True, 'fuck': True, 'for': True, 'generation': True, 'that': True, 'touches': True, 'on': True, 'very': True, 'cool': True, 'idea': True, 'presents': True, 'it': True, 'bad': True, 'package': True, 'which': True, 'is': True, 'makes': True, 'this': True, 'review': True, 'even': True, 'harder': True, 'wri




In [20]:
featureset = [(find_features(rev), category) for (rev, category) in documents]
len(featureset)
featureset[2]

train_set = featureset[100:]
len(train_set)

test_set = featureset[:100]
len(test_set)

100

In [21]:
# Naive Bayes with NLTK
classifier_NB = nltk.NaiveBayesClassifier.train(train_set)
print("\nThe accuracy of NB is equal to: ", nltk.classify.accuracy(classifier_NB, test_set))
classifier_NB.show_most_informative_features(30)
sorted(classifier_NB.labels())

print("cunning feature is for", classifier_NB.classify(find_features('shower')))
print("unimaginative feature is for", classifier_NB.classify(find_features('unimaginative')))


The accuracy of NB is equal to:  0.8
Most Informative Features
           unimaginative = True              neg : pos    =      8.5 : 1.0
              schumacher = True              neg : pos    =      7.5 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
               atrocious = True              neg : pos    =      6.7 : 1.0
                  turkey = True              neg : pos    =      6.7 : 1.0
                 singers = True              pos : neg    =      6.2 : 1.0
                  justin = True              neg : pos    =      5.9 : 1.0
                 jumbled = True              neg : pos    =      5.8 : 1.0
                 bronson = True              neg : pos    =      5.8 : 1.0
               underwood = True              neg : pos    =      5.8 : 1.0
                  canyon = True     

In [22]:
# Decision Tree with NLTK
classifier_DT = nltk.classify.DecisionTreeClassifier.train(train_set, entropy_cutoff=0, support_cutoff=0)
print("\nThe accuracy of DT is equal to: ", nltk.classify.accuracy(classifier_DT, test_set))
print(classifier_DT)


The accuracy of DT is equal to:  0.62
bad=False? ............................................ pos
  boring=False? ....................................... pos
    waste=False? ...................................... pos
      mess=False? ..................................... pos
      mess=True? ...................................... neg
    waste=True? ....................................... neg
      whether=False? .................................. neg
      whether=True? ................................... pos
  boring=True? ........................................ pos
    deal=False? ....................................... neg
      memorable=False? ................................ neg
      memorable=True? ................................. pos
    deal=True? ........................................ pos
      father=False? ................................... pos
      father=True? .................................... neg
bad=True? ............................................. neg
 

In [23]:
# Sklearn Classifier with NLTK
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
classifier_SKLC = SklearnClassifier(BernoulliNB()).train(train_set)
print("\nThe accuracy of SKLC is equal to: ", nltk.classify.accuracy(classifier_SKLC, test_set))


The accuracy of SKLC is equal to:  0.81


In [24]:
# SVM with NLTK
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
classifier_SVC = SklearnClassifier(SVC(), sparse=False).train(train_set)
print("\nThe accuracy of SVC is equal to: ", nltk.classify.accuracy(classifier_SVC, test_set))


The accuracy of SVC is equal to:  0.84


In [25]:
# MAXENT with NLTK
from nltk.classify import maxent
encoding = maxent.TypedMaxentFeatureEncoding.train(train_set, count_cutoff=3, alwayson_features=True)
classifier_MAX = maxent.MaxentClassifier.train(train_set, bernoulli=False, encoding=encoding, trace=0)
print("\nThe accuracy of MAXENT is equal to: ", nltk.classify.accuracy(classifier_MAX, test_set))
classifier_MAX.show_most_informative_features(20)


The accuracy of MAXENT is equal to:  0.41
    -inf two==True and label is 'neg'
    -inf teen==False and label is 'neg'
    -inf couples==False and label is 'neg'
    -inf to==True and label is 'neg'
    -inf a==True and label is 'neg'
    -inf church==False and label is 'neg'
    -inf party==False and label is 'neg'
    -inf ,==True and label is 'neg'
    -inf drink==False and label is 'neg'
    -inf and==True and label is 'neg'
    -inf drive==False and label is 'neg'
    -inf .==True and label is 'neg'
    -inf get==False and label is 'neg'
    -inf into==True and label is 'neg'
    -inf of==True and label is 'neg'
    -inf the==True and label is 'neg'
    -inf guys==False and label is 'neg'
    -inf dies==False and label is 'neg'
    -inf his==True and label is 'neg'
    -inf girlfriend==False and label is 'neg'


In [26]:
# Overall Accuracies - Using Movie Review dataset and NLTK package
print("~~~~~~~~~~~~~~~~~~ ACCURACY VALUES ~~~~~~~~~~~~~~~~~~~~")
print("\nThe accuracy of NB  is equal to: ", nltk.classify.accuracy(classifier_NB, test_set))
print("\nThe accuracy of DT is equal to: ", nltk.classify.accuracy(classifier_DT, test_set))
print("\nThe accuracy of SKLC is equal to: ", nltk.classify.accuracy(classifier_SKLC, test_set))
print("\nThe accuracy of SVC is equal to: ", nltk.classify.accuracy(classifier_SVC, test_set))
print("\nThe accuracy of MAXENT is equal to: ", nltk.classify.accuracy(classifier_MAX, test_set))


~~~~~~~~~~~~~~~~~~ ACCURACY VALUES ~~~~~~~~~~~~~~~~~~~~

The accuracy of NB  is equal to:  0.8

The accuracy of DT is equal to:  0.62

The accuracy of SKLC is equal to:  0.81

The accuracy of SVC is equal to:  0.84

The accuracy of MAXENT is equal to:  0.41
