# Imports

In [1]:
import nltk
from nltk.corpus import stopwords
nltk.download('names')
from nltk.corpus import names
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


# Exploring Names Corpus

In [2]:
def gender_features(word):
    return{'last_letter':word[-1]}

In [3]:
print(len(names.words()), names.words()[50:100])

7944 ['Aeriell', 'Ag', 'Agace', 'Agata', 'Agatha', 'Agathe', 'Aggi', 'Aggie', 'Aggy', 'Agna', 'Agnella', 'Agnes', 'Agnese', 'Agnesse', 'Agneta', 'Agnola', 'Agretha', 'Aida', 'Aidan', 'Aigneis', 'Aila', 'Aile', 'Ailee', 'Aileen', 'Ailene', 'Ailey', 'Aili', 'Ailina', 'Ailyn', 'Aime', 'Aimee', 'Aimil', 'Aina', 'Aindrea', 'Ainslee', 'Ainsley', 'Ainslie', 'Ajay', 'Alaine', 'Alameda', 'Alana', 'Alanah', 'Alane', 'Alanna', 'Alayne', 'Alberta', 'Albertina', 'Albertine', 'Albina', 'Alecia']


In [4]:
labeled_names = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]
print(labeled_names[:50])

[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male'), ('Abdullah', 'male'), ('Abe', 'male'), ('Abel', 'male'), ('Abelard', 'male'), ('Abner', 'male'), ('Abraham', 'male'), ('Abram', 'male'), ('Ace', 'male'), ('Adair', 'male'), ('Adam', 'male'), ('Adams', 'male'), ('Addie', 'male'), ('Adger', 'male'), ('Aditya', 'male'), ('Adlai', 'male'), ('Adnan', 'male'), ('Adolf', 'male'), ('Adolfo', 'male'), ('Adolph', 'male'), ('Adolphe', 'male'), ('Adolpho', 'male'), ('Adolphus', 'male'), ('Adrian', 'male'), ('Adrick', 'male'), ('Adrien', 'male'), ('Agamemnon', 'male'), ('Aguinaldo', 'male'), ('Aguste', 'male'), ('Agustin', 'male'), ('Aharon', 'male'), ('Ahmad', 'male'), ('Ahmed', 'male'), ('Ahmet', 'male'), ('Ajai', 'male'), ('Ajay', 'male'), ('Al', 'male'), ('Alaa', 'male'), ('Alain', 'male'), ('Alan', 'male'), ('Alasdair', 'male')]


In [5]:
random.shuffle(labeled_names)
print(labeled_names[:50])

[('Keefe', 'male'), ('Fowler', 'male'), ('Willie', 'female'), ('Zechariah', 'male'), ('Deanna', 'female'), ('Whittaker', 'male'), ('Roddy', 'male'), ('Joslyn', 'female'), ('Carolie', 'female'), ('Rudyard', 'male'), ('Ephraim', 'male'), ('Pauly', 'female'), ('Teddi', 'female'), ('Danni', 'female'), ('Teresa', 'female'), ('Tyne', 'female'), ('Florry', 'female'), ('Idelle', 'female'), ('Pattie', 'female'), ('Harwell', 'male'), ('Niels', 'male'), ('Gustavo', 'male'), ('Phaidra', 'female'), ('Nikolia', 'female'), ('Ottilie', 'female'), ('Demetra', 'female'), ('Sadie', 'female'), ('Lizette', 'female'), ('Judas', 'male'), ('Mercedes', 'female'), ('Teodoro', 'male'), ('Waldo', 'male'), ('Esmerelda', 'female'), ('Dorolice', 'female'), ('Lyle', 'male'), ('Moyra', 'female'), ('Amberly', 'female'), ('Jessika', 'female'), ('Sunny', 'male'), ('Glad', 'female'), ('Martainn', 'male'), ('Aurea', 'female'), ('Ronny', 'male'), ('Mair', 'female'), ('Florian', 'male'), ('Clareta', 'female'), ('Lindsay', 'm

In [6]:
featureset = [(gender_features(n),gender) for (n,gender) in labeled_names]
print(featureset[:50])

[({'last_letter': 'e'}, 'male'), ({'last_letter': 'r'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'h'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'r'}, 'male'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'n'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'd'}, 'male'), ({'last_letter': 'm'}, 'male'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'i'}, 'female'), ({'last_letter': 'i'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 's'}, 'male'), ({'last_letter': 'o'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 's'}, 'male'), ({'last_letter': 's'}, 'female'), (

In [7]:
train_set,test_set = featureset[:5500],featureset[5501:]
print(test_set[:50])

[({'last_letter': 'y'}, 'female'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'p'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'd'}, 'male'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 's'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'd'}, 'male'), ({'last_letter': 'n'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'o'}, 'male'), ({'last_letter': 'l'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'

# Using NLTK Naive Bayes to classify names corpus

In [8]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features('Mahavir'))

'male'

In [9]:
classifier.classify(gender_features('Zil'))

'male'

In [10]:
classifier.classify(gender_features('Bhavya'))

'female'

In [11]:
nltk.classify.accuracy(classifier,test_set)

0.7560376586164552

# Exploring sklearn vectorizer and similarity functions

In [12]:
corpus =  ["Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.",
           '''Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, a task that involves the automated interpretation and generation of natural language, but at the time not articulated as a problem separate from artificial intelligence.''']
vect = CountVectorizer(binary=True).fit(corpus)

In [13]:
vocab = vect.vocabulary_
print(vocab)

{'natural': 39, 'language': 35, 'processing': 47, 'nlp': 40, 'is': 33, 'subfield': 54, 'of': 43, 'linguistics': 37, 'computer': 17, 'science': 52, 'and': 7, 'artificial': 10, 'intelligence': 29, 'concerned': 20, 'with': 65, 'the': 58, 'interactions': 30, 'between': 14, 'computers': 18, 'human': 27, 'in': 28, 'particular': 44, 'how': 26, 'to': 61, 'program': 48, 'process': 46, 'analyze': 6, 'large': 36, 'amounts': 4, 'data': 22, 'has': 25, 'its': 34, 'roots': 51, '1950s': 1, 'already': 3, '1950': 0, 'alan': 2, 'turing': 62, 'published': 50, 'an': 5, 'article': 8, 'titled': 60, 'computing': 19, 'machinery': 38, 'which': 64, 'proposed': 49, 'what': 63, 'now': 42, 'called': 16, 'test': 56, 'as': 11, 'criterion': 21, 'task': 55, 'that': 57, 'involves': 32, 'automated': 13, 'interpretation': 31, 'generation': 24, 'but': 15, 'at': 12, 'time': 59, 'not': 41, 'articulated': 9, 'problem': 45, 'separate': 53, 'from': 23}


In [14]:
vect.transform(["The history of natural language processing describes the advances of natural language processing. There is some overlap with the history of machine translation, the history of speech recognition, and the history of artificial intelligence."]).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]])

In [15]:
corpus2 = ["Mahavir Gala can sit in front of a pc for 20 hours a day.","Mahavir Gala cannot sit in front of a TV for 15 minutes a day"]
vect2 = CountVectorizer(binary=True).fit_transform(corpus2)
vect2 = vect2.toarray()
similarity = cosine_similarity(vect2)
print(similarity[0][1])

0.6666666666666669
