In [1]:
import nltk

In [2]:
from nltk.corpus import brown

In [3]:
# Steps:

# Get the data/corpus
# Tokenisation/Stopword Removal
# Stemming
# Building a vocab
# Vectorization
# Classification

In [3]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
len(brown.categories())

15

In [5]:
data=brown.sents(categories='humor')

In [6]:
len(data)

1053

In [7]:
' '.join(data[0])

'It was among these that Hinkle identified a photograph of Barco ! !'

In [8]:
## Tokenization

In [9]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [10]:
document="""It was a very pleasant day. The weather was cool and there were light showers. I went to the market to buy some fruits."""
sentence="Send all the 50 documents related to chapters 1,2,3 ta prateek@cb.com"

In [11]:
sents=sent_tokenize(document)
print(sents)

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [12]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'ta',
 'prateek@cb.com']

In [13]:
words=word_tokenize(sentence)
print(words)

['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'ta', 'prateek', '@', 'cb.com']


In [14]:
# Stopwords Removal

In [15]:
from nltk.corpus import stopwords

In [16]:
# Load the English stop-word list as a set for fast membership tests.
# NLTK raises LookupError when the corpus is not installed locally, so fetch it
# on demand instead of relying on a later cell having been run beforehand.
try:
    sw=set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    sw=set(stopwords.words('english'))

In [17]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [18]:
print(sw)

{'mightn', "hasn't", 'until', 'hasn', 'shouldn', 'being', 'under', 'they', 'other', 'aren', 'does', 'through', 'd', 'whom', 'will', 'before', 'hers', 'if', 'own', 'few', "mightn't", 'be', 'its', 'when', "you'll", 'of', 'ourselves', "haven't", 'theirs', 'couldn', 'now', 'then', 'myself', 'down', 'between', 'won', 'those', "mustn't", "shan't", 'more', 'for', "needn't", 'doing', "wouldn't", 'which', 'how', "it's", 'who', 'there', 'he', "you'd", 'having', 'only', "that'll", 'by', 're', "wasn't", 'my', 'them', 'do', 'because', 'most', 'during', 'yours', 'up', 'here', 'these', 'not', 'him', 'themselves', 'that', 'below', 'mustn', 'is', "you're", 'been', "couldn't", 'shan', 'with', 'himself', 'our', 'am', 'and', 'yourselves', 'an', 'your', 'but', 'too', "shouldn't", 'further', 'didn', 'itself', 'so', 'isn', 'weren', 'was', "doesn't", 'off', 'once', 's', 'than', 'from', 'no', 'wasn', 'against', 'where', 've', 'her', 'yourself', 'both', 'such', 'needn', 'have', 'doesn', 'each', 'into', 'can', '

In [19]:
def remove_stpwrds(text,stopwords):
    """Filter stop words out of a sequence of tokens.

    Args:
        text: Iterable of tokens (e.g. a list of words).
        stopwords: Container of tokens to discard; membership is tested
            with ``in``, so the comparison is exact (case-sensitive).

    Returns:
        A new list holding, in their original order, the tokens of
        ``text`` that are not present in ``stopwords``.
    """
    kept = []
    for token in text:
        if token in stopwords:
            # Exact match against the stop-word container: drop it.
            continue
        kept.append(token)
    return kept

In [20]:
# str.split() only breaks on whitespace, so punctuation stays attached to the
# tokens ('there,', 'been?') and they never equal a stop-word entry. The
# membership test is also case-sensitive, which is why 'Its' and 'I' survive.
text="hii there, how have you been? Its been a long time we met. I missed you.".split()
print(remove_stpwrds(text,sw))


['hii', 'there,', 'been?', 'Its', 'long', 'time', 'met.', 'I', 'missed', 'you.']


In [21]:
# Regex based Tokenization

In [22]:
from nltk.tokenize import RegexpTokenizer

In [23]:
# Each token is a maximal run of ASCII letters, '@' and '.'; any other
# character (digits, commas, whitespace) ends the token and is discarded,
# so '50' and '1,2,3' disappear while 'prateek@cb.com' stays in one piece.
tokenizer=RegexpTokenizer('[a-zA-Z@.]+')
useful_text=tokenizer.tokenize(sentence)

In [24]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'ta',
 'prateek@cb.com']

In [25]:
# Stemming
# Process that reduces words to their root (stem) form
# Used to preserve the semantics of the sentence without increasing the number of unique words
# e.g. jumps, jumping, jumped are all converted to 'jump'
# Stemmers are used to perform this process
# Three commonly used stemmers:
# Porter, Lancaster, Snowball

In [26]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [27]:


ps=PorterStemmer()

In [28]:
ps.stem('jumping')

'jump'

In [29]:
ss=SnowballStemmer('english')

In [30]:
ss.stem('lovely')

'love'

In [31]:
from nltk.stem import WordNetLemmatizer

In [32]:
wn=WordNetLemmatizer()

In [33]:
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [34]:
wn.lemmatize('jumps')

'jump'

In [35]:
# constructing Vocab

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
corpus=[
    'Indian Cricket Team will win World Cup, says Indian Team Captain V. Kohli. World Cup will be held in Sri Lanka',
    'We will win next Lok Sabha Elections, says confident PM of India',
    'The noble laurate won the hearts of people',
    'The movie Raazi is an Indian spy thriller'
]

In [38]:
cv=CountVectorizer()

In [39]:
vectorized_corpus=cv.fit_transform(corpus)

In [40]:
vectorized_corpus

<4x36 sparse matrix of type '<class 'numpy.int64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [41]:
# Convert the sparse count matrix into a dense NumPy array so that individual
# rows can be indexed and printed directly.
# NOTE: this rebinds `vectorized_corpus`; re-running this cell on its own fails
# because the name then refers to an ndarray, which has no .toarray() method.
vectorized_corpus=vectorized_corpus.toarray()

In [42]:
vectorized_corpus[0]

array([0, 1, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 2, 0, 0, 0, 2, 1, 0, 2], dtype=int64)

In [43]:
cv.vocabulary_

{'indian': 11,
 'cricket': 4,
 'team': 28,
 'will': 32,
 'win': 33,
 'world': 35,
 'cup': 5,
 'says': 25,
 'captain': 2,
 'kohli': 13,
 'be': 1,
 'held': 8,
 'in': 9,
 'sri': 27,
 'lanka': 14,
 'we': 31,
 'next': 18,
 'lok': 16,
 'sabha': 24,
 'elections': 6,
 'confident': 3,
 'pm': 22,
 'of': 20,
 'india': 10,
 'the': 29,
 'noble': 19,
 'laurate': 15,
 'won': 34,
 'hearts': 7,
 'people': 21,
 'movie': 17,
 'raazi': 23,
 'is': 12,
 'an': 0,
 'spy': 26,
 'thriller': 30}

In [44]:
numbers=vectorized_corpus[2]

In [45]:
numbers

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0], dtype=int64)

In [46]:
# inverse_transform is documented to take a 2-D (n_samples, n_features) input,
# so the single count vector is reshaped into a one-row matrix before it is
# mapped back to the terms with non-zero counts.
s=cv.inverse_transform(numbers.reshape(1,-1))
s

[array(['hearts', 'laurate', 'noble', 'of', 'people', 'the', 'won'],
       dtype='<U9')]

In [47]:
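# TFIDF
# term frequency-inverse document frequency
# product of tf and idf
# down-weights features that occur very often, because they carry less information
# information decreases as the number of documents containing a term increases
# tfidf associates a weight between 0 and 1 with every term
# tf(term,document) = number of times the term occurs in the document
# idf(term) = log(No. of docs / (1 + No. of docs containing the term))
# (scikit-learn's TfidfVectorizer uses a smoothed variant by default: ln((1 + n) / (1 + df)) + 1)

# tf*idf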

In [48]:
sent1="this is good movie"
sent2="this was good movie"
sent3="this is not a good movie"

In [49]:
corpus=[sent1,sent2,sent3]

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
# Create a vectorizer instance. fit_transform is an instance method, so binding
# the bare class (or using IPython's trailing-? help syntax, which assigns
# nothing) leaves no object for the next cell to fit.
tfidf=TfidfVectorizer()

In [52]:
vc=tfidf.fit_transform(corpus).toarray()

In [70]:
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [71]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}