In [76]:
from nltk.corpus import brown
from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import reuters
from nltk.corpus import inaugural

# **Corpus and Corpora**

## **Corpus** refers to one collection of texts.

## **Corpora** refers to multiple collections of texts.

## **Popular Corpora**

- **The Brown Corpus**: Created by Brown University in 1961, this was the first million-word electronic corpus of English.

- **The Gutenberg Corpus**: A small selection of texts taken from the Project Gutenberg electronic text archive of literature, which itself hosts some 25,000 free electronic books.

- **The Web Text Corpus**: Containing less formal language, this data includes content from online discussion forums, personal advertisements, and reviews.

- **The NPS Chat Corpus**: Originally created by the Naval Postgraduate School, this corpus contains over 10,000 posts from instant messaging chats.

- **The Reuters Corpus**: This corpus contains over 10,000 news documents, grouped into two sets called 'training' and 'test'.

- **The Inaugural Address Corpus**: This corpus contains the text of each U.S. presidential inaugural address.


In [77]:
# Word-level view of the Inaugural Address corpus: a flat sequence of tokens
# (the printed representation is abbreviated with "..." for long corpora).
inaugural_word_tokens = inaugural.words()
print(f'\n inaugural corpus is broken up by word: {inaugural_word_tokens}')


 inaugural corpus is broken up by word: ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ...]


In [78]:
# Sentence-level view: each sentence is a list of word tokens.
inaugural_sentences = inaugural.sents()
print(f'\n using sents(): {inaugural_sentences}')


 using sents(): [['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':'], ['Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no', 'event', 'could', 'have', 'filled', 'me', 'with', 'greater', 'anxieties', 'than', 'that', 'of', 'which', 'the', 'notification', 'was', 'transmitted', 'by', 'your', 'order', ',', 'and', 'received', 'on', 'the', '14th', 'day', 'of', 'the', 'present', 'month', '.'], ...]


In [79]:
# Paragraph-level view: each paragraph is a list of sentences, and each
# sentence is a list of word tokens (three levels of nesting).
# The label now names paras(), the method actually being called; it previously
# said "sents()", a leftover from the sentence-level cell.
print(f'\n using paras(): {inaugural.paras()}')


 using sents(): [[['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':']], [['Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no', 'event', 'could', 'have', 'filled', 'me', 'with', 'greater', 'anxieties', 'than', 'that', 'of', 'which', 'the', 'notification', 'was', 'transmitted', 'by', 'your', 'order', ',', 'and', 'received', 'on', 'the', '14th', 'day', 'of', 'the', 'present', 'month', '.'], ['On', 'the', 'one', 'hand', ',', 'I', 'was', 'summoned', 'by', 'my', 'Country', ',', 'whose', 'voice', 'I', 'can', 'never', 'hear', 'but', 'with', 'veneration', 'and', 'love', ',', 'from', 'a', 'retreat', 'which', 'I', 'had', 'chosen', 'with', 'the', 'fondest', 'predilection', ',', 'and', ',', 'in', 'my', 'flattering', 'hopes', ',', 'with', 'an', 'immutable', 'decision', ',', 'as', 'the', 'asylum', 'of', 'my', 'declining', 'years', '--', 'a', 'retreat', 'which', 'was', 'rendered', 'every', 'day', 'more', 'necessary', 'as', 'well', '

In [80]:
# Part-of-speech-tagged view of the Brown corpus: a sequence of (word, tag) pairs.
brown_tagged_tokens = brown.tagged_words()
print(f'Brown tagged-words(): {brown_tagged_tokens}')

Brown tagged-words(): [('The', 'AT'), ('Fulton', 'NP-TL'), ...]


# Lexicon Definition

**Lexicon**: Often referred to as a lexical resource, a lexicon is a collection of words and/or phrases, such as a vocabulary list or a dictionary, annotated with associated information such as each word’s part of speech or definition. A lexicon is often considered a type of corpus because it represents text data.

## Accessing Lexicons in NLTK

Several lexicons are included in the NLTK library. Some of the commonly used lexicons are:

### The Stopwords Corpus

Words like "me", "has", "also", and "to" are all examples of stop words. Stop words add little meaning to text data. This lexicon corpus contains stop word lists for English and several other languages.

### The Names Corpus

Categorized by gender, this lexicon corpus contains over 8,000 first names. Female names are stored in the file called `female.txt` and male names in the file called `male.txt`.

### The CMU Pronouncing Dictionary Corpus

Based on US English, this lexicon corpus contains the phonetic pronunciations of words. Each pronunciation is represented as a sequence of phoneme symbols based on the Arpabet.

## Access Code for Lexicons

To access each of the above-mentioned lexicons, enter the code below:

```python
from nltk.corpus import stopwords
from nltk.corpus import names
from nltk.corpus import cmudict
```


In [81]:
from nltk.corpus import stopwords
from nltk.corpus import names
from nltk.corpus import cmudict
import nltk

In [84]:
# What share of the female first names in the Names corpus end in the letter "a"?
female_names = names.words('female.txt')

# Keep only the names whose final character is a lowercase "a".
female_names_endingwith_a = list(
    filter(lambda first_name: first_name.endswith('a'), female_names)
)

# Express the matching names as a percentage of all female names.
female_percentage = (100 * len(female_names_endingwith_a)) / len(female_names)
print(female_percentage)

35.45290941811638


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Toy corpus: three short sentences with overlapping vocabulary.
documents = [
    "Machine learning is great.",
    "Deep learning is a subset of machine learning.",
    "Artificial intelligence includes machine learning and deep learning."
]

# Learn the vocabulary and count how often each term occurs in each document.
# The result has one row per document and one column per term.
vectorizer = CountVectorizer()
tdm = vectorizer.fit_transform(documents)

# Turn the counts into a labelled table, using the learned terms as column names.
tdm_df = pd.DataFrame(
    data=tdm.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print(tdm_df)


   and  artificial  deep  great  includes  intelligence  is  learning  \
0    0           0     0      1         0             0   1         1   
1    0           0     1      0         0             0   1         2   
2    1           1     1      0         1             1   0         2   

   machine  of  subset  
0        1   0       0  
1        1   1       1  
2        1   0       0  


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Weight terms by TF-IDF rather than raw counts, reusing the `documents`
# list defined in an earlier cell.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Build a labelled table: one row per document, one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(tfidf_df)


        and  artificial      deep    great  includes  intelligence        is  \
0  0.000000    0.000000  0.000000  0.66284  0.000000      0.000000  0.504107   
1  0.000000    0.000000  0.343538  0.00000  0.000000      0.000000  0.343538   
2  0.397699    0.397699  0.302460  0.00000  0.397699      0.397699  0.000000   

   learning   machine        of    subset  
0  0.391484  0.391484  0.000000  0.000000  
1  0.533575  0.266788  0.451711  0.451711  
2  0.469775  0.234887  0.000000  0.000000  
