### Text Classification
## Naive Bayes

In [1]:
import nltk
from nltk.corpus import movie_reviews
import random
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

# Load movie reviews dataset
nltk.download('movie_reviews')

# Create a list of documents (each document is a list of words and a category label)
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents
random.shuffle(documents)

# Define a feature extractor function
def document_features(document):
    words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in words)
    return features

# Create a frequency distribution of words
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]  # Use the top 2000 words as features

# Apply the feature extractor to each document
featuresets = [(document_features(d), c) for (d, c) in documents]

# Split the data into training and testing sets
train_set, test_set = featuresets[100:], featuresets[:100]

# Train the Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print("Naive Bayes Accuracy:", accuracy(classifier, test_set))

# Show the most informative features
classifier.show_most_informative_features(10)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


Naive Bayes Accuracy: 0.81
Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.1 : 1.0
         contains(mulan) = True              pos : neg    =      9.0 : 1.0
   contains(wonderfully) = True              pos : neg    =      8.6 : 1.0
        contains(seagal) = True              neg : pos    =      8.2 : 1.0
          contains(lame) = True              neg : pos    =      6.3 : 1.0
         contains(damon) = True              pos : neg    =      6.1 : 1.0
         contains(awful) = True              neg : pos    =      5.8 : 1.0
         contains(flynt) = True              pos : neg    =      5.6 : 1.0
        contains(wasted) = True              neg : pos    =      5.3 : 1.0
          contains(jedi) = True              pos : neg    =      5.3 : 1.0


## Decision Trees

In [2]:
import nltk
from nltk.corpus import movie_reviews
import random
from nltk.classify import DecisionTreeClassifier
from nltk.classify.util import accuracy

# Load movie reviews dataset
nltk.download('movie_reviews')

# Create a list of documents (each document is a list of words and a category label)
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents
random.shuffle(documents)

# Define a feature extractor function
def document_features(document):
    words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in words)
    return features

# Create a frequency distribution of words
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]  # Use the top 2000 words as features

# Apply the feature extractor to each document
featuresets = [(document_features(d), c) for (d, c) in documents]

# Split the data into training and testing sets
train_set, test_set = featuresets[100:], featuresets[:100]

# Train the Decision Tree classifier
classifier = DecisionTreeClassifier.train(train_set, entropy_cutoff=0.05, support_cutoff=10)

# Evaluate the classifier
print("Decision Tree Accuracy:", accuracy(classifier, test_set))

# Show the most informative features (if supported)
try:
    classifier.show_most_informative_features(10)
except:
    print("Decision Trees don't have a built-in method to show informative features.")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Decision Tree Accuracy: 0.68
Decision Trees don't have a built-in method to show informative features.


### Syntax and Parsing:
## Dependency Parser

In [3]:
#Implement Dependency Parser to generate the parse tree
import spacy

nlp = spacy.load("en_core_web_sm")
sentence = "The quick brown fox jumps over the lazy dog."
doc = nlp(sentence)

#Print dependency parse tree
for token in doc:
  print(token.text, token.dep_, token.head.text, token.head.pos_,
        [child for child in token.children])

The det fox NOUN []
quick amod fox NOUN []
brown amod fox NOUN []
fox nsubj jumps VERB [The, quick, brown]
jumps ROOT jumps VERB [fox, over, .]
over prep jumps VERB [dog]
the det dog NOUN []
lazy amod dog NOUN []
dog pobj over ADP [the, lazy]
. punct jumps VERB []


## Implementation of Chunking using Shallow parsing

In [6]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

text = "The quick brown fox jumps over the lazy dog."
tokens = nltk.word_tokenize(text)
tagged_tokens = nltk.pos_tag(tokens)

chunk_grammar = r"""
     NP: {<DT|JJ|NN.*>+}
     PP: {<IN><NP>}
     VP: {<VB.*><NP|PP|CLAUSE>+$}
     CLAUSE: {<NP><VP>}
"""
chunk_parser = nltk.RegexpParser(chunk_grammar)
parsed_tree = chunk_parser.parse(tagged_tokens)
print(parsed_tree)
parsed_tree.pretty_print()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


(S
  (NP The/DT quick/JJ brown/NN fox/NN)
  jumps/VBZ
  (PP over/IN (NP the/DT lazy/JJ dog/NN))
  ./.)
                                 S                                             
     ____________________________|____________________________                  
    |      |            |                                     PP               
    |      |            |                         ____________|_____            
    |      |            NP                       |                  NP         
    |      |     _______|________________        |       ___________|______     
jumps/VBZ ./. The/DT quick/JJ brown/NN fox/NN over/IN the/DT     lazy/JJ dog/NN



## Regex Parser

In [8]:
import nltk
from nltk import RegexpParser
from nltk import pos_tag, word_tokenize

# Sample sentence
sentence = "The quick brown fox jumps over the lazy dog."

# Tokenize and perform POS tagging
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)
print("Tagged Sentence:", tagged)

# Define the grammar for a simple noun phrase (NP)
grammar = "NP: {<DT>?<JJ>*<NN>}"

# Create a regex parser with the defined grammar
cp = RegexpParser(grammar)

# Parse the sentence using the regex parser
parsed_sentence = cp.parse(tagged)
print("Parsed Sentence:")
print(parsed_sentence)

# Visualize the parsed tree (optional)
parsed_sentence.pprint()

Tagged Sentence: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]
Parsed Sentence:
(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN)
  ./.)
(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN)
  ./.)


## Maximum entropy classifier

In [9]:
#Maximum entropy classifier
import nltk
from nltk.classify import MaxentClassifier
from nltk.tokenize import word_tokenize
class MaxEntClassifier:
  def __init__(self):
    self.classifier= None
  def train(self,labeled_featuresets,algorithm='GIS',max_iter=10):
    self.classifier=MaxentClassifier.train(labeled_featuresets,algorithm=algorithm,max_iter=max_iter)
  def classify(self,featureset):
    return self.classifier.classify(featureset)
  def accuracy(self,test_data):
    return nltk.classify.accuracy(test_data)
training_data=[
    ({'feature1':10,'feature2':5},'class1'),
    ({'feature1':3,'feature2':8},'class2')
]
classifier=MaxEntClassifier()
classifier.train(training_data)
test_data={'feature1':12,'feature2':3}
predicted_class=classifier.classify(test_data)
print("The predicted class ",predicted_class)

  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.500
             2          -0.33422        1.000
             3          -0.21130        1.000
             4          -0.15255        1.000
             5          -0.11874        1.000
             6          -0.09695        1.000
             7          -0.08179        1.000
             8          -0.07067        1.000
             9          -0.06217        1.000
         Final          -0.05548        1.000
The predicted class  class2


## Leftmost and Rightmost Derivation

In [10]:
# Leftmost Derivation
import nltk
from nltk import CFG
from nltk import ChartParser

# Define the grammar
grammar = CFG.fromstring("""
  S  -> E
  E  -> E '+' T | T
  T  -> T '*' F | F
  F  -> '(' E ')' | 'id'
""")

# Define the sentence
sentence = ['id', '+', 'id', '*', 'id']

# Create a parser with the given grammar
parser = ChartParser(grammar)

# Function to generate leftmost derivation
def leftmost_derivation(sentence, grammar):
    parser = ChartParser(grammar)
    trees = list(parser.parse(sentence))
    if not trees:
        return []

    # Take the first parse tree
    tree = trees[0]

    # Collect derivation steps
    derivation_steps = []

    def derivation_from_tree(tree):
        # Convert tree to string format and add to steps
        derivation_steps.append(" ".join(tree.leaves()))
        for production in tree.productions():
            lhs = str(production.lhs())
            rhs = " ".join(str(x) for x in production.rhs())
            derivation_steps.append(f"{lhs} -> {rhs}")
        if tree.label() == 'S':
            return
        for subtree in tree:
            if isinstance(subtree, nltk.Tree) and subtree.label() != 'S':
                derivation_from_tree(subtree)

    derivation_from_tree(tree)

    return derivation_steps

# Generate leftmost derivation
derivations = leftmost_derivation(sentence, grammar)

# Print the derivation steps
print("Leftmost Derivation Steps:")
for step in derivations:
    print(step)

Leftmost Derivation Steps:
id + id * id
S -> E
E -> E + T
E -> T
T -> F
F -> id
T -> T * F
T -> F
F -> id
F -> id


In [19]:
# Rightmost Derivation
import nltk
from nltk import CFG
from nltk import ChartParser

# Define the grammar
grammar = CFG.fromstring("""
  S  -> E
  E  -> E '+' T | T
  T  -> T '*' F | F
  F  -> '(' E ')' | 'id'
""")

# Define the sentence
sentence = ['id', '+', 'id', '*', 'id']

# Create a parser with the given grammar
parser = ChartParser(grammar)

# Function to generate leftmost derivation
def rightmost_derivation(sentence, grammar):
    parser = ChartParser(grammar)
    trees = list(parser.parse(sentence))
    if not trees:
        return []

    # Take the first parse tree
    tree = trees[0]

    # Collect derivation steps
    derivation_steps = []

    def derivation_from_tree(tree):
        # Convert tree to string format and add to steps
        derivation_steps.append(" ".join(tree.leaves()))
        for production in tree.productions():
            lhs = str(production.lhs())
            rhs = " ".join(str(x) for x in production.rhs())
            derivation_steps.append(f"{lhs} -> {rhs}")
        if tree.label() == 'S':
            return
        for subtree in tree:
            if isinstance(subtree, nltk.Tree) and subtree.label() != 'S':
                derivation_from_tree(subtree)

    derivation_from_tree(tree)

    return derivation_steps

# Generate leftmost derivation
derivations = rightmost_derivation(sentence, grammar)

# Print the derivation steps
print("Rightmost Derivation Steps:")
for step in derivations:
    print(step)

Rightmost Derivation Steps:
id + id * id
S -> E
E -> E + T
E -> T
T -> F
F -> id
T -> T * F
T -> F
F -> id
F -> id
