In [None]:
pip install "numpy<2"


## Text Classification with SpaCy
 #### 1.Load SpaCy and process text.
 #### 2.Extract linguistic features (POS, entities, etc.).
 #### 3.Train a classifier using features.
 #### 4. Evaluate the classifier.
 

#### Import Neccesary Libraries

In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier

#### We create a small dataset with sentences labeled as "pos" (positive) or "neg" (negative).

In [3]:
# Step 1: Sample dataset of sentences labeled as positive or negative
training_data = [
    ("I love this movie", "pos"),
    ("This film is amazing", "pos"),
    ("I hated this movie", "neg"),
    ("This film is terrible", "neg")
]

# Print the training data to verify
for sentence, label in training_data:
    print(f"Text: {sentence} | Label: {label}")


Text: I love this movie | Label: pos
Text: This film is amazing | Label: pos
Text: I hated this movie | Label: neg
Text: This film is terrible | Label: neg


##  Preprocessing:

In [11]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download stopwords if you haven't already
nltk.download('stopwords')
nltk.download('punkt_tab')

def extract_features(sentence):
    words = word_tokenize(sentence.lower())
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return {word: True for word in words}

# Define training data
training_data = [
    ("I love programming", "positive"),
    ("This is so boring", "negative"),
    ("Python is awesome", "positive"),
    ("I hate bugs", "negative")
]

training_features = [(extract_features(sentence), label) for sentence, label in training_data]
print(training_features)


[({'love': True, 'programming': True}, 'positive'), ({'boring': True}, 'negative'), ({'python': True, 'awesome': True}, 'positive'), ({'hate': True, 'bugs': True}, 'negative')]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\neeru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\neeru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Naive Bayes Classifier: We train the classifier on the feature set.

In [17]:
 # Step 3: Train Naive Bayes Classifier
 classifier = NaiveBayesClassifier.train(training_features)

## Testing: We classify new sentences using the trained classifier.

In [19]:
# Step 4: Test the classifier with new sentences
test_sentences = [
    "I really enjoyed this movie",
    "This movie was awful"
]

for sentence in test_sentences:
    features = extract_features(sentence)
    predicted_label = classifier.classify(features)
    print(f"Sentence: '{sentence}' => Predicted Sentiment: {predicted_label}")


Sentence: 'I really enjoyed this movie' => Predicted Sentiment: positive
Sentence: 'This movie was awful' => Predicted Sentiment: positive


 ## 1. Understanding Text Classification
 #### Text classification involves assigning a category to a given piece of text. For example:
 #### • Labeling emails as "Spam" or "Not Spam."
 #### • Categorizing movie reviews as "Positive" or "Negative."
 #### In this exercise, we will classify movie reviews as either Positive or Negative using NLTK.

In [21]:
# Import necessary libraries
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy




In [23]:
import nltk

# Download necessary datasets
nltk.download('movie_reviews')  # Movie review dataset
nltk.download('stopwords')      # Stopwords for preprocessing
nltk.download('punkt')          # Tokenizer for text processing


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\neeru\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\neeru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\neeru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 2. Dataset Preparation

In [27]:
import nltk
from nltk.corpus import movie_reviews

# Make sure you have downloaded the dataset before running this:
# nltk.download('movie_reviews')

# Create list of documents: each document is a tuple (list_of_words, category)
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]


##  3. Preprocessing Steps

In [28]:
# Tokenization
# Example of tokenizing one document for clarity
sample_document = documents[0][0]  # Get the first document (list of words)
print(f"Original Document (first 20 words): {sample_document[:20]}")

# Lowercasing
# Convert all words to lowercase
lowercased_words = [word.lower() for word in sample_document]
print(f"Lowercased Words (first 20): {lowercased_words[:20]}")

# Removing Non-Alphabetic Tokens
# Remove words that are not purely alphabetic
alphabetic_words = [word for word in lowercased_words if word.isalpha()]
print(f"Alphabetic Words (first 20): {alphabetic_words[:20]}")

# Stop Word Removal
# Load stopwords and remove them from the dataset
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in alphabetic_words if word not in stop_words]
print(f"Filtered Words (No Stopwords, first 20): {filtered_words[:20]}")


Original Document (first 20 words): ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']
Lowercased Words (first 20): ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']
Alphabetic Words (first 20): ['plot', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', 'drink', 'and', 'then', 'drive', 'they', 'get', 'into', 'an', 'accident', 'one', 'of']
Filtered Words (No Stopwords, first 20): ['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'deal']


## 4. Feature Extraction

In [31]:
import nltk
from nltk.corpus import movie_reviews, stopwords

# Assuming nltk data is downloaded:
# nltk.download('movie_reviews')
# nltk.download('stopwords')

# Prepare documents as (list_of_words, category) tuples
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Create a frequency distribution of all words in the corpus (lowercased and alphabetic only)
all_words = nltk.FreqDist(
    word.lower() for word in movie_reviews.words() if word.isalpha()
)

# Use the 2000 most common words as features
word_features = list(all_words.keys())[:2000]

# Define a feature extractor function
def document_features(document):
    document_words = set(document)
    features = {word: (word in document_words) for word in word_features}
    return features

# Load stopwords set
stop_words = set(stopwords.words('english'))

# Preprocess all documents: lowercase, remove stopwords & non-alphabetic tokens, extract features
preprocessed_documents = []
for (doc, category) in documents:
    filtered_words = [
        word.lower() for word in doc if word.isalpha() and word.lower() not in stop_words
    ]
    features = document_features(filtered_words)
    preprocessed_documents.append((features, category))



## 5. Training a Naive Bayes Classifier

In [33]:
from nltk.classify import NaiveBayesClassifier

# Split the dataset into training and testing sets (80% training, 20% testing)
train_set = preprocessed_documents[:1600]
test_set = preprocessed_documents[1600:]

# Train the Naive Bayes Classifier using the training set
classifier = NaiveBayesClassifier.train(train_set)


## 6. Testing and Evaluating the Model

In [35]:
from nltk.classify.util import accuracy
from nltk.tokenize import word_tokenize

# Evaluate the Model
model_accuracy = accuracy(classifier, test_set) * 100
print(f"Accuracy: {model_accuracy:.2f}%")

# Test with New Data
test_review = "This movie was absolutely great, with great performances and a good story."
test_tokens = word_tokenize(test_review)
test_words = [word.lower() for word in test_tokens if word.isalpha() and word.lower() not in stop_words]
test_features = document_features(test_words)

# Predict and display result
prediction = classifier.classify(test_features)
print(f"Prediction for test review: {prediction}")

# Display the Most Informative Features
print("\nMost Informative Features:")
classifier.show_most_informative_features(10)


Accuracy: 74.25%
Prediction for test review: neg

Most Informative Features:
Most Informative Features
                   chick = True              neg : pos    =      8.6 : 1.0
                 frances = True              pos : neg    =      8.3 : 1.0
              undercover = True              neg : pos    =      7.8 : 1.0
              derivative = True              neg : pos    =      7.0 : 1.0
                  inject = True              neg : pos    =      7.0 : 1.0
                 justify = True              neg : pos    =      6.2 : 1.0
                   banal = True              neg : pos    =      5.8 : 1.0
                bothered = True              neg : pos    =      5.8 : 1.0
                     ugh = True              neg : pos    =      5.8 : 1.0
                   waste = True              neg : pos    =      5.7 : 1.0
