# Task 7: Introduction to Natural Language (Text) Processing

## Section 1: Setup and Sample Dataset

### **Task 1**: Import Libraries and Sample Data
*Instruction*: Import the necessary libraries and define a sample dataset for sentiment classification.

In [8]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

# Sample dataset (replace with your actual dataset)
sample_data = {
    'text': ["This movie is amazing!", "I'm so frustrated with this product.", "It's okay, I guess.", "This is terrible!", "I love it!", "It's not bad.", "I hate it."],
    'sentiment': ["positive", "negative", "neutral", "negative", "positive", "neutral", "negative"]
}
df = pd.DataFrame(sample_data)

# Initialize sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment prediction
def predict_sentiment(text):
    """
    Predicts the sentiment of a given text using VADER.

    Args:
        text: The input text string.

    Returns:
        A dictionary containing sentiment scores ('neg', 'neu', 'pos', 'compound').
    """
    scores = analyzer.polarity_scores(text)
    return scores

# Example usage
user_input = input("Enter text: ")
prediction = predict_sentiment(user_input)

print("Sentiment Scores:", prediction)

# You can use the 'compound' score to determine the overall sentiment
# or individual scores ('neg', 'neu', 'pos') to determine the sentiment labels
if prediction['compound'] >= 0.05:
    print("Sentiment: Positive")
elif prediction['compound'] <= -0.05:
    print("Sentiment: Negative")
else:
    print("Sentiment: Neutral")

LookupError: 
**********************************************************************
  Resource [93mvader_lexicon[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('vader_lexicon')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93msentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


## Section 2: Text Preprocessing

### **Task 2**: Clean the Text

*Instruction*: Lowercase the text, remove punctuation, stopwords, and tokenize the sentences.


In [9]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    """
    Lowercase the text, remove punctuation, stopwords, and tokenize the sentences.

    Args:
        text: The input text.

    Returns:
        A list of cleaned sentences.
    """

    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [w for w in tokens if not w.lower() in stop_words]

    # Tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Section 3: Text Vectorization

### **Task 3**: Convert Text to Numerical Features

*Instruction*: Use both Bag of Words and TF-IDF vectorization to convert the cleaned text.


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag of Words
cv = CountVectorizer()
X_bow = cv.fit_transform(df['cleaned'])

# TF-IDF
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['cleaned'])

NameError: name 'df' is not defined

## Section 4: Train a Classifier

### **Task 4**: Sentiment Classification with Naive Bayes

*Instruction*: Split the dataset, train a classifier using both feature sets, and evaluate the performance.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Split data into training and testing sets
X_train_bow, X_test_bow, y_train, y_test = train_test_split(
    X_bow, df['label'], test_size=0.2, random_state=42
)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(
    X_tfidf, df['label'], test_size=0.2, random_state=42
)

# Train Naive Bayes classifier with Bag of Words features
nb_bow = MultinomialNB()
nb_bow.fit(X_train_bow, y_train)

# Train Naive

NameError: name 'X_bow' is not defined

## Section 5: Mini Challenge – Classify Your Own Text

### **Task 5**:  User Input Prediction

*Instruction*: Write a function that allows the user to enter a text and receive a prediction from the trained model.


In [7]:
def predict_sentiment(text):
    """Predicts the sentiment of a given text using the trained TF-IDF Naive Bayes model.

    Args:
        text: The input text.

    Returns:
        The predicted sentiment ("Positive" or "Negative").
    """
    # Preprocess the text
    cleaned_text = preprocess(text)  # Assuming you have a preprocess function defined

    # Transform the text using TF-IDF
    vectorized_text = tfidf.transform([cleaned_text])  # Assuming you have a tfidf object defined

    # Predict the sentiment using the trained model
    prediction = nb_tfidf.predict(vectorized_text)  # Assuming you have a nb_tfidf object defined

    # Return the sentiment label
    return "Positive" if prediction[0] == 1 else "Negative"

# Example usage
user_text = input("

SyntaxError: unterminated string literal (detected at line 23) (<ipython-input-7-424174df38fb>, line 23)