<a href="https://colab.research.google.com/github/Kulpreet-prog/NIELIT-FSK-PRIME-April21/blob/main/copy_of_task_7__introduction_to_natural_language_text_processing_ai_bootcamp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 7: Introduction to Natural Language (Text) Processing

## Section 1: Setup and Sample Dataset

### **Task 1**: Import Libraries and Sample Data
*Instruction*: Import the necessary libraries and define a sample dataset for sentiment classification.

In [2]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Sample Dataset for Sentiment Classification (positive/negative)
# Kept as (text, label) records so each review sits next to its label;
# the two classes are balanced (5 positive, 5 negative).
labelled_reviews = [
    ('I love this product, it is amazing!', 'positive'),
    ('Worst purchase I have ever made.', 'negative'),
    ('Highly recommend, this is a great buy.', 'positive'),
    ('Not worth the price, very disappointed.', 'negative'),
    ('I am so happy with my new purchase.', 'positive'),
    ('Very bad experience, would not buy again.', 'negative'),
    ('Absolutely fantastic! Best decision ever.', 'positive'),
    ('Terrible, do not buy this product.', 'negative'),
    ('Love it! Will buy again.', 'positive'),
    ('Worst customer service ever.', 'negative'),
]

# Column-oriented view of the same records (one list per DataFrame column)
data = {
    'text': [review for review, _ in labelled_reviews],
    'sentiment': [label for _, label in labelled_reviews],
}

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the DataFrame
print(df.head())


                                      text sentiment
0      I love this product, it is amazing!  positive
1         Worst purchase I have ever made.  negative
2   Highly recommend, this is a great buy.  positive
3  Not worth the price, very disappointed.  negative
4      I am so happy with my new purchase.  positive


## Section 2: Text Preprocessing

### **Task 2**: Clean the Text

*Instruction*: Lowercase the text, remove punctuation and stopwords, and tokenize the sentences.


In [3]:
import pandas as pd
import string
from nltk.corpus import stopwords
import nltk

# Download stopwords only (the sole NLTK resource this cell relies on)
nltk.download('stopwords')

# Sample dataset
# NOTE: this rebinds `data` and `df`, replacing the 10-row labelled dataset
# from Section 1 with five unlabelled sentences.
data = dict(
    text=[
        'I love this product! It is amazing.',
        'This is the worst experience ever.',
        'Absolutely fantastic service!',
        'I am not happy with the quality.',
        'Best purchase I have made!',
    ]
)
df = pd.DataFrame(data)

# Define stopwords (held in a set so the membership test in clean_text is cheap)
stop_words = {*stopwords.words('english')}

# Text cleaning function
def clean_text(text):
    """Turn a raw sentence into a list of lowercase, stopword-free tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, split on whitespace, and drop tokens that appear
    in the global ``stop_words`` collection.

    Returns the surviving tokens in their original order (possibly empty).
    """
    # Punctuation is deleted, not replaced by a space, so characters on either
    # side of a punctuation mark end up joined into one token.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalised = text.lower().translate(punctuation_remover)
    return [token for token in normalised.split() if token not in stop_words]

# Apply cleaning to the dataset: every row of `cleaned_text` holds a token list
df['cleaned_text'] = df['text'].map(clean_text)

# Display the raw text next to its cleaned tokens
print(df.loc[:, ['text', 'cleaned_text']])







                                  text                      cleaned_text
0  I love this product! It is amazing.          [love, product, amazing]
1   This is the worst experience ever.         [worst, experience, ever]
2        Absolutely fantastic service!  [absolutely, fantastic, service]
3     I am not happy with the quality.                  [happy, quality]
4           Best purchase I have made!            [best, purchase, made]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Section 3: Text Vectorization

### **Task 3**: Convert Text to Numerical Features

*Instruction*: Use both Bag of Words and TF-IDF vectorization to convert the cleaned text.


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The vectorizers take one string per document, so glue each token list back
# into a space-separated sentence and keep it on the DataFrame as well.
documents = df['cleaned_text'].map(' '.join)
df['cleaned_text_joined'] = documents

# Bag of Words (BoW)
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(documents)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)

# Check the shape of matrices: (number of documents, vocabulary size)
print("Bag of Words Shape:", X_bow.shape)
print("TF-IDF Shape:", X_tfidf.shape)


Bag of Words Shape: (5, 14)
TF-IDF Shape: (5, 14)


## Section 4: Train a Classifier

### **Task 4**: Sentiment Classification with Naive Bayes

*Instruction*: Split the dataset, train a classifier using both feature sets, and evaluate the performance.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

print(df.head())

# Add sentiment manually: the DataFrame built in Section 2 has no label column.
# The list order must match the row order of `df`.
df['sentiment'] = ['positive', 'negative', 'positive', 'negative', 'positive']

# Split the data.
# Both feature matrices and the labels go through ONE train_test_split call so
# their rows are guaranteed to land in the same train/test partition (two
# separate calls only stayed aligned because they repeated the same
# random_state).
# NOTE: with 5 documents and test_size=0.2 the test set is a single document,
# so the accuracy printed below can only be 0.0 or 1.0.
(
    X_train_bow, X_test_bow,
    X_train_tfidf, X_test_tfidf,
    y_train, y_test,
) = train_test_split(
    X_bow, X_tfidf, df['sentiment'], test_size=0.2, random_state=42
)

# Model 1: Bag of Words
model_bow = MultinomialNB()
model_bow.fit(X_train_bow, y_train)
y_pred_bow = model_bow.predict(X_test_bow)

# Model 2: TF-IDF
model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# Accuracy
print("Accuracy using Bag of Words:", accuracy_score(y_test, y_pred_bow))
print("Accuracy using TF-IDF:", accuracy_score(y_test, y_pred_tfidf))



                                  text                      cleaned_text  \
0  I love this product! It is amazing.          [love, product, amazing]   
1   This is the worst experience ever.         [worst, experience, ever]   
2        Absolutely fantastic service!  [absolutely, fantastic, service]   
3     I am not happy with the quality.                  [happy, quality]   
4           Best purchase I have made!            [best, purchase, made]   

            cleaned_text_joined  
0          love product amazing  
1         worst experience ever  
2  absolutely fantastic service  
3                 happy quality  
4            best purchase made  
Accuracy using Bag of Words: 0.0
Accuracy using TF-IDF: 0.0


## Section 5: Mini Challenge – Classify Your Own Text

### **Task 5**:  User Input Prediction

*Instruction*: Write a function that allows the user to enter a text and receive a prediction from the trained model.


In [6]:
!pip install nltk
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK resources
# ('stopwords' for filtering, 'punkt_tab' for word_tokenize)
for resource in ('stopwords', 'punkt_tab'):
    nltk.download(resource)

# Sample dataset (five labelled sentences)
data = dict(
    text=[
        'I love this product! It is amazing.',
        'This is the worst experience ever.',
        'Absolutely fantastic service!',
        'I am not happy with the quality.',
        'Best purchase I have made!',
    ],
    sentiment=['positive', 'negative', 'positive', 'negative', 'positive'],
)
df = pd.DataFrame(data)

# Define stopwords (a set, so membership tests during filtering are cheap)
stop_words = {*stopwords.words('english')}

# Text cleaning function (same as before)
def clean_text(text):
    """Lowercase ``text``, delete ``string.punctuation`` characters, split on
    whitespace and return the tokens not present in the global ``stop_words``.
    """
    words = text.lower().translate(str.maketrans('', '', string.punctuation)).split()
    # filter() keeps the original token order
    return list(filter(lambda word: word not in stop_words, words))

# Clean each sentence into tokens, then re-join them because the vectorizers
# expect one string per document.
df['cleaned_text'] = df['text'].apply(clean_text)
df['cleaned_text_joined'] = df['cleaned_text'].apply(' '.join)

# Bag of Words (BoW) and TF-IDF vectorization
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df['cleaned_text_joined'])

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text_joined'])

# Split the data.
# One train_test_split call partitions both feature matrices and the labels
# together, so their rows are guaranteed to stay aligned (two separate calls
# only matched because they repeated the same random_state).
(
    X_train_bow, X_test_bow,
    X_train_tfidf, X_test_tfidf,
    y_train, y_test,
) = train_test_split(
    X_bow, X_tfidf, df['sentiment'], test_size=0.2, random_state=42
)

# Train the models (one per feature representation)
model_bow = MultinomialNB()
model_bow.fit(X_train_bow, y_train)

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train)

# Function to predict the sentiment of a single piece of user-supplied text
def predict_sentiment(user_text, model, vectorizer, stop_words):
    """Predict a sentiment label for ``user_text``.

    The text is preprocessed with the same steps clean_text() applies to the
    training sentences: lowercase, delete ``string.punctuation`` characters,
    split on whitespace, drop stopwords.

    Args:
        user_text: Raw text to classify.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``; it should be the
            one used to build the features ``model`` was trained on.
        stop_words: Collection of tokens to discard.

    Returns:
        The first element of ``model.predict(...)`` for the cleaned text, or
        the string ``"Neutral"`` when no tokens remain after cleaning.
    """
    normalised = user_text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    # Tokenize with str.split(), as clean_text() does for the training data.
    # The earlier word_tokenize() call was a different tokenizer than the one
    # used for training and needed the extra `punkt_tab` download.
    cleaned_tokens = [word for word in normalised.split() if word not in stop_words]
    if not cleaned_tokens:
        # Nothing left to vectorize (empty input or only stopwords/punctuation)
        return "Neutral"
    vectorized_text = vectorizer.transform([' '.join(cleaned_tokens)])
    return model.predict(vectorized_text)[0]

# Get user input and predict sentiment
user_input = input("Enter your text for sentiment prediction: ")

# Predict using BoW and TF-IDF: each model is paired with the vectorizer it
# was trained on, and the pairs are evaluated in that order.
predicted_sentiment_bow, predicted_sentiment_tfidf = [
    predict_sentiment(user_input, fitted_model, fitted_vectorizer, stop_words)
    for fitted_model, fitted_vectorizer in (
        (model_bow, bow_vectorizer),
        (model_tfidf, tfidf_vectorizer),
    )
]

# Print predictions
print("Predicted Sentiment (Bag of Words):", predicted_sentiment_bow)
print("Predicted Sentiment (TF-IDF):", predicted_sentiment_tfidf)





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Enter your text for sentiment prediction:     'Love it! Will buy again.'
Predicted Sentiment (Bag of Words): positive
Predicted Sentiment (TF-IDF): positive
