## Group No : 77

## Group Member Names:
1. SHAILESH KUMAR SINGH
2. JAWAHARLAL RAJAN S


# NLP Assignment 1

This notebook covers the tasks outlined in the problem statement for processing the given dataset. Each step includes explanations, code implementation, and justifications for the output.

### Tasks Overview:
1. Cleaning: Removing punctuation, numbers, and special characters. Eliminating stop words.
2. Normalization: Reducing words to their base or root form using stemming or lemmatization.
3. POS Tagging:
   - Frequency of POS tags.
   - Most common POS tags.
   - Sentences containing specific POS tags.
4. Visualizations: Representing POS tag frequencies with bar charts and word clouds.
5. HMM POS Tagging: Applying to the first 4 rows.
6. POS and NER Tagging Analysis.


Import Libraries and Download Resources

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Download necessary NLTK resources.
# Recent NLTK releases look up 'punkt_tab' (word_tokenize) and
# 'averaged_perceptron_tagger_eng' (pos_tag) instead of the legacy resource
# names, so both spellings are requested to keep a fresh Restart & Run All
# working across NLTK versions. With default arguments nltk.download() reports
# a failed download and returns False instead of raising.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
)
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)


Load and Preview the Dataset

This cell loads the dataset from the given URL and previews its structure. Later cells read the review text from the `body` column, so the dataset must contain that column.

In [None]:
# Importing necessary libraries
import pandas as pd

# Load the dataset
url = "https://drive.google.com/uc?export=download&id=1MrOqsO6HajwraBGnvcu9NLoI63c5DvcE"  # Adjusted link for direct download
df = pd.read_csv(url)

# Fail fast with a readable message: the cleaning step reads df['body'], and a
# missing column would otherwise only surface there as a bare KeyError.
if 'body' not in df.columns:
    raise KeyError(
        f"Expected a 'body' column in the dataset; found columns: {list(df.columns)}"
    )

# Display the first few rows
print("Dataset Overview:")
df.head()


### Data Cleaning
This cell cleans the dataset by removing punctuation, numbers, special characters, and stopwords. The cleaned text is stored in a new column.

In [None]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords

# Build the English stop-word list once, as a set, so clean_text() can filter
# words against it. The reviews themselves are read from the 'body' column
# further down in this cell.
stop_words = set(stopwords.words('english'))

def clean_text(text, stop_word_set=None):
    """Strip punctuation, digits and stop words from ``text`` and lowercase it.

    Args:
        text: Raw review text. Non-string values (for example the NaN pandas
            uses for a missing cell) are treated as empty text.
        stop_word_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The remaining lowercase words joined by single spaces; an empty
        string when nothing is left.
    """
    if not isinstance(text, str):
        return ""
    if stop_word_set is None:
        stop_word_set = stop_words
    # re.escape is required here: interpolated raw, the "\]" inside
    # string.punctuation is parsed as an escaped "]" and the backslash itself
    # is never part of the character class, so backslashes would survive.
    text = re.sub(f"[{re.escape(string.punctuation)}0-9]", "", text)
    # str.split() without arguments already collapses runs of whitespace, so a
    # single lower/split/join pass also normalizes the spacing.
    return ' '.join(word for word in text.lower().split() if word not in stop_word_set)

# Apply the cleaning function to every review. The result goes into a new
# 'cleaned_body' column, which the lemmatization step reads.
df['cleaned_body'] = df['body'].apply(clean_text)


## Normalization

This cell normalizes the cleaned text by reducing words to their base form using lemmatization.

In [None]:
import nltk
# Request the tokenizer, WordNet and tagger data before the lemmatization cell.
# The same resources are also requested in the first code cell of the notebook.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


In [None]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Single WordNet lemmatizer instance, created once and used by lemmatize_text()
lemmatizer = WordNetLemmatizer()

# Helper function to map POS tag to WordNet format
def get_pos_tag(word):
    """Return the WordNet POS letter ('a', 'v', 'n' or 'r') for a single word.

    The word is tagged on its own with ``pos_tag`` and the first letter of the
    resulting tag selects the WordNet category. Any tag that does not start
    with J, V, N or R falls back to noun.
    """
    wordnet_letter_by_prefix = {
        'J': 'a',  # adjective
        'V': 'v',  # verb
        'N': 'n',  # noun
        'R': 'r',  # adverb
    }
    _, word_tag = pos_tag([word])[0]
    # Unknown prefixes default to noun
    return wordnet_letter_by_prefix.get(word_tag[:1], 'n')

# Lemmatization function
def lemmatize_text(text):
    """Lemmatize every token of ``text`` with POS-aware WordNet lemmatization.

    The whole token list is tagged in one ``pos_tag`` call, so each word is
    tagged with its neighbours as context and the tagger runs once per text
    rather than once per word.

    Args:
        text: Cleaned text to tokenize and lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces; an empty string when
        ``text`` yields no tokens.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return ''
    # First letter of the tag -> WordNet POS letter; anything else is treated
    # as a noun.
    wordnet_letter_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    return ' '.join(
        lemmatizer.lemmatize(word, wordnet_letter_by_prefix.get(tag[:1], 'n'))
        for word, tag in pos_tag(tokens)
    )

# Apply lemmatization to the cleaned text and keep the result in a new
# 'lemmatized_body' column, which the POS-tagging cell reads.
df['lemmatized_body'] = df['cleaned_body'].apply(lemmatize_text)


POS Tagging and Analysis

In [None]:
import spacy

# Load the English NLP model once; the resulting `nlp` pipeline is reused by
# pos_tagging(), extract_sentences_with_pos() and ner_tagging().
nlp = spacy.load("en_core_web_sm")

def pos_tagging(text):
    """Run ``text`` through the spaCy pipeline and return ``(token text, POS tag)`` pairs."""
    tagged_tokens = []
    for token in nlp(text):
        tagged_tokens.append((token.text, token.pos_))
    return tagged_tokens

# Apply POS tagging to the lemmatized text; each row of 'pos_tags' holds the
# list of (token, tag) pairs returned by pos_tagging().
df['pos_tags'] = df['lemmatized_body'].apply(pos_tagging)


### Extract Sentences with Specific POS Tags

This cell extracts and displays sentences containing specific POS tags, such as nouns and verbs.

In [None]:
# Function to extract sentences containing a specific POS tag
def extract_sentences_with_pos(pos, text_column):
    """Collect the texts that contain at least one token tagged ``pos``.

    Args:
        pos: POS tag to look for, compared against each token's ``pos_``
            (for example "NOUN" or "VERB").
        text_column: Iterable of texts; each one is parsed with ``nlp``.

    Returns:
        A list of the matching texts, in their original order.
    """
    def contains_tag(text):
        return any(token.pos_ == pos for token in nlp(text))

    return [text for text in text_column if contains_tag(text)]

# Extract sentences with nouns and verbs.
# The normalized (lemmatized) text produced earlier is stored in
# df['lemmatized_body']; the notebook never creates a 'normalized_text' column.
nouns_sentences = extract_sentences_with_pos("NOUN", df['lemmatized_body'])
verbs_sentences = extract_sentences_with_pos("VERB", df['lemmatized_body'])

# Display sample sentences
print("Sentences containing nouns (First 5):")
print(nouns_sentences[:5])

print("Sentences containing verbs (First 5):")
print(verbs_sentences[:5])


### Visualizations

This cell generates visualizations (bar chart and word cloud) to represent the frequencies of POS tags.

In [None]:
# Count how often each POS tag occurs across all rows.
# df['pos_tags'] holds one list of (token, tag) pairs per row.
pos_counts = Counter(tag for row_tags in df['pos_tags'] for _, tag in row_tags)

# Order the bars from most to least frequent so the most common tags are
# visible at a glance.
ordered_counts = pos_counts.most_common()
tag_labels = [tag for tag, _ in ordered_counts]
tag_frequencies = [count for _, count in ordered_counts]

# Bar chart for POS tag frequencies
plt.figure(figsize=(10, 6))
plt.bar(tag_labels, tag_frequencies, color='skyblue')
plt.title("POS Tag Frequencies")
plt.xlabel("POS Tags")
plt.ylabel("Frequency")
plt.show()

# Word cloud for POS tag frequencies (each tag is sized by its count)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(pos_counts)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


### HMM POS Tagging on First 4 Rows

This cell performs HMM POS tagging on the first four rows of the dataset.

In [None]:
# Import HMM POS Tagger from NLTK
from nltk.tag import hmm
from nltk.corpus import treebank
from nltk.probability import LidstoneProbDist

# Train a supervised HMM tagger on the Penn Treebank sample shipped with NLTK.
nltk.download('treebank')
hmm_tagger = hmm.HiddenMarkovModelTrainer().train_supervised(
    treebank.tagged_sents(),
    # Lidstone smoothing: the default maximum-likelihood estimate assigns zero
    # probability to words that never occur in the training sentences.
    estimator=lambda freq_dist, bins: LidstoneProbDist(freq_dist, 0.1, bins),
)

# Function for HMM POS tagging
def hmm_pos_tagging(text):
    """Tag ``text`` with the HMM tagger trained above.

    Args:
        text: Text to tokenize with ``word_tokenize`` and tag.

    Returns:
        A list of ``(token, tag)`` pairs; an empty list when ``text`` yields
        no tokens.
    """
    tokens = word_tokenize(text)
    if not tokens:
        # Skip decoding entirely for an empty token sequence.
        return []
    return hmm_tagger.tag(tokens)

# Apply HMM POS tagging to the first 4 rows of the lemmatized text
# (the notebook never creates a 'normalized_text' column).
first_four_rows = df['lemmatized_body'][:4].apply(hmm_pos_tagging)
print("HMM POS Tagging for the first 4 rows:")
print(first_four_rows)


### POS and NER Tagging

This cell compares POS and NER tags for selected sentences to identify conflicts or complementarities.

In [None]:
# Function for NER tagging
def ner_tagging(text):
    """Run ``text`` through the spaCy pipeline and return ``(entity text, entity label)`` pairs."""
    entities = []
    for entity in nlp(text).ents:
        entities.append((entity.text, entity.label_))
    return entities

# Analyze POS and NER tags for sample sentences.
# The lemmatized text is stored in df['lemmatized_body']; the notebook never
# creates a 'normalized_text' column.
sample_sentences = df['lemmatized_body'][:5]
# Maps each sample sentence to its POS and NER tags. Rows with identical text
# share a single entry because the sentence itself is the key.
pos_ner_analysis = {}

for sentence in sample_sentences:
    pos_tags = pos_tagging(sentence)
    ner_tags = ner_tagging(sentence)
    pos_ner_analysis[sentence] = {"POS Tags": pos_tags, "NER Tags": ner_tags}

# Display analysis
print("POS and NER Tagging Analysis:")
for sent, analysis in pos_ner_analysis.items():
    print(f"Sentence: {sent}")
    print(f"POS Tags: {analysis['POS Tags']}")
    print(f"NER Tags: {analysis['NER Tags']}")

# Identify conflicting/complementary information:
#   conflicting   = NOUN tokens that are not the text of any named entity
#   complementary = NOUN tokens that are also the text of a named entity
# NOTE(review): entity texts are compared as whole strings, so a noun only
# matches an entity whose full text equals that single token — confirm this is
# the intended comparison for multi-word entities.
print("Conflicting/Complementary Analysis:")
for sent, analysis in pos_ner_analysis.items():
    pos_entities = {token for token, tag in analysis['POS Tags'] if tag == 'NOUN'}
    ner_entities = {entity for entity, label in analysis['NER Tags']}
    conflicting = pos_entities.difference(ner_entities)
    complementary = pos_entities.intersection(ner_entities)
    print(f"Sentence: {sent}")
    print(f"Conflicting Entities: {conflicting}")
    print(f"Complementary Entities: {complementary}")
