# TP2, NLP

In [2]:
## Training a unigram part-of-speech tagger

In [3]:
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
treebank.sents()[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [4]:
tagger.tag(treebank.sents()[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

--------------------------------------------------------------------------------------------------------------------------------

## Practical work

In [7]:
## Extraction of the files

In [51]:
import tarfile

# Extract review_polarity.tar.gz
with tarfile.open(r'C:\Users\Khadi\OneDrive\Documents\ESILV\A5\NLP\TP2_NLP\review_polarity.tar.gz', 'r:gz') as tar:
    tar.extractall()
    print("Files are extracted")

Files are extracted


In [10]:
## 1.2 Load and Pre-process the Data

In [53]:
import os

def load_reviews(path):
    reviews = []
    labels = []
    for label in ['pos', 'neg']:
        directory = f"{path}/{label}"
        for filename in os.listdir(directory):
            if filename.endswith('.txt'):
                with open(f"{directory}/{filename}", 'r', encoding='utf-8') as file:
                    reviews.append(file.read())
                    labels.append(1 if label == 'pos' else 0)
    return reviews, labels

# Assuming the data is extracted to "txt_sentoken"
reviews, labels = load_reviews('txt_sentoken')

In [11]:
# 2. POS Tagging and Sentiment Analysis

In [12]:
## 2.1 Train a Unigram POS Tagger

In [13]:
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)

In [14]:
## 2.2 POS-Tagging for Reviews

In [54]:
tagged_reviews = [tagger.tag(nltk.word_tokenize(review)) for review in reviews]

In [16]:
## 2.3 Sentiment Analysis with SentiWordNet

In [55]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet

# def get_sentiment_score(tagged_review):
#     score = 0
#     for word, tag in tagged_review:
#         wn_tag = get_wordnet_pos(tag)
#         if wn_tag not in (wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB):
#             continue
#         lemma = wordnet.morphy(word, wn_tag)
#         if not lemma:
#             continue
#         synsets = wordnet.synsets(lemma, pos=wn_tag)
#         if not synsets:
#             continue
#         synset = synsets[0]
#         swn_synset = swn.senti_synset(synset.name())
#         score += swn_synset.pos_score() - swn_synset.neg_score()
#     return score

def get_sentiment_score(tagged_review):
    score = 0
    for word, tag in tagged_review:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag is None:
            continue
        lemma = wordnet.morphy(word, wn_tag)
        if not lemma:
            continue
        synsets = wordnet.synsets(lemma, pos=wn_tag)
        if not synsets:
            continue
        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())
        print(f"Word: {lemma}, Pos Score: {swn_synset.pos_score()}, Neg Score: {swn_synset.neg_score()}")
        score += swn_synset.pos_score() - swn_synset.neg_score()
    return score

def get_wordnet_pos(treebank_tag):
    if treebank_tag is None:
        return None
    
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


# scores = [get_sentiment_score(review) for review in tagged_reviews]


In [20]:
# 3. Classification

In [56]:
classified_labels = ['Positive' if score > 0 else 'Negative' for score in scores]

In [None]:
# 4. Evaluation (Optional)

In [35]:
# print(len(classified_labels))
# print(len(labels))

In [40]:
# labels

In [41]:
# classified_labels

In [57]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(labels, classified_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.0


  score = y_true == y_pred


### Machine Learning part

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Example with Naive Bayes Classifier
def train_and_evaluate_model(reviews, labels):
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(reviews)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)

    # Train the model
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # Test the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

# Example usage:
reviews = ["This is a great movie", "This is a bad movie"]
labels = ["Positive", "Negative"]
train_and_evaluate_model(reviews, labels)

Accuracy: 0.0


In [60]:
## Why do we get an accuracy of 0.0 ?

In [61]:
# Print some diagnostics information
for i, review in enumerate(reviews[:10]):  # Check the first 10 reviews
    print(f"Review {i+1}:")
    print(f"Text: {review}")
    print(f"True label: {labels[i]}")
    print(f"Predicted label: {classified_labels[i]}")
    print(f"Sentiment score: {scores[i]}")
    print("-------------")

Review 1:
Text: This is a great movie
True label: Positive
Predicted label: Positive
Sentiment score: 7.125
-------------
Review 2:
Text: This is a bad movie
True label: Negative
Predicted label: Positive
Sentiment score: 7.375
-------------
