In [60]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer, SnowballStemmer
import re 

In [61]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bahar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [62]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bahar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bahar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load Data

The IMDb dataset consists of movie reviews from the IMDb website, each labeled with a sentiment score indicating whether the review is positive (1) or negative (0). This sentiment score helps in sentiment analysis tasks such as determining the overall opinion expressed in the review.

The variable `data` comprises a list of dictionaries, each containing a `review` key with the movie's textual review and a `sentiment` key indicating whether the review is positive (1) or negative (0).

In [64]:
# Load CSV data into a pandas DataFrame
df = pd.read_csv('data/imdb_sentiment.csv')
# Convert the DataFrame to a list of dictionaries
data = df.to_dict(orient='records')
# Print the 5 first records
data[:5]

  'sentiment': 0},
 {'review': '"Footlight Parade" is just one of several wonderfully jaunty musicals that Warner Bros. produced in the early 1930\'s to ward off the Depression. "42nd Street" and the Golddiggers series were also produced during this era, and they made literally, millions of Americans forget their troubles for a little while, and enjoy themselves.While most of the films produced had the great talents of Joan Blondell, Ruby Keeler, and Dick Powell, only Foolight Parade had the incomparable James Cagney. Almost ten years prior to his most well-known musical, "Yankee Doodle Dandy". Here he dances in that most original of dance styles, with his arms usually lowered at his side, and his legs doing all types of undulations and kicks. It\'s easy to see that he is enjoying himself, and that makes us enjoy him all the more.While almost all of the musical sequences appear at the end of the film, they are well worth the wait. I believe that this film was made just prior to the ins

# Data Analysis

In [65]:
# How many reviews in total?
len(data)

1000

In [66]:
# TODO: How many positive and negative reviews?
positive_count = 0
negative_count = 0

# Count positive and negative reviews
for d in data:
    if d['sentiment'] == 1:
        positive_count += 1
    elif d['sentiment'] == 0:
        negative_count += 1

print('Number of positive reviews:', positive_count)
print('Number of negative reviews:', negative_count)

Number of positive reviews: 481
Number of negative reviews: 519


## Exercise 1: Text Preprocessing

To simplify the reviews' text, apply preprocessing techniques:
* **Tokenization**: Tokenize the text. It may be good to remove punctuation before or after the tokenization. You can use `string.punctuation` for this purpose, which contains a set of punctuation characters.
* Either **stemming** or **lemmatization**: Choose one, not both, as using both could lead to redundancy; to decide, experiment with both and select the method that better suits your needs.
* Remove **stop words**.

In [67]:
def tokenize(text):
    # Tokenize and remove punctuation
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

In [68]:
def filter_stop_words(tokens):
    # Get the list over stopwords fra NLTK
    stop_words = set(stopwords.words('english'))
    
    # Filtre stopwords from the tokenlist
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    return filtered_tokens

In [69]:
def stem(tokens):
    # Initialiser one PorterStemmer
    stemmer = PorterStemmer()
    
    # Perform stemming on each token
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    
    return stemmed_tokens


In [70]:
def lemmatize(tokens):
    # Initialize WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return lemmatized_tokens


In [91]:
def process_text(text):
    # Tokenize the text
    tokens = tokenize(text.lower())
    
    # Remove punctuation and non-alphabetic characters
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens if token.isalpha()]
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem or lemmatize the tokens
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return stemmed_tokens, lemmatized_tokens


For each element in the `data` list, process the review's text and store the result in a new key called `tokens`. It may take a few seconds to do all the processing:

In [72]:
# Process reviews' text
for d in data:
    review_text = d.get('review', '')  # Retrieve the review text, default to empty string if key is missing
    tokens = process_text(review_text)  # Process the review text
    d['tokens'] = tokens  # Assign the processed tokens to a new key 'tokens' in the dictionary


## Exercise 2: Predict Positive or Negative Review

Develop a simple rule-based model that predicts whether a review is positive or negative based on the total number of positive and negative words in the review.

For example, if the review contains more positive words than negative words, predict that the review is positive.

In [73]:
# Define empty lists for negative and positive words
negative_words = []
positive_words = []

# Add more words to the lists
negative_words.extend([
    'horrible',
    'terrible',
    'disappointing',
    'bad'
    'mad'
])

positive_words.extend([
    'amazing',
    'great',
    'wonderful',
    'awesome'
    'nice'
])

# Apply stemming to the words
negative_words = stem(negative_words)
positive_words = stem(positive_words)


In [74]:
def count_words(tokens, words):
    '''
    Given a list of tokens and a list of words, return the total number of words
    from the `words` list that appear in the `tokens` list.

    For example:
        tokens = ['the', 'good', 'amazing', 'movie', 'was', 'good']
        words = ['good', 'amazing']
        returns: 3 ('good' appears twice, and 'amazing' appears once in the `tokens` list)
    '''
   # Initialize a counter for the total number of words
    count = 0
    
    # Iterate through each token in the tokens list
    for token in tokens:
        # If the token is in the words list, increment the count
        if token in words:
            count += 1
    
    return count

In [75]:
# Predict based on word count
for d in data:
    d['tokens'] = process_text(d['review'])

# Predict based on word count
for d in data:
    n_positive_words = count_words(d['tokens'], positive_words)
    n_negative_words = count_words(d['tokens'], negative_words)
    
    # Create a new key in `d` called 'prediction'
    if n_positive_words > n_negative_words:
        d['prediction'] = 1
    elif n_negative_words > n_positive_words:
        d['prediction'] = 0
    else:
        d['prediction'] = None

# Predicted sentiments
preds = [d['prediction'] for d in data]
# Real sentiments
real = [d['sentiment'] for d in data]

In [76]:
# Percentage of unknown predictions (i.e., when the prediction is None)
print(f'Unknown predictions: {sum([1 for p in preds if p is None]) / len(preds):.2%}')

# Accuracy of the known predictions, that is, the percentage of predictions that are correct
acc = sum([1 for i in range(len(preds)) if preds[i] is not None and preds[i] == real[i]]) / len(preds)
print(f'Accuracy: {acc:.2%}')

Unknown predictions: 100.00%
Accuracy: 0.00%


## Exercise 3: Most Common Adjectives

Get the most common adjectives in the positive and negative reviews.

In [77]:
from collections import Counter

In [78]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bahar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [79]:
# Lists of tokens for positive and negative reviews
positive_reviews = [d['tokens'] for d in data if d['sentiment'] == 1]
negative_reviews = [d['tokens'] for d in data if d['sentiment'] == 0]

In [80]:
from nltk import pos_tag, word_tokenize
from collections import Counter
def get_most_common_adjectives(reviews, n=10):
    adjs = []

    # For each review, get the adjectives and add them to the `adjs` list
    for r in reviews:
        tokens = word_tokenize(r)
        tagged_tokens = pos_tag(tokens)
        for word, tag in tagged_tokens:
            if tag.startswith('JJ'):  # JJ tags indicate adjectives
                adjs.append(word)
    
    # Count the number of occurrences and return the most common adjectives
    counts = Counter(adjs)
    return counts.most_common(n)

In [81]:
# Print the most common adjectives in positive and negative reviews
from nltk.tokenize import word_tokenize

def get_most_common_adjectives(reviews, n=10):
    adjs = []

    # For each review, get the adjectives and add them to the `adjs` list
    for r in reviews:
        # Ensure that each review is a string
        review_text = str(r)
        
        tokens = word_tokenize(review_text)
        tagged_tokens = pos_tag(tokens)
        for word, tag in tagged_tokens:
            if tag.startswith('JJ'):  # JJ tags indicate adjectives
                adjs.append(word)
    
    # Count the number of occurrences and return the most common adjectives
    counts = Counter(adjs)
    return counts.most_common(n)

pos_adjs = get_most_common_adjectives(positive_reviews)
neg_adjs = get_most_common_adjectives(negative_reviews)

print('Most common adjectives in positive reviews:')
print(pos_adjs)
print()
print('Most common adjectives in negative reviews:')
print(neg_adjs)

Most common adjectives in positive reviews:
[('[', 439), ("'much", 364), ("'best", 311), ("'interest", 109), ("'actual", 102), ('u', 93), ("'american", 91), ("'touch", 55), ("'usual", 46), ("'rest", 46)]

Most common adjectives in negative reviews:
[('[', 454), ("'much", 424), ("'best", 183), ("'actual", 126), ("'interest", 126), ("'rest", 108), ("'american", 98), ('u', 83), ("'obvious", 54), ("'terrible", 53)]


## Exercise 4: Most Common Nouns

Similar to the previous exercise, but instead of adjectives identify the most common nouns in positive and negative reviews.

Are the nouns similar between positive and negative reviews? Why do you think this is the case?

Considerations (there is no need to implement the following, it is just food for thought):
- If there are substantial differences, it might be beneficial to incorporate these nouns into the `positive_words` and `negative_words` lists.
- Conversely, if the nouns are largely consistent, it might be good and more efficient to exclude them from the tokenization process, as they may not significantly contribute to sentiment analysis.
    - One option for facilitating this process is to use TF-IDF scoring, as tokens with lower scores are less informative due to their prevalence across both positive and negative reviews.

In [82]:
def get_most_common_nouns(reviews, n=10):
    nouns = []

    # For each review, get the nouns and add them to the `nouns` list
    for review in reviews:
        review_text = str(review)  # Ensure review is a string
        tokens = word_tokenize(review_text)
        tagged_tokens = pos_tag(tokens)
        for word, tag in tagged_tokens:
            if tag.startswith('NN'):  # NN tags indicate nouns
                nouns.append(word)
    
    # Count the number of occurrences and return the most common nouns
    counts = Counter(nouns)
    return counts.most_common(n)

# Get the most common nouns in positive and negative reviews
pos_nouns = get_most_common_nouns(positive_reviews)
neg_nouns = get_most_common_nouns(negative_reviews)

# Print the most common nouns in positive reviews
print('Most common nouns in positive reviews:')
for noun, count in pos_nouns:
    print(noun, ':', count)

print()

# Print the most common nouns in negative reviews
print('Most common nouns in negative reviews:')
for noun, count in neg_nouns:
    print(noun, ':', count)


Most common nouns in positive reviews:
'film : 1819
] : 962
'movi : 790
'movie : 790
'time : 571
'good : 563
'great : 557
'see : 478
'get : 457
'well : 436

Most common nouns in negative reviews:
'film : 1755
'movi : 1143
'movie : 1143
] : 1038
'good : 636
'time : 622
'bad : 622
'get : 606
'make : 559
'see : 537


## Exercise 5: CountVectorizer

Use the `CountVectorizer` class from scikit-learn to convert the reviews into a matrix of token counts, so each review is represented by a vector of the count of each token.

In [83]:
from sklearn.feature_extraction.text import CountVectorizer

In [84]:
reviews = [d['review'] for d in data]

In [85]:
# Create a CountVectorizer, fit and transform it based on the `reviews` list
vectorizer = CountVectorizer()

# Fit and transform the CountVectorizer based on the reviews
X = vectorizer.fit_transform(reviews)

# Print the vocabulary learned by the CountVectorizer
print("Vocabulary:", vectorizer.get_feature_names_out())

Vocabulary: ['00' '000' '007' ... 'émigrés' 'était' 'étoile']


## Machine Learning Example

Based on the previous vectorization based on `CountVectorizer`, we can use the vectors to train a machine learning model.

For example, we can use the computed `vectors` as input features and the `sentiment` as the target variable (that is the variable we want to predict).

In [86]:
from sklearn.linear_model import LogisticRegression

Separate into training and test sets.

- Training data is used for the model to learn from.
- Test data is used to evaluate the trained model's performance.

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming you have 'reviews' as your list of reviews
reviews = [d['review'] for d in data]

# Initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the data to obtain TF-IDF vectors
vectors = tfidf_vectorizer.fit_transform(reviews)

# Split the vectors into training and test sets
train_vectors = vectors[:700]
test_vectors = vectors[700:]

# Extract the sentiment labels from the data for both the training and test sets
train_sentiments = [d['sentiment'] for d in data[:700]]
test_sentiments = [d['sentiment'] for d in data[700:]]


In [88]:
# Train the logistic regression model with the training data
# This creates a mapping between the vectors and the sentiment labels
lr = LogisticRegression(max_iter=1000)
lr.fit(train_vectors, train_sentiments)

In [89]:
# Use the trained model to predict the sentiment of the test data
preds = lr.predict(test_vectors)

In [90]:
# Print the accuracy of the model
acc = sum([1 for i in range(len(preds)) if preds[i] == test_sentiments[i]]) / len(preds)
print(f'Accuracy: {acc:.2%}')

Accuracy: 79.00%
