# IMDb reviews

In [1]:
import string
import re
import math
import collections
import random

FILENAME = './datasets/IMDb_reviews.txt'
NUMBER_OF_DOCUMENTS = -1

"""
Goal: obtaining the following data structure

|===============|
|==> reviews <==|
|===============|

[
    {
        'text': "plain text of rvw",  # string
        'is_positive': True,          # bool
        'tokens': [                   # list of strings
                    't1',
                    't1',
                    't2',
                    ...
                  ]
        'tf':     {                   # dictionary | key:token | val:occurrence
                    't1': 2,
                    't2': 1,
                    ...
                  }
        'tf_idf': {                   # dictionary | key:token | val:weight
                    't1': 3.2,
                    't2': 6.2,
                    ...
                  }
    },
    ...
]

|==============| All tokens
|==> tokens <==| of all documents
|==============| without duplicates

[
    'token_1',
    'token_2',
    'token_3',
    ...
]

|==========================| Number of document in which
|==> document_frequency <==| a given token
|==========================| appears at least once

{
    'token_1': 2,
    'token_2': 1,
    'token_3': 19,
    ...
}

|==================================| Helps to find significance
|==> inverse_document_frequency <==| of a token among the entire
|==================================| collection of documents
{
    'token_1': 10.15,
    'token_2': 11.32,
    'token_3': 4.68,
    ...
}

"""



### 1. Load the dataset

In [2]:
with open(FILENAME, encoding="utf8") as f:
    # Ignore the first line
    f.readline()

    # Initialize the reviews list
    reviews = []
    
    # Each row in the file contains a review and an integer (1: positive, 0: negative)
    for review in f:
        try:
            review_text_match = re.findall(r'(.+),(\d)', review)
            review_text = review_text_match[0][0].replace('""', '"') 

            reviews.append({
                'text': review_text,
                'is_positive': (review_text_match[0][1] == '1')
            })
        except:
            print('>>ERROR with following review:')
            print(review)


NUMBER_OF_DOCUMENTS = len(reviews)

### 2. Tokenize

In [3]:
# Add 'tokens' list to each review

def getTokensFromDocument(document):
    """
    Compute the tokens for a given document
    
    Input: a string. It is a document to tokenize
    Output: a list. It contains the tokens of the relative document
    """

    for punct in string.punctuation:
        document = document.replace(punct, " ")
    tokens = [ token.lower() for token in document.split(" ") if token ]
    
    return tokens

for review in reviews:
    review['tokens'] = getTokensFromDocument(review['text'])

### 3. Term Frequency (TF)

In [4]:
# Add 'tf' dictionary to each review

def getTF(tokens):
    """Get the Time Frequency for each token"""
    tf = {}
    for token in tokens:
        if token in tf:
            tf[token] += 1
        else:
            tf[token] = 1
    return tf

for review in reviews:
    review['tf'] = getTF(review['tokens'])

### 4. Document Frequency (DF) and Inverse DF (IDF)

In [5]:
"""
N: number of documents
DF(t): document frequency of a token (the number of documents in which t appears at least once)

IDF(t) = log( N / DF(t) )
"""

'\nN: number of documents\nDF(t): document frequency of a token (the number of documents in which t appears at least once)\n\nIDF(t) = log( N / DF(t) )\n'

#### 4.a. Document Frequency

In [6]:
def getDFDictionary(documents):
    """
    Input: a list of documents. It is required to have 'tf' in the document
    Output: a dictionary. It contains token as keys and DF(token) as value
    """
    DF_dictionary = {}
    
    # Foreach document
    for document in documents:
        time_frequency = document['tf']
        
        # Foreach token in the time_frequency dictionary
        for token in time_frequency:
            
            # If the current token was already added before
            if token in DF_dictionary:
                DF_dictionary[token] += 1
                
            else:
                DF_dictionary[token] = 1
                
    return DF_dictionary

# DF
"""
To get the DF of a token:
    document_frequency['token_string']
"""
document_frequency = getDFDictionary(reviews)

#### 4.b. Inverse Document Frequency

In [7]:
def getIDFDictionary(DF_dictionary, N):
    """
    Input: a dictionary (DF), an integer (number of documents)
    Output: a dictionary. It contains token as keys and IDF(token) as value
    """
    IDF_dictionary = {}
    
    # Foreach [key:token, val:DF(token)] in the DF dictionary
    for token, DF_t in DF_dictionary.items():
        IDF_dictionary[token] = math.log(N / DF_t)
    
    return IDF_dictionary

# IDF
"""
To get the IDF of a token:
    inverse_document_frequency['token_string']
"""
inverse_document_frequency = getIDFDictionary(document_frequency, NUMBER_OF_DOCUMENTS)

#### 4.c. Sorted IDF

In [8]:
# 10 lowest IDF
asc_sorted_IDF = sorted(inverse_document_frequency.items(), key=lambda kv: kv[1])

print('10 tokens with the lowest IDF (most common ones):')
for token in asc_sorted_IDF[:10]:
    print(" > " + token[0] + "\t\t- IDF: " + str(token[1]))
    
# 10 highest IDF
desc_10_sorted_IDF = asc_sorted_IDF[-10:]
desc_10_sorted_IDF.reverse()

print('10 tokens with the highest IDF (most uncommon ones):')
for token in desc_10_sorted_IDF:
    print(" > " + token[0] + "\t\t- IDF: " + str(token[1]))

10 tokens with the lowest IDF (most common ones):
 > the		- IDF: 0.008314469604085238
 > a		- IDF: 0.03351541933781697
 > and		- IDF: 0.03401190259170586
 > of		- IDF: 0.05226218466281087
 > to		- IDF: 0.06293979977387414
 > this		- IDF: 0.09924591465797242
 > is		- IDF: 0.1086102347240488
 > it		- IDF: 0.11536595914077863
 > in		- IDF: 0.12606221366364628
 > that		- IDF: 0.20722099077039452
10 tokens with the highest IDF (most uncommon ones):
 > capiche		- IDF: 10.126631103850338
 > camora		- IDF: 10.126631103850338
 > jowls		- IDF: 10.126631103850338
 > repleat		- IDF: 10.126631103850338
 > jayden		- IDF: 10.126631103850338
 > imy		- IDF: 10.126631103850338
 > orientalist		- IDF: 10.126631103850338
 > rouÃ©		- IDF: 10.126631103850338
 > infantalising		- IDF: 10.126631103850338
 > ant1		- IDF: 10.126631103850338


### 5. TF-IDF

In [9]:
for review in reviews:
    # Dictionary with tokens of the current document as key and TF-IDF for that token in this document as value
    current_TF_IDF = {}
    
    for token in review['tokens']:
        # Frequency of the current token in the current document
        current_TF = review['tf'][token]
        
        # IDF of the current token
        current_IDF = inverse_document_frequency[token]
        
        # Get the current TF-IDF(token, document) value
        current_TF_IDF[token] = current_TF * current_IDF
    
    review['tf_idf'] = current_TF_IDF

### 6. Sentiment analysis

In [10]:
def norm(d):
    """Compute the L2-norm of a vector representation"""
    return sum([tf_idf ** 2 for t, tf_idf in d.items() ])**.5

def dot_product(d1, d2):
    """Compute the dot product between two vector representations"""
    word_set = set(list(d1.keys()) + list(d2.keys()))
    return sum([ (d1.get(d, 0.0) * d2.get(d, 0.0)) for d in word_set ])

def cosine_similarity(d1, d2):
    """
    Compute the cosine similarity between documents d1 and d2.
    
    Input: two dictionaries representing the TF-IDF vectors for documents d1 and d2.
    Output: the cosine similarity.
    """
    return dot_product(d1, d2) / (norm(d1) * norm(d2))

In [11]:
def identifyLabel(test_document, include_similarity=False):
    # Compute the avg similarity by dividing the sum by the number of elements
    positive_similarity_sum = 0
    positive_similarity_cnt = 0

    negative_similarity_sum = 0
    negative_similarity_cnt = 0

    for review in reviews:

        # Do not consider the test document (it would be too easy)
        if test_document != review:
            current_tf_idf = review['tf_idf']

            if(review['is_positive']):
                positive_similarity_sum += cosine_similarity(test_tf_idf, current_tf_idf)
                positive_similarity_cnt += 1

            else:
                negative_similarity_sum += cosine_similarity(test_tf_idf, current_tf_idf)
                negative_similarity_cnt += 1

    # Compute the mean
    avg_positive_similarity = positive_similarity_sum / positive_similarity_cnt
    avg_negative_similarity = negative_similarity_sum / negative_similarity_cnt

    is_positive = (avg_positive_similarity > avg_negative_similarity)
    
    if include_similarity:
        return is_positive, avg_positive_similarity, avg_negative_similarity
    else:
        return is_positive


# Select a random review - we'll call it "test document"
test_document = reviews[random.randint(0, len(reviews)-1)]
test_tf_idf = test_document['tf_idf']

identified_positive, avg_positive_similarity, avg_negative_similarity = identifyLabel(test_document, True)

print("Positive similarity:", avg_positive_similarity)
print("Negative similarity:", avg_negative_similarity)

if(identified_positive):
    identified_label = 'positive'
else:
    identified_label = 'negative'

print("\n")
print("The super iper advanced artificial intelligence powered by deep learning and machine learning, working on a blockchain and analyzing big data says that the review was " + identified_label)

if(test_document['is_positive']):
    real_label = 'positive'
else:
    real_label = 'negative'
    
print("\n")
print("The truth is that it was " + real_label)

print("\n")
if(identified_label == real_label):
    print("Yeah, the AI is extreamly smart!")
else:
    print("Oh... maybe the truth is wrong?!?")

Positive similarity: 0.014762028785759492
Negative similarity: 0.013608276884642494


The super iper advanced artificial intelligence powered by deep learning and machine learning, working on a blockchain and analyzing big data says that the review was positive


The truth is that it was positive


Yeah, the AI is extreamly smart!


### Extra

To how many reviews would the correct label be assigned?

In [None]:
# Calculate the number of correct and wrong label assignations on all 25000 reviews
correct_cnt = 0
wrong_cnt = 0

i = 1

for review in reviews:
    
    identified_is_positive = identifyLabel(review)
    real_is_positive = review['is_positive']
    
    if(identified_is_positive == real_is_positive):
        correct_cnt += 1
    else:
        wrong_cnt += 1
    
    print(str(i) + " of " + str(NUMBER_OF_DOCUMENTS) + "\t" + str(correct_cnt) + " OK, " + str(wrong_cnt) + " KO")
    i += 1
    
print("Correct assignations: " + str(correct_cnt))
print("Wrong assignations: " + str(wrong_cnt))

1 of 25000	1 OK, 0 KO
2 of 25000	1 OK, 1 KO
3 of 25000	2 OK, 1 KO
4 of 25000	2 OK, 2 KO
5 of 25000	3 OK, 2 KO
6 of 25000	3 OK, 3 KO
7 of 25000	4 OK, 3 KO
8 of 25000	4 OK, 4 KO
9 of 25000	5 OK, 4 KO
10 of 25000	5 OK, 5 KO
11 of 25000	6 OK, 5 KO
12 of 25000	6 OK, 6 KO
13 of 25000	7 OK, 6 KO
14 of 25000	7 OK, 7 KO
15 of 25000	8 OK, 7 KO
16 of 25000	8 OK, 8 KO
17 of 25000	9 OK, 8 KO
