# Q3

*   We will develop a NER system specific to the category of names of the top 1000 movie titles from IMDB.

*   We will evaluate the system on a collection of text likely to contain instances of these named entities.

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import re
import csv
import math
import nltk
nltk.download('brown')
nltk.download('movie_reviews')
from nltk.corpus import brown, movie_reviews
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [19]:
import pandas as pd
import numpy as np

In [21]:
def get_top_1000_list():
    """
    Function to extract movie titles from a IMDB-top-1000.csv file.

    Returns:
        list: A list of unique titles of the top 1000 movies
    """
    file_name = 'data/IMDB-top-1000.csv'

    df = pd.read_csv(file_name)

    collected_titles = df['movie name\r\n']

    return collected_titles.to_numpy()

In [75]:
def label_BIO(_tokens, _NE):
    """
    Generates BIO (Beginning, Inside, Outside) tags for movie titles in the given tokens.

    Args:
        _tokens (list): List of tokens representing words in a sentence.
        _NE (list): List of named entities, where each entity is represented as a list of tokens.

    Returns:
        list: List of tuples containing tokens and their corresponding BIO tags.

    Comments:
        - This function searches for movie titles in the tokens and labels them using BIO notation.
        - A movie title is considered to be a named entity, where the first word is labeled as 'B-MOV'
          (Beginning of a movie title) and subsequent words are labeled as 'I-MOV' (Inside a movie title).
        - Non-movie title tokens are labeled as 'O' (Outside any named entity).
        - The function iterates through each token in the tokens list, searching for matches in the named entity list.
          If a match is found, the corresponding tokens are labeled accordingly in the BIO format.
        - It returns a list of tuples, each containing a token and its corresponding BIO tag.
    """
    BIO_for_samples = []

    i = 0
    while i < len(_tokens):
        movie_found = False
        for ne in _NE:
            ne_tokens = word_tokenize(ne)
            if ne_tokens[0] == tokens[i]:
                movie = [tokens[i]]
                for j in range(1, len(ne_tokens)):
                    if tokens[i+j] == ne_tokens[j]:
                        movie.append(tokens[i+j])
                if len(movie) == len(ne_tokens) and not tokens[i+len(ne_tokens)][0].isupper():
                    BIO_for_samples.append((movie[0], 'B-MOV'))
                    for part in ne_tokens[1:]:
                        BIO_for_samples.append((part, 'I-MOV'))
                    movie_found = True
            if movie_found:
                break
        if not movie_found:
            BIO_for_samples.append((tokens[i], 'O-MOV'))
            i+=1
        else:
            print('Movie Found! Title:', ' '.join(movie))
            i += len(ne_tokens)

    return BIO_for_samples

In [72]:
# Don't change this cell
def print_BIO_res(_BIO):
    for i in range(len(_BIO)):
        if _BIO[i][1] == 'B-MOV':
            for j in range(i - 7, i + 7):
                if _BIO[j][1] == 'O':
                    print(_BIO[j][0], end=" ")
                else:
                    print(_BIO[j], end=" ")
            print("")

In [23]:
# Don't change this cell
def get_data_from_file(_fn):
    with open(_fn, 'r') as file:
        data = file.read().replace('\n', ' ')
    return data

In [76]:
titles_top_1000 = get_top_1000_list()

# get text data from a text file
data = get_data_from_file("data/article-about-a-genre.txt")
# tokenize text data
tokens = word_tokenize(data)
# tag with BIO using the IMDB top 1000 movie title list
BIO = label_BIO(tokens, titles_top_1000)

print_BIO_res(BIO)

Movie Found! Title: The Matrix
Movie Found! Title: Crouching Tiger , Hidden Dragon
Movie Found! Title: Hero
Movie Found! Title: Kung Fu Hustle
Movie Found! Title: Ip Man
('to', 'O-MOV') ('find', 'O-MOV') ('its', 'O-MOV') ('way', 'O-MOV') ('into', 'O-MOV') ('hits', 'O-MOV') ('like', 'O-MOV') ('The', 'B-MOV') ('Matrix', 'I-MOV') ('(', 'O-MOV') ('1999', 'O-MOV') (')', 'O-MOV') ('and', 'O-MOV') ('Kill', 'O-MOV') 
('.', 'O-MOV') ('In', 'O-MOV') ('2000', 'O-MOV') (',', 'O-MOV') ('the', 'O-MOV') ('Chinese', 'O-MOV') ('blockbuster', 'O-MOV') ('Crouching', 'B-MOV') ('Tiger', 'I-MOV') (',', 'I-MOV') ('Hidden', 'I-MOV') ('Dragon', 'I-MOV') ('showed', 'O-MOV') ('modern', 'O-MOV') 
('the', 'O-MOV') ('trend', 'O-MOV') ('.', 'O-MOV') ('Jet', 'O-MOV') ('Li', 'O-MOV') ('’', 'O-MOV') ('s', 'O-MOV') ('Hero', 'B-MOV') ('(', 'O-MOV') ('2002', 'O-MOV') (')', 'O-MOV') ('and', 'O-MOV') ('Fearless', 'O-MOV') ('(', 'O-MOV') 
('comedies', 'O-MOV') ('Shaolin', 'O-MOV') ('Soccer', 'O-MOV') ('(', 'O-MOV') ('2001', 