# NLIMDb Classifier

###### Import the Necessary Libraries

In [1]:
import nltk
import sqlite3
import sklearn
import pandas as pd
import numpy as np

###### Explore the Structure of the Database

In [2]:
# The Database file must be included in the same directory as the notebook

# Connects to the database
conn = sqlite3.connect('NLIM.db')

c = conn.cursor()

# Selects all the Tables from the database
tables = c.execute('SELECT * FROM sqlite_master where type = \'table\';').fetchall()
for row in tables:
    # Prints the Number of rows in each table
    row_count = c.execute('SELECT count(*) FROM {};'.format(row[2])).fetchone()[0]
    print('The {} table contains {} rows'.format(row[1], row_count))
    
    # Prints the Schema for the database
    print('Schema: {}'.format(row[4]))
    
    print('\n')

The people table contains 571952 rows
Schema: CREATE TABLE people (
	birthday DATE, 
	known_for_department TEXT NOT NULL, 
	deathday TEXT, 
	id INTEGER NOT NULL, 
	name TEXT NOT NULL, 
	gender INTEGER NOT NULL, 
	biography TEXT NOT NULL, 
	popularity FLOAT NOT NULL, 
	place_of_birth TEXT, 
	profile_path TEXT, 
	PRIMARY KEY (id)
)


The movies table contains 406526 rows
Schema: CREATE TABLE movies (
	adult BOOLEAN NOT NULL, 
	backdrop_path TEXT, 
	budget INTEGER NOT NULL, 
	id INTEGER NOT NULL, 
	overview TEXT, 
	popularity FLOAT NOT NULL, 
	poster_path TEXT, 
	release_date DATE, 
	revenue INTEGER NOT NULL, 
	runtime INTEGER, 
	tagline TEXT, 
	title TEXT NOT NULL, 
	vote_average FLOAT NOT NULL, 
	vote_count INTEGER NOT NULL, 
	PRIMARY KEY (id), 
	CHECK (adult IN (0, 1))
)


The movie_series table contains 3012 rows
Schema: CREATE TABLE movie_series (
	id INTEGER NOT NULL, 
	name TEXT NOT NULL, 
	overview TEXT NOT NULL, 
	poster_path TEXT, 
	backdrop_path TEXT, 
	PRIMARY KEY (id)
)


The

###### Explore the Collected Queries

In [3]:
# Reads the CSV of the queries retrieved from the survery
# These queries have been annotated by table-column as classes
df = pd.read_csv("Movie Search Questionnaire.csv")

# Prints the Frequency counts of the different classes
print(df.Class.value_counts())

# Prints the first 10 rows
df.head(10)

unknown                        13
genres-name                    12
people-name                    10
movies-release-date             7
movies-name                     3
movies-popularity               3
movie-series-name               2
people-known-for-department     1
person-name                     1
movies-length                   1
movies-runtime                  1
people-gender                   1
Name: Class, dtype: int64


Unnamed: 0,Query,Class
0,Movie that will make me cry,genres-name
1,Short movie with a good story about a fish,movies-runtime
2,Movie that has theme song let it go,unknown
3,Movies for a girls night in,unknown
4,Movies about spies and con artists,genres-name
5,Movies like Sixteen Candles,movies-name
6,Classic comedy movies,genres-name
7,Classic horror films,genres-name
8,Movie where Dwayne Johnson is climbing a skysc...,unknown
9,Recently released movies,movies-release-date


###### Preprocess Queries

In [4]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import StanfordNERTagger

# Initializes the Word Net Lemmatizer
wnl = nltk.WordNetLemmatizer()

# Initializes the Stanford NER Tagger
# requires for the files described below to be in the same directory as the notebook
st = StanfordNERTagger('stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz',
                       'stanford-ner-2018-10-16/stanford-ner.jar',
                       encoding='utf-8')

    
# The Stanford Named Entitry Recognizer's performance suffers from turning all the text to lower case
# Thus we run it first and lowercase any unrecongnized entities
df['Query'] = df['Query'].apply(lambda x: [t[0].lower() if t[1] == 'O' else t[0] for t in st.tag(word_tokenize(x))])
    
df['Query'] = df['Query'].apply(lambda x: [wnl.lemmatize(t) for t in x])
    
df.head(10)

[nltk_data] Downloading package punkt to /Users/youssefe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/youssefe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/youssefe/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,Query,Class
0,"[movie, that, will, make, me, cry]",genres-name
1,"[short, movie, with, a, good, story, about, a,...",movies-runtime
2,"[movie, that, ha, theme, song, let, it, go]",unknown
3,"[movie, for, a, girl, night, in]",unknown
4,"[movie, about, spy, and, con, artist]",genres-name
5,"[movie, like, sixteen, candle]",movies-name
6,"[classic, comedy, movie]",genres-name
7,"[classic, horror, film]",genres-name
8,"[movie, where, Dwayne, Johnson, is, climbing, ...",unknown
9,"[recently, released, movie]",movies-release-date


#### Explore Classification Methods

###### Naive Bayes

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# This converts the list of words back into space-separated strings
df['Query'] = df['Query'].apply(lambda x: ' '.join(x))

In [6]:
# Frequency counts of words
count_vect = CountVectorizer()  
counts = count_vect.fit_transform(df['Query']) 

# Transforms count into tf-idf
transformer = TfidfTransformer().fit(counts)
counts = transformer.transform(counts)

X_train, X_test, y_train, y_test = train_test_split(counts, df['Class'], test_size=0.1)

model = MultinomialNB().fit(X_train, y_train)
    
predicted = model.predict(X_test)

print(np.mean(predicted == y_test))

0.16666666666666666


###### Naive Bayes with Stop Words Removed

In [7]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Make a copy of the dataframe and remove the stopwords
df_sw = df.copy(True)
df_sw['Query'] = df_sw['Query'].apply(lambda x: ' '.join([t for t in word_tokenize(x) if t not in stop_words]))

# Frequency counts of words
count_vect = CountVectorizer()  
counts = count_vect.fit_transform(df_sw['Query']) 

# Transforms count into tf-idf
transformer = TfidfTransformer().fit(counts)
counts = transformer.transform(counts)

X_train, X_test, y_train, y_test = train_test_split(counts, df_sw['Class'], test_size=0.1)

model = MultinomialNB().fit(X_train, y_train)
        
predicted = model.predict(X_test)

print(np.mean(predicted == y_test))

0.5


###### Support Vector Machine

In [19]:
from sklearn.linear_model import SGDClassifier

# Frequency counts of words
count_vect = CountVectorizer()  
counts = count_vect.fit_transform(df['Query']) 

# Transforms count into tf-idf
transformer = TfidfTransformer().fit(counts)
counts = transformer.transform(counts)

X_train, X_test, y_train, y_test = train_test_split(counts, df['Class'], test_size=0.1)

model = SGDClassifier().fit(X_train, y_train)
    
predicted = model.predict(X_test)

print(np.mean(predicted == y_test))

0.5
