In [9]:
import nltk
import numpy as np
import random
import string # to process standard python strings
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import re

# NLTK data used by this notebook: the 'stopwords' corpus (loaded right here),
# WordNet for the lemmatizer, and a tokenizer model for nltk.word_tokenize.
# If any of them is missing NLTK raises LookupError; run nltk.download() once.
lemmer = WordNetLemmatizer()
porterStemmer = nltk.stem.PorterStemmer()
stop_words = set(stopwords.words('english'))
 

In [10]:
def LemTokens(tokens):
    """Porter-stem each token and then lemmatize the resulting stem.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to normalise.

    Returns
    -------
    list of str
        One normalised token per input token, in input order.
    """
    return [lemmer.lemmatize(porterStemmer.stem(word)) for word in tokens]
# str.translate table that deletes every character in string.punctuation.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}


def LemNormalize(text):
    """Turn raw text into a list of normalised tokens.

    Steps, in order: lower-case, delete punctuation, tokenize with
    ``nltk.word_tokenize``, drop tokens found in ``stop_words``, then pass the
    survivors through ``LemTokens``.

    Parameters
    ----------
    text : str
        Raw text (a movie overview or a user query).

    Returns
    -------
    list of str
        Normalised tokens as returned by ``LemTokens``.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    kept = [tok for tok in nltk.word_tokenize(cleaned) if tok not in stop_words]
    return LemTokens(kept)



def _load_genre_words(filepath):
    """Read a genre vocabulary file into a set of lemmatized terms.

    Each line of the file is treated as one term: trailing whitespace
    (including the newline) is removed and the remainder is lemmatized with
    ``lemmer``. Lines that are empty after stripping are skipped so that the
    empty string never becomes a vocabulary entry.

    Parameters
    ----------
    filepath : str
        Path of the text file to read.

    Returns
    -------
    set of str
        The distinct lemmatized terms found in the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(filepath) as fp:
        stripped_lines = (line.rstrip() for line in fp)
        return {lemmer.lemmatize(term) for term in stripped_lines if term}


# Files are read in the same order as before, so a missing file is reported
# at the same point of the run.
scifi = _load_genre_words("scifi.txt")
crime = _load_genre_words("crime.txt")
drama = _load_genre_words("drama.txt")
horr = _load_genre_words("horror.txt")
com = _load_genre_words("comedy.txt")
        

def _contains_word_features(words):
    """Return the feature set ``{'contains-word(<w>)': True}`` for every ``w`` in ``words``."""
    return {'contains-word(%s)' % w: True for w in words}


# One labelled example per genre: the example's features are the whole
# vocabulary loaded for that genre.
train = [
    (_contains_word_features(com), 'comedy'),
    (_contains_word_features(horr), 'horror'),
    (_contains_word_features(crime), 'crime'),
    (_contains_word_features(drama), 'drama'),
    (_contains_word_features(scifi), 'scifi'),
]

# Small hand-written feature sets for spot checks, e.g.
# classifier.classify_many(test) or classifier.prob_classify_many(test).
test = [
    _contains_word_features(["art", "drollery"]),
    _contains_word_features(["scary", "danger"]),
    _contains_word_features(["gun", "police"]),
]

classifier = nltk.NaiveBayesClassifier.train(train)

# Later cells query the classifier by these exact label names, so fail early
# if the trained label set is not what they expect.
assert sorted(classifier.labels()) == ['comedy', 'crime', 'drama', 'horror', 'scifi']

    
# TF-IDF vectorizer used by cosine_sim. Tokenizing is delegated to
# LemNormalize (lower-casing, punctuation removal, NLTK stop-word filtering,
# stemming + lemmatizing); stop_words='english' additionally enables
# scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(
    stop_words='english',
    tokenizer=LemNormalize,
)


GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey")
GREETING_RESPONSES = ["hi", "hey", "hi there", "hello", "I am glad! You are talking to me"]


def greeting(sentence):
    """Return a random greeting if ``sentence`` contains a known greeting.

    Matching is case-insensitive and works on whole words, so "this" does not
    trigger on "hi". Punctuation attached to a word ("hello!", "hey,") is
    ignored, and multi-word entries of ``GREETING_INPUTS`` such as
    "what's up" are matched as a phrase.

    Parameters
    ----------
    sentence : str
        The user's input.

    Returns
    -------
    str or None
        A random element of ``GREETING_RESPONSES`` when a greeting is found,
        otherwise ``None``.
    """
    # Strip punctuation only at word edges so the apostrophe in "what's" survives.
    words = [w.strip(string.punctuation) for w in sentence.lower().split()]
    # Pad with spaces so that phrase containment is a whole-word comparison.
    padded = " " + " ".join(w for w in words if w) + " "
    for phrase in GREETING_INPUTS:
        if " " + phrase + " " in padded:
            return random.choice(GREETING_RESPONSES)
    return None
        
def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    The shared ``vectorizer`` is re-fitted on just these two documents on
    every call, so the IDF weights reflect only this pair.

    Parameters
    ----------
    text1, text2 : str
        The documents to compare.

    Returns
    -------
    float
        Entry ``[0, 1]`` of the document-by-document product of the TF-IDF
        matrix, i.e. the similarity of ``text1`` and ``text2``.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # TfidfVectorizer L2-normalises rows by default, so the row dot product is
    # the cosine similarity. `@` and `.toarray()` are used instead of `*` and
    # `.A` because only the sparse *matrix* classes treat `*` as a matrix
    # product and offer `.A`; the explicit forms work for both sparse kinds.
    return (tfidf @ tfidf.T).toarray()[0, 1]



In [11]:
# Per-genre buckets mapping movie title -> overview text.
crimeMovie = {}
dramaMovie = {}
comedyMovie = {}
horrorMovie = {}
scifiMovie = {}

# Buckets in tie-break order: when several genres share the top probability
# the first one listed here wins; drama is the fallback when none matches.
_BUCKET_PRIORITY = (
    ('horror', horrorMovie),
    ('comedy', comedyMovie),
    ('scifi', scifiMovie),
    ('crime', crimeMovie),
)
_GENRE_LABELS = ('horror', 'comedy', 'crime', 'scifi', 'drama')

d = pd.read_csv('movie3.csv')
for overview, title in zip(d['overview'], d['title']):
    if not isinstance(overview, str):
        # pandas reads an empty cell as NaN (a float), which cannot be
        # lower-cased or tokenized; skip such rows instead of crashing.
        continue
    features = {'contains-word(%s)' % w: True for w in LemNormalize(overview)}
    pdist = classifier.prob_classify_many([features])[0]
    scores = {label: pdist.prob(label) for label in _GENRE_LABELS}
    top = max(scores.values())
    bucket = next(
        (movies for label, movies in _BUCKET_PRIORITY if scores[label] == top),
        dramaMovie,
    )
    # A repeated title overwrites the earlier overview stored for it.
    bucket[title] = overview
            

            
    

In [12]:

def searchGenre(text, gList):
    """Score every movie of one genre bucket against a query text.

    Parameters
    ----------
    text : str
        The query to compare against each overview.
    gList : dict
        Mapping of movie title to overview text.

    Returns
    -------
    list of tuple
        ``(similarity, title)`` pairs for the movies whose similarity to
        ``text`` is strictly positive, in the iteration order of ``gList``.
    """
    scored = ((cosine_sim(text, overview), title) for title, overview in gList.items())
    return [pair for pair in scored if pair[0] > 0]




def response(user_response):
    """Find the movies that best match a free-text description.

    The query is classified into a genre, then compared against the overviews
    in that genre's bucket only.

    Parameters
    ----------
    user_response : str
        The user's description of a movie.

    Returns
    -------
    list of tuple
        At most three ``(similarity, title)`` pairs, highest first. Empty when
        no movie in the chosen genre has a positive similarity.
    """
    features = {'contains-word(%s)' % w: True for w in LemNormalize(user_response)}
    pdist = classifier.prob_classify_many([features])[0]

    scores = {
        genre: pdist.prob(genre)
        for genre in ('horror', 'comedy', 'crime', 'scifi', 'drama')
    }
    top = max(scores.values())

    # Checked in this order so ties resolve as horror > comedy > scifi > crime,
    # with drama as the fallback.
    pools_by_priority = (
        ('horror', horrorMovie),
        ('comedy', comedyMovie),
        ('scifi', scifiMovie),
        ('crime', crimeMovie),
    )
    pool = next(
        (movies for genre, movies in pools_by_priority if scores[genre] == top),
        dramaMovie,
    )

    hits = searchGenre(user_response, pool)
    hits.sort(reverse=True)
    return hits[:3]




In [13]:
# Interactive loop: read a line, answer greetings, otherwise suggest movies.
# The loop ends on "bye"/"no" or after the user says thanks.
flag = True
print("Bot: Give description about movie. If you want to exit, type Bye!")
while flag:
    user_response = input().lower()
    # Commands are compared without surrounding whitespace or punctuation so
    # that the advertised "Bye!" (and e.g. "thanks.") is recognised.
    command = user_response.strip(string.whitespace + string.punctuation)

    if command in ('bye', 'no'):
        flag = False
        print("Bot: Bye!")
    elif command in ('thanks', 'thank you'):
        flag = False
        print("Bot: You are welcome..")
    else:
        # greeting() picks its reply at random, so call it once and reuse it.
        reply = greeting(user_response)
        if reply is not None:
            print("Bot: " + reply)
        else:
            print("Bot: ", end="")
            # response() re-scores every movie of a genre; compute it once and
            # reuse the result for both output lines.
            matches = response(user_response)
            if not matches:
                print("Sorry! No reference.")
            else:
                for film in matches:
                    print(film[1], end=", ")
                print('\n', matches)
                print("")
            print("Can I help with any other film?")

Bot: Give description about movie. If you want to exit, type Bye!
hello
Bot: hi
big killer with knife and ghost christmas
Bot: Striking Distance, Once Upon a Time in America, The Manchurian Candidate, 
 [(0.06612665654870567, 'Striking Distance'), (0.06261451546913586, 'Once Upon a Time in America'), (0.05699752852140605, 'The Manchurian Candidate')]

Can I help with any other film?
no
Bot: Bye!
