In [145]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from nltk.corpus import stopwords
import sys
from sklearn.metrics import multilabel_confusion_matrix

np.set_printoptions(threshold=sys.maxsize)

import re

%matplotlib inline

## Load Data

In [2]:
stop_words = set(stopwords.words('english'))
len(stop_words)

179

In [3]:
data = pd.read_json('data/nepali-movies.json')

In [4]:
data.head()

Unnamed: 0,genre,imdb_url,plot,rating,runtime,title,votes,year
0,"Documentary, Adventure, Drama",https://www.imdb.com/title/tt1999130/,A team of 20 elite Nepali climbers venture int...,8.0,99 min,Death Zone: Cleaning Mount Everest,62.0,2018
1,"Comedy, Drama",https://www.imdb.com/title/tt7229666/,Chhakka Panja 2 continues with new story of Ra...,7.1,138 min,Chhakka Panja 2,261.0,2017
2,Comedy,https://www.imdb.com/title/tt8393764/,When she learns about the worst condition of t...,6.5,,Chhakka Panja 3,86.0,2018
3,"Drama, Romance",https://www.imdb.com/title/tt9812236/,Add a Plot,,,Love Station,,2019
4,"Drama, History",https://www.imdb.com/title/tt3700482/,"After her husband's death, a girl is forced to...",7.6,90 min,Jhola,244.0,2013


In [5]:
data.shape

(777, 8)

In [6]:
data.isnull().sum()

genre        77
imdb_url      0
plot          0
rating      533
runtime     406
title         0
votes       533
year          3
dtype: int64

## Pre-process Columns

In [7]:
df = data[['plot', 'genre']].dropna()

In [8]:
df.head()

Unnamed: 0,plot,genre
0,A team of 20 elite Nepali climbers venture int...,"Documentary, Adventure, Drama"
1,Chhakka Panja 2 continues with new story of Ra...,"Comedy, Drama"
2,When she learns about the worst condition of t...,Comedy
3,Add a Plot,"Drama, Romance"
4,"After her husband's death, a girl is forced to...","Drama, History"


In [9]:
df.shape

(700, 2)

In [10]:
df.isnull().sum()

plot     0
genre    0
dtype: int64

#### Remove data without any plot

In [11]:
df[df['plot'] == 'Add a Plot'].shape

(284, 2)

In [12]:
df = df[~df['plot'].str.contains("Add a Plot")]

In [13]:
df.shape

(416, 2)

In [14]:
def process_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', ' <url> ', text)
    text = re.sub(r'#+', ' <hashtag> ', text )
    text = re.sub(r'@[A-Za-z0-9]+', ' <user> ', text)
    text = re.sub(r"([A-Za-z]+)'s", r"\1 is", text)

    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"don't", "do not ", text)
    text = re.sub(r"did't", "did not ", text)
    text = re.sub(r"shouldn't", "should not ", text)
    text = re.sub(r"wouldn't", "would not ", text)
    text = re.sub(r"hadn't", "had not ", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"won't", "will not ", text)
    text = re.sub(r"isn't", "is not ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)

    text = re.sub(r"dont", " do not", text)
    text = re.sub(r"didnt", " did not", text)
    text = re.sub(r"wont", " will not", text)
    text = re.sub(r"cant", " can not", text)

    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', ' <number> ', text)
    text = re.sub('\s+url\s+', ' <url> ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

In [15]:
df['plot_clean'] = df['plot'].apply(process_text)

In [16]:
df.head()

Unnamed: 0,plot,genre,plot_clean
0,A team of 20 elite Nepali climbers venture int...,"Documentary, Adventure, Drama",a team of <number> elite nepali climbers ventu...
1,Chhakka Panja 2 continues with new story of Ra...,"Comedy, Drama",chhakka panja <number> continues with new stor...
2,When she learns about the worst condition of t...,Comedy,when she learns about the worst condition of t...
4,"After her husband's death, a girl is forced to...","Drama, History",after her husband is death a girl is forced to...
5,An excellent portrayal of a struggle of a comm...,Drama,an excellent portrayal of a struggle of a comm...


## Extract Features
Use TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vectorizer = TfidfVectorizer(
#     stop_words=stop_words,
    ngram_range=(1, 2),
    stop_words='english'
)

In [19]:
X = vectorizer.fit_transform(df['plot_clean'])

In [20]:
X.shape

(416, 7707)

## Encode Labels
Since the data has multiple labels per movie plot, multi hot binarizer is used.

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

In [22]:
labels = [genre.split(',') for genre in df['genre'].values]
labels[:10]

[['Documentary', ' Adventure', ' Drama'],
 ['Comedy', ' Drama'],
 ['Comedy'],
 ['Drama', ' History'],
 ['Drama'],
 ['Comedy', ' Drama'],
 ['Drama', ' Romance'],
 ['Drama'],
 ['Action', ' Drama'],
 ['Drama', ' Romance']]

In [23]:
very_hot = MultiLabelBinarizer()

In [24]:
Y = very_hot.fit_transform(labels)

In [25]:
Y

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
Y.shape

(416, 41)

In [27]:
very_hot.classes_

array([' Action', ' Adventure', ' Animation', ' Biography', ' Comedy',
       ' Crime', ' Drama', ' Family', ' Fantasy', ' History', ' Horror',
       ' Music', ' Musical', ' Mystery', ' News', ' Reality-TV',
       ' Romance', ' Sci-Fi', ' Short', ' Sport', ' Thriller', ' War',
       ' Western', 'Action', 'Adventure', 'Animation', 'Biography',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Game-Show',
       'History', 'Horror', 'Music', 'Musical', 'Reality-TV', 'Romance',
       'Short', 'Thriller'], dtype=object)

## Create Classifier

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

In [53]:
# model = MultinomialNB(alpha=0.1)
model = OneVsRestClassifier(MultinomialNB(alpha=0.1))
# model = OneVsRestClassifier(SVC())

In [54]:
model.fit(X, Y)

OneVsRestClassifier(estimator=MultinomialNB(alpha=0.1, class_prior=None,
                                            fit_prior=True),
                    n_jobs=None)

In [147]:
Y_pred = model.predict(X)

In [148]:
Y_pred.sum(axis=1)

array([3, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 3, 1, 2, 2, 3, 3, 2, 3, 3, 2, 2,
       2, 2, 1, 2, 2, 1, 2, 0, 2, 1, 2, 2, 1, 2, 1, 2, 3, 2, 2, 2, 3, 2,
       2, 2, 2, 2, 2, 1, 2, 3, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2,
       2, 1, 2, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 2, 3,
       2, 3, 3, 1, 1, 1, 2, 3, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 3, 2,
       1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 3, 3, 2, 1, 2, 2, 2, 1, 3, 2, 1, 1,
       1, 2, 2, 1, 1, 2, 1, 2, 2, 3, 1, 1, 1, 2, 2, 3, 2, 2, 1, 1, 1, 1,
       2, 3, 2, 3, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 3, 1, 2, 3,
       1, 2, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 2, 2, 3, 1, 2, 3, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 2, 1, 2, 3, 1, 2, 1, 2, 1, 1, 2, 3, 1, 3, 2, 1, 1, 1,
       2, 1, 2, 1, 1, 2, 1, 3, 2, 3, 2, 2, 1, 1, 1, 2, 2, 3, 2, 3, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 3, 2, 2, 3, 2, 1, 1, 1, 3, 1, 2, 2,
       1, 1, 1, 1, 1, 2, 2, 2, 1, 3, 2, 2, 1, 2, 3,

In [149]:
multilabel_confusion_matrix(Y, Y_pred)

array([[[412,   0],
        [  0,   4]],

       [[399,   0],
        [  0,  17]],

       [[415,   0],
        [  0,   1]],

       [[406,   0],
        [  0,  10]],

       [[409,   0],
        [  0,   7]],

       [[410,   0],
        [  0,   6]],

       [[291,   0],
        [  0, 125]],

       [[398,   0],
        [  0,  18]],

       [[413,   0],
        [  0,   3]],

       [[405,   0],
        [  1,  10]],

       [[411,   0],
        [  0,   5]],

       [[409,   0],
        [  0,   7]],

       [[407,   0],
        [  0,   9]],

       [[405,   0],
        [  0,  11]],

       [[414,   0],
        [  0,   2]],

       [[415,   0],
        [  0,   1]],

       [[375,   0],
        [  0,  41]],

       [[413,   0],
        [  0,   3]],

       [[370,   0],
        [  0,  46]],

       [[413,   0],
        [  0,   3]],

       [[403,   0],
        [  0,  13]],

       [[414,   0],
        [  0,   2]],

       [[415,   0],
        [  0,   1]],

       [[388,   0],
        [  0, 

## Test

In [151]:
# text = "a man stalks a girl after while they fall in love"
text = "a man falls in love with a girl from another country when she visits nepal"
text = process_text(text)

In [152]:
features = vectorizer.transform([text])

In [153]:
features.shape

(1, 7707)

In [154]:
predictions = model.predict(features)

In [155]:
predictions

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [158]:
indices = predictions == 1

In [160]:
very_hot.classes_.reshape(1, -1)[indices]

array(['Drama'], dtype=object)