In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from nltk.corpus import stopwords
import sys
from sklearn.metrics import multilabel_confusion_matrix

np.set_printoptions(threshold=sys.maxsize)

import re

%matplotlib inline

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


## Load Data

In [2]:
stop_words = set(stopwords.words('english'))
len(stop_words)

179

In [3]:
data = pd.read_json('data/nepali-movies.json')

In [4]:
data.head()

Unnamed: 0,genre,imdb_url,plot,rating,runtime,title,votes,year
0,"Comedy, Drama",https://www.imdb.com/title/tt7229666/,Chhakka Panja 2 continues with new story of Ra...,6.6,138 min,Chhakka Panja 2,290.0,2017
1,Comedy,https://www.imdb.com/title/tt8393764/,When she learns about the worst condition of t...,6.3,,Chhakka Panja 3,96.0,2018
2,Romance,https://www.imdb.com/title/tt7672868/,When Ishan (Dhiraj Magar) and Meera (Samragyee...,7.3,120 min,Intu Mintu Londonma,71.0,2018
3,"Comedy, Drama, Romance",https://www.imdb.com/title/tt10942220/,Add a Plot,8.6,,Kabaddi Kabaddi Kabaddi,9.0,2019
4,"Drama, Sport",https://www.imdb.com/title/tt8387542/,Father's dedication to guiding his son to beco...,3.2,,Captain,46.0,2019


In [5]:
data.shape

(800, 8)

In [6]:
data.isnull().sum()

genre        77
imdb_url      0
plot          0
rating      541
runtime     416
title         0
votes       541
year          3
dtype: int64

## Pre-process Columns

In [7]:
df = data[['plot', 'genre']].dropna()

In [8]:
df.head()

Unnamed: 0,plot,genre
0,Chhakka Panja 2 continues with new story of Ra...,"Comedy, Drama"
1,When she learns about the worst condition of t...,Comedy
2,When Ishan (Dhiraj Magar) and Meera (Samragyee...,Romance
3,Add a Plot,"Comedy, Drama, Romance"
4,Father's dedication to guiding his son to beco...,"Drama, Sport"


In [9]:
df.shape

(723, 2)

In [10]:
df.isnull().sum()

plot     0
genre    0
dtype: int64

#### Remove data without any plot

In [11]:
df[df['plot'] == 'Add a Plot'].shape

(292, 2)

In [12]:
df = df[~df['plot'].str.contains("Add a Plot")]

In [13]:
df.shape

(431, 2)

In [14]:
def process_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', ' <url> ', text)
    text = re.sub(r'#+', ' <hashtag> ', text )
    text = re.sub(r'@[A-Za-z0-9]+', ' <user> ', text)
    text = re.sub(r"([A-Za-z]+)'s", r"\1 is", text)

    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"don't", "do not ", text)
    text = re.sub(r"did't", "did not ", text)
    text = re.sub(r"shouldn't", "should not ", text)
    text = re.sub(r"wouldn't", "would not ", text)
    text = re.sub(r"hadn't", "had not ", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"won't", "will not ", text)
    text = re.sub(r"isn't", "is not ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)

    text = re.sub(r"dont", " do not", text)
    text = re.sub(r"didnt", " did not", text)
    text = re.sub(r"wont", " will not", text)
    text = re.sub(r"cant", " can not", text)

    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', ' <number> ', text)
    text = re.sub('\s+url\s+', ' <url> ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

In [15]:
df['plot_clean'] = df['plot'].apply(process_text)

In [16]:
df.head()

Unnamed: 0,plot,genre,plot_clean
0,Chhakka Panja 2 continues with new story of Ra...,"Comedy, Drama",chhakka panja <number> continues with new stor...
1,When she learns about the worst condition of t...,Comedy,when she learns about the worst condition of t...
2,When Ishan (Dhiraj Magar) and Meera (Samragyee...,Romance,when ishan dhiraj magar and meera samragyee ra...
4,Father's dedication to guiding his son to beco...,"Drama, Sport",father is dedication to guiding his son to bec...
5,A young mother waiting for her husband to be b...,Drama,a young mother waiting for her husband to be b...


In [17]:
df.shape

(431, 3)

## Train-Test Split

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
train, test = train_test_split(df, test_size=0.0001)

In [20]:
train.shape, test.shape

((430, 3), (1, 3))

## Extract Features
Use TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
vectorizer = TfidfVectorizer(
#     stop_words=stop_words,
    ngram_range=(1, 2),
    stop_words='english'
)

In [23]:
X_train = vectorizer.fit_transform(train['plot_clean'])
X_train.shape

(430, 12025)

In [24]:
X_test = vectorizer.transform(test['plot_clean'])
X_test.shape

(1, 12025)

## Encode Labels
Since the data has multiple labels per movie plot, multi hot binarizer is used.

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer

In [26]:
labels = [genre.split(',') for genre in train['genre'].values]
labels[:10]

[['Short', ' Drama', ' Romance'],
 ['Drama', ' Romance'],
 ['Thriller'],
 ['Action'],
 ['Comedy', ' Drama', ' Romance'],
 ['Biography'],
 ['Documentary', ' Short'],
 ['Crime'],
 ['Short', ' Thriller'],
 ['Crime', ' Drama', ' Thriller']]

In [27]:
very_hot = MultiLabelBinarizer()

In [28]:
Y_train = very_hot.fit_transform(labels)

In [29]:
Y_train.shape

(430, 41)

In [30]:
Y_train[:5]

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [31]:
Y_test = very_hot.transform([genre.split(',') for genre in test['genre'].values])

In [32]:
Y_test.shape

(1, 41)

In [33]:
very_hot.classes_

array([' Action', ' Adventure', ' Animation', ' Biography', ' Comedy',
       ' Crime', ' Drama', ' Family', ' Fantasy', ' History', ' Horror',
       ' Music', ' Musical', ' Mystery', ' News', ' Reality-TV',
       ' Romance', ' Sci-Fi', ' Short', ' Sport', ' Thriller', ' War',
       ' Western', 'Action', 'Adventure', 'Animation', 'Biography',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Game-Show',
       'History', 'Horror', 'Music', 'Musical', 'Reality-TV', 'Romance',
       'Short', 'Thriller'], dtype=object)

## Create Classifier

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

In [35]:
model = OneVsRestClassifier(MultinomialNB(alpha=0.2))
# model = OneVsRestClassifier(SVC())

In [36]:
model.fit(X_train, Y_train)

OneVsRestClassifier(estimator=MultinomialNB(alpha=0.2, class_prior=None,
                                            fit_prior=True),
                    n_jobs=None)

In [37]:
# multilabel_confusion_matrix(Y, Y_pred)

In [38]:
Y_pred = model.predict(X_test)
Y_pred.shape, Y_pred

((1, 41),
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

In [39]:
accuracy_score(Y_test, Y_pred)

0.0

## Test

In [40]:
texts = ["a man stalks a girl after while they fall in love",
         "a man falls in love with a girl from another country when she visits nepal",
         "a documentary about everest"
]
# text = process_text(text)

In [41]:
features = vectorizer.transform(texts)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database',)).History will not be written to the database.


In [42]:
features.shape

(3, 12025)

In [43]:
predictions = model.predict(features)

In [44]:
predictions

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [45]:
indices = predictions == 1
indices.shape

(3, 41)

In [46]:
# np.tile(very_hot.classes_, (len(texts), 1))[indices]

In [47]:
for idx in indices:
    idx = idx.reshape(1, -1)
    print(very_hot.classes_.reshape(1, -1)[idx])

[' Romance' 'Drama']
['Drama']
['Documentary']
