**Loading saved dataset**

In [117]:
import pickle
import pandas as pd 
# Restore the DataFrame that an earlier step saved to 'my_df.pickle'.
# NOTE: unpickling can execute arbitrary code; only load pickles you produced or trust.
movies_new = pd.read_pickle('my_df.pickle')

# Display the frame without the 'plot' and 'genre' columns. The result is not
# assigned, so movies_new itself still holds every column.
movies_new.drop(columns=['plot', 'genre'])


Unnamed: 0,movie_id,movie_name,genre_new,clean_plot
0,23890098,Taxi Blues,"[55, 55]",shlykov hard working taxi driver lyosha saxoph...
1,31186339,The Hunger Games,"[11, 26, 11, 55]",nation panem consists wealthy capitol twelve p...
2,20663735,Narasimham,"[35, 11, 55, 35]",poovalli induchoodan sentenced six years priso...
3,2231378,The Lemon Drop Kid,"[67, 67]",lemon drop kid new york city swindler illegall...
4,595909,A Cry in the Dark,"[22, 55, 52, 55, 55]",seventh day adventist church pastor michael ch...
5,5272176,End Game,"[22, 11, 11, 55]",president way give speech traveling man shows ...
6,1952976,Dark Water,"[22, 55, 26]",plot film opens young girl dahlia stands outsi...
7,24225279,Sing,"[55, 26]",story begins hannah young jewish teen completi...
8,2462689,Meet John Doe,"[67, 54, 38, 38, 34, 55, 67, 55]",infuriated told write one final column laid ne...
9,20532852,Destination Meatball,"[65, 65, 65]",line people drool window shop market butcher b...


**Dataset transformation for model fitting**

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Encode the multi-label target: every distinct tag found in 'genre_new'
# becomes one 0/1 indicator column of y. The fitted binarizer is kept so that
# predictions can later be mapped back to tags with inverse_transform.
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(movies_new['genre_new'])

In [0]:
# TF-IDF text features: max_df=0.7 drops terms that occur in more than 70% of
# the documents, and max_features=5000 caps the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    max_df=0.7,
)

In [0]:
# Hold out 20% of the movies for validation; random_state pins the shuffle so
# the split is the same on every run.
xtrain, xval, ytrain, yval = train_test_split(
    movies_new['clean_plot'],
    y,
    test_size=0.2,
    random_state=9,
)

# Fit the vectorizer on the training plots only, then apply that same fitted
# vectorizer to the validation plots.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [0]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [0]:
# Base estimator: logistic regression with the liblinear solver.
lr = LogisticRegression(solver='liblinear')

# Wrap it in a one-vs-rest classifier so it can be fitted on the multi-label target.
clf = OneVsRestClassifier(estimator=lr)


In [137]:
# Fit the one-vs-rest classifier on the TF-IDF training features and the
# binarized training labels. The call is the cell's last expression, so the
# fitted estimator's repr is displayed as the cell output.
clf.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

**Evaluation metrics**

In [138]:
# Micro-averaged F1 on the validation set, using the hard labels from clf.predict.
y_pred = clf.predict(xval_tfidf)
micro_f1_default = f1_score(yval, y_pred, average="micro")
print('Logistic Regression score: ', micro_f1_default)

# Same metric, but with labels obtained by cutting the predicted per-label
# probabilities at the threshold t instead of using clf.predict's own decision.
t = 0.3 # threshold value
y_pred_prob = clf.predict_proba(xval_tfidf)
y_pred_new = (y_pred_prob >= t).astype(int)
micro_f1_thresholded = f1_score(yval, y_pred_new, average="micro")
print('Logistic Regression score with thershold 0.3: ', micro_f1_thresholded)

Logistic Regression score:  0.529327981354396
Logistic Regression score with thershold 0.3:  0.6035600795721704


In [116]:
from sklearn.metrics import hamming_loss
# Hamming loss between the validation labels and y_pred (the clf.predict
# output), not the 0.3-thresholded y_pred_new.
print('Hamming loss',hamming_loss(yval,y_pred))

Hamming loss 0.025671519953836302


In [0]:
import re

In [0]:
def clean_text(text):
    """Reduce a string to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Apostrophes are deleted outright, so "don't" becomes "dont".
      2. Every remaining character outside a-z / A-Z (digits, punctuation,
         backslashes, accented letters, whitespace) is turned into a space.
      3. Runs of spaces are collapsed and leading/trailing spaces removed.
      4. The result is lowercased.

    Args:
        text: Input string.

    Returns:
        The normalized string; empty if the input contains no ASCII letters.
    """
    without_apostrophes = text.replace("'", "")
    letters_and_spaces = re.sub("[^a-zA-Z]", " ", without_apostrophes)
    words = letters_and_spaces.split()
    return " ".join(words).lower()

In [0]:
from nltk.corpus import stopwords
# English stop words from NLTK, stored as a set so that the membership test in
# remove_stopwords is a hash lookup rather than a list scan.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to have been downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` that is in `stop_words`.

    The comparison is exact (case-sensitive), and the surviving tokens are
    re-joined with single spaces in their original order.

    Args:
        text: Input string.

    Returns:
        The filtered string; empty if every token was a stop word.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


**Affine clustering prediction**

In [0]:
# Load the tag lookup table used by the spot-check loop below.
# NOTE(review): from how it is used below, aux appears to be a DataFrame whose
# row labels are genre names and whose column 0 holds the tag each genre was
# assigned to — confirm against the code that wrote 'Clusterform.pickle'.
# NOTE: unpickling can execute arbitrary code; only load pickles you trust.
aux=pd.read_pickle('Clusterform.pickle')

In [75]:
def infer_tags(q):
    """Predict the tag set for one plot text.

    The text is passed through clean_text and remove_stopwords, vectorized with
    the already-fitted tfidf_vectorizer, classified with clf, and the binary
    prediction is mapped back to tags by multilabel_binarizer.

    Args:
        q: Plot text as a string.

    Returns:
        The result of multilabel_binarizer.inverse_transform for the single
        input row: a one-element list holding a tuple of predicted tags (an
        empty tuple when no tag is predicted).
    """
    q = clean_text(q)
    q = remove_stopwords(q)
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = clf.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)


# Spot-check 10 validation movies drawn at random (unseeded, so every run
# shows different movies): print predicted vs. actual tags, then the genre
# names that aux associates with each actual tag.
for _ in range(10):
  k = xval.sample(1).index[0]
  print("Movie: ", movies_new['movie_name'][k], "\nPredicted genre: ", infer_tags(xval[k]))
  print("Actual genre: ",movies_new['genre_new'][k])
  for j in movies_new['genre_new'][k]:
    # Select the rows of aux whose column 0, compared as a string, equals the
    # tag; the row labels of those rows are the genre names to show. A boolean
    # mask avoids integer-position lookups on a label-indexed Series.
    list2 = aux.index[(aux[0].astype(str) == j).values].tolist()
    print("Possible genre for tag ",j," : ",list2)
  print('\n')

Movie:  Sathi 
Predicted genre:  [('69',)]
Actual genre:  ['69', '13']
Possible genre for tag  69  :  ['kafkaesque', 'dystopia', 'cyberpunk', 'future noir']
Possible genre for tag  13  :  ['culture & society', 'documentary', 'social issues', 'gender issues', 'law & crime']


Movie:  Badla Jatti Da 
Predicted genre:  [('13', '69')]
Actual genre:  ['69', '4', '55']
Possible genre for tag  69  :  ['kafkaesque', 'dystopia', 'cyberpunk', 'future noir']
Possible genre for tag  4  :  ['gay themed', 'lgbt', 'gay interest', 'gay']
Possible genre for tag  55  :  ['pornographic movie', 'erotic drama', 'pinku eiga', 'erotica']


Movie:  Achanak 
Predicted genre:  [('13', '69')]
Actual genre:  ['13']
Possible genre for tag  13  :  ['culture & society', 'documentary', 'social issues', 'gender issues', 'law & crime']


Movie:  Descent 
Predicted genre:  [()]
Actual genre:  ['71', '22', '55', '4', '51']
Possible genre for tag  71  :  ['detective fiction', 'detective']
Possible genre for tag  22  :  ['

**KMeans Clustering predictions**

In [0]:
# Load the pickled genre/cluster pairs.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('parrot2.pkl', 'rb') as pickle_file:
  genlabel = pickle.load(pickle_file)

In [97]:
# Split the (genre label, cluster id) pairs in genlabel into two parallel
# lists, so that label[n] and cluster[n] describe the same genre.
label, cluster = map(list, zip(*genlabel))
print(label)

# Sanity checks: number of entries and one sample label.
print(len(cluster))
print(label[17])

['b-western', 'stand-up comedy', 'natural disaster', 'boxing', 'crime', 'sword and sorcery films', 'period piece', 'horse racing', 'television movie', 'comedy-drama', 'gay themed', 'chase movie', 'pornographic movie', 'sponsored film', 'neorealism', 'whodunit', 'ninja movie', 'roadshow theatrical release', 'romantic thriller', 'costume adventure', 'gross out', 'feature film', 'plague', 'slasher', 'glamorized spy film', 'film & television history', 'road-horror', 'stop motion', 'werewolf fiction', 'news', 'tragedy', "children's fantasy", 'action', 'superhero movie', 'tokusatsu', 'culture & society', 'sword and sorcery', 'parody', 'indian western', 'sex comedy', 'media studies', 'religious film', 'star vehicle', 'sci-fi horror', 'essay film', 'bloopers & candid camera', 'adventure comedy', 'alien film', "children's entertainment", 'swashbuckler films', 'silhouette animation', 'kafkaesque', 'giallo', 'hagiography', "children's issues", 'graphic & applied arts', 'biker film', 'british new 

In [98]:
def infer_tags(q):
    """Predict the tag set for one plot text.

    The text is passed through clean_text and remove_stopwords, vectorized with
    the already-fitted tfidf_vectorizer, classified with clf, and the binary
    prediction is mapped back to tags by multilabel_binarizer.

    Args:
        q: Plot text as a string.

    Returns:
        The result of multilabel_binarizer.inverse_transform for the single
        input row: a one-element list holding a tuple of predicted tags.
    """
    q = clean_text(q)
    q = remove_stopwords(q)
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = clf.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)


# Spot-check 10 validation movies drawn at random (unseeded, so every run
# shows different movies): print predicted vs. actual cluster tags, then every
# genre label that belongs to each actual cluster.
for _ in range(10):
  k = xval.sample(1).index[0]
  print("Movie: ", movies_new['movie_name'][k], "\nPredicted genre: ", infer_tags(xval[k]))
  print("Actual genre: ",movies_new['genre_new'][k])
  for j in movies_new['genre_new'][k]:
    # Walk the parallel label/cluster lists in full rather than a hard-coded
    # number of entries, and without reusing k (the sampled row label).
    list2 = [name for name, cluster_id in zip(label, cluster) if cluster_id == j]
    print("Possible genre for tag ",j," : ",list2)
  print('\n')

Movie:  Operation C.I.A. 
Predicted genre:  [(7, 10, 11)]
Actual genre:  [11, 2, 10, 7]
Possible genre for tag  11  :  ['road-horror', 'werewolf fiction', 'action', 'superhero movie', 'tokusatsu', 'sword and sorcery', 'sex comedy', 'concert film', 'ealing comedies', 'history', 'sci fi pictures original films', 'health & fitness', 'childhood drama', 'steampunk', 'coming of age', 'revenge', 'buddy film', 'instrumental music', 'comedy western', 'black-and-white', 'computer animation', 'filipino movies', 'archives and records', 'patriotic film', 'anti-war film', 'slapstick', 'fictional film', 'hardcore pornography', 'coming-of-age film']
Possible genre for tag  2  :  ['media studies', 'religious film', 'star vehicle', 'sci-fi horror', 'essay film', 'bloopers & candid camera', 'alien film', "children's entertainment", 'silhouette animation', 'kafkaesque', 'hagiography', "children's issues", 'biker film', 'superhero', 'outlaw', 'christian film', 'screwball comedy', 'biographical film', 'holi

**Basic model**

In [107]:
def infer_tags(q):
    """Predict the genres for one plot text.

    Args:
        q: Plot text as a string.

    Returns:
        The result of multilabel_binarizer.inverse_transform for the single
        input row: a one-element list holding a tuple of predicted genres (an
        empty tuple when none is predicted).
    """
    # Same preprocessing order as before: clean first, then drop stop words.
    prepared = remove_stopwords(clean_text(q))
    features = tfidf_vectorizer.transform([prepared])
    return multilabel_binarizer.inverse_transform(clf.predict(features))


# Print predicted vs. actual genres for 5 randomly sampled validation movies.
for _ in range(5):
  k = xval.sample(1).index[0]
  print("Movie: ", movies_new['movie_name'][k], "\nPredicted genre: ", infer_tags(xval[k]))
  print("Actual genre: ",movies_new['genre_new'][k], "\n")

Movie:  Aao Pyaar Karen 
Predicted genre:  [('drama', 'romance film', 'world cinema')]
Actual genre:  ['romance film', 'drama'] 

Movie:  Desperate Lives 
Predicted genre:  [('drama',)]
Actual genre:  ['drama'] 

Movie:  Twitches Too! 
Predicted genre:  [('comedy',)]
Actual genre:  ['family film', 'comedy', 'teen', 'television movie'] 

Movie:  Washington Square 
Predicted genre:  [('drama', 'romance film')]
Actual genre:  ['romantic drama', 'period piece', 'drama', 'romance film', 'film adaptation'] 

Movie:  Saving Shiloh 
Predicted genre:  [()]
Actual genre:  ["children's/family", 'family drama', 'childhood drama', 'drama', 'coming of age', 'family film'] 

