In [81]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# WORKING WITH TRAINED DATA

In [82]:
# loading the training data set from "train_data.txt"
try:
    with tqdm(total = 50 , desc = "loading traing data")as pbar:
        train_data =  pd.read_csv("train_data.txt",sep= ':::',header = None , names = ["serial number ","Title","Genre","Description"])
        pbar.update(50)
except:
    print("error in loading traing data")
    raise
    
    

  train_data =  pd.read_csv("train_data.txt",sep= ':::',header = None , names = ["serial number ","Title","Genre","Description"])
loading traing data: 100%|██████████| 50/50 [00:02<00:00, 19.08it/s]


In [83]:
# some data of the training set
train_data

Unnamed: 0,serial number,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
54195,54196,Segregating the Greatest Generation (2006),documentary,"In this ""concept"" essay documentary, 16 parti..."
54196,54197,Vulture Canyon (????),drama,Vulture Canyon is the only juvenile girls pri...
54197,54198,The Burglar (2016),drama,"Alex, a young woman aged 18, lives with her m..."
54198,54199,Dervis i smrt (1974),drama,Ahmet Nurudin is a dervish and head of the Is...


In [84]:
# list of Genre list  in data
Genre_list = [' drama ', ' thriller ', ' adult ', ' documentary ', ' comedy ',' crime ', ' reality-tv ', ' horror ', ' sport ', ' animation ',' action ', ' fantasy ', ' short ', ' sci-fi ', ' music ',' adventure ', ' talk-show ', ' western ', ' family ', ' mystery ',' history ', ' news ', ' biography ', ' romance ', ' game-show ',' musical ', ' war ']

# unkown - Genre meaning "the gnere that model can't predict" 
fallback_genre = "unknown"

In [85]:
# importing MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
x_train = train_data["Description"].astype(str).apply(lambda doc:doc.lower())
label = [genre.split(', ') for genre in train_data["Genre"]]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(label)
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [86]:
# vectorization using TfidfVectorizer and fitting and transforming the data with progress bar.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 5000)
with tqdm(total = 50 ,desc = "vectorizing traing data") as pbar:
    x_train_vector = vectorizer.fit_transform(x_train)
    pbar.update(50)


vectorizing traing data: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]


In [87]:
# importing navie_bayes and MultiOutputClassifier and traing the multioutput data.
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
with tqdm(total = 50 ,desc = "Training the Model")as pbar:
    NB = MultinomialNB()
    MOC = MultiOutputClassifier(NB)
    MOC.fit(x_train_vector,y_train)
    pbar.update(50)
    

Training the Model: 100%|██████████| 50/50 [00:00<00:00, 60.16it/s]


# WORKING WITH TEST DATA

In [88]:
# LOADING THE TEST DATA
try:
    with tqdm(total = 50 , desc = "loading Test data")as pbar:
        test_data =  pd.read_csv("test_data.txt",sep= ':::',header = None , names = ["serial number ","Title","Description"])
        pbar.update(50)
except:
    print("error in loading test data")
    raise
    

  test_data =  pd.read_csv("test_data.txt",sep= ':::',header = None , names = ["serial number ","Title","Description"])
loading Test data: 100%|██████████| 50/50 [00:00<00:00, 100.77it/s]


In [89]:
test_data

Unnamed: 0,serial number,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [90]:
# Data Preprocessing for Test Data
x_test = train_data["Description"].astype(str).apply(lambda doc:doc.lower())

In [91]:
# Transform the Test Data using Progress Bar
with tqdm(total = 50 ,desc = "vectorizing test data") as pbar:
    x_test_vector = vectorizer.transform(x_test)
    pbar.update(50)

vectorizing test data: 100%|██████████| 50/50 [00:05<00:00,  9.07it/s]


In [92]:
# Predicting Genre on Test Data
with tqdm(total = 50 ,desc = "predicting the Test Data")as pbar:
    y_predict = MOC.predict(x_test_vector)
    pbar.update(50)
    

predicting the Test Data: 100%|██████████| 50/50 [00:00<00:00, 71.34it/s]


In [93]:
# Create the Data Frame for test data with movies name and predicted genre
test_movie_name = test_data["Title"]
predicted_genre = mlb.inverse_transform(y_predict)
test_result = pd.DataFrame({"Title":test_movie_name,"Predicted Genre":predicted_genre})
test_result


Unnamed: 0,Title,Predicted Genre
0,Edgar's Lunch (1998),()
1,La guerra de papá (1977),()
2,Off the Beaten Track (2010),()
3,Meu Amigo Hindu (2015),"( drama ,)"
4,Er nu zhai (1955),"( documentary ,)"
...,...,...
54195,"""Tales of Light & Dark"" (2013)","( documentary ,)"
54196,Der letzte Mohikaner (1965),()
54197,Oliver Twink (2007),"( drama ,)"
54198,Slipstream (1973),()


In [94]:
# Replacing unpredicted genre with fallback genre
test_result["Predicted Genre"] = test_result["Predicted Genre"].apply(lambda Genre : [fallback_genre] if len(Genre) == 0 else Genre)
test_result

Unnamed: 0,Title,Predicted Genre
0,Edgar's Lunch (1998),[unknown]
1,La guerra de papá (1977),[unknown]
2,Off the Beaten Track (2010),[unknown]
3,Meu Amigo Hindu (2015),"( drama ,)"
4,Er nu zhai (1955),"( documentary ,)"
...,...,...
54195,"""Tales of Light & Dark"" (2013)","( documentary ,)"
54196,Der letzte Mohikaner (1965),[unknown]
54197,Oliver Twink (2007),"( drama ,)"
54198,Slipstream (1973),[unknown]


In [95]:
# giving output in txt format:
with open("model_evaluation.txt","w",encoding = "utf-8")as output_file:
    for _, row in test_result.iterrows():
        TITLE = row["Title"]
        GENRE = ', '.join(row["Predicted Genre"])
        output_file.write(f"{TITLE}:::{GENRE}\n")


In [96]:
y_train_pred = MOC.predict(x_train_vector)
y_train_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [97]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
accuracy = accuracy_score(y_train,y_train_pred)
precision = precision_score(y_train,y_train_pred,average ="micro")
f1_score = f1_score(y_train,y_train_pred,average = "micro")
recall_score =  recall_score(y_train,y_train_pred ,average = "micro")
print(f"Your model accuracy is {accuracy*100:.2f}%\nYour model precision score is {precision}\nYour model f1_score is {f1_score}\nYour model recall score is {recall_score} ")

Your model accuracy is 27.74%
Your model precision score is 0.7216450832503202
Your model f1_score is 0.4041657257475326
Your model recall score is 0.2806826568265683 
