**Importing Important Libarary**

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

**WORKING WITH TRAINED DATA**

In [3]:
# loading the training data set from "train_data.txt"
try:
    with tqdm(total = 50 , desc = "loading traing data")as pbar:
        train_data =  pd.read_csv("train_data.txt",sep= ':::',header = None , names = ["serial number ","Title","Genre","Description"],engine = 'python')
        pbar.update(50)
except:
    print("error in loading traing data")
    raise
    
    

loading traing data:   0%|          | 0/50 [00:00<?, ?it/s]

In [4]:
# some data of the training set
train_data

Unnamed: 0,serial number,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
54195,54196,Segregating the Greatest Generation (2006),documentary,"In this ""concept"" essay documentary, 16 parti..."
54196,54197,Vulture Canyon (????),drama,Vulture Canyon is the only juvenile girls pri...
54197,54198,The Burglar (2016),drama,"Alex, a young woman aged 18, lives with her m..."
54198,54199,Dervis i smrt (1974),drama,Ahmet Nurudin is a dervish and head of the Is...


**List of Genre in Data**

In [5]:
Genre_list = [' drama ', ' thriller ', ' adult ', ' documentary ', ' comedy ',' crime ', ' reality-tv ', ' horror ', ' sport ', ' animation ',' action ', ' fantasy ', ' short ', ' sci-fi ', ' music ',' adventure ', ' talk-show ', ' western ', ' family ', ' mystery ',' history ', ' news ', ' biography ', ' romance ', ' game-show ',' musical ', ' war ']

# unkown - Genre meaning "the gnere that model can't predict" 
fallback_genre = "unknown"

**Importing MultiLabelBinarizer  and TfidfVectorizer**

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
x_train = train_data["Description"].astype(str).apply(lambda doc:doc.lower())
label = [genre.split(', ') for genre in train_data["Genre"]]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(label)
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 5000)
with tqdm(total = 50 ,desc = "vectorizing traing data") as pbar:
    x_train_vector = vectorizer.fit_transform(x_train)
    pbar.update(50)


vectorizing traing data:   0%|          | 0/50 [00:00<?, ?it/s]

**Importing Logistic Regression and MultiOutputClassifier**

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
with tqdm(total = 50 ,desc = "Training the Model")as pbar:
    LR = LogisticRegression(max_iter=1000, class_weight='balanced')
    MOC = MultiOutputClassifier(LR)
    MOC.fit(x_train_vector,y_train)
    pbar.update(50)
    

Training the Model:   0%|          | 0/50 [00:00<?, ?it/s]

**WORKING WITH TEST DATA**

In [29]:
# LOADING THE TEST DATA
try:
    with tqdm(total = 50 , desc = "loading Test data")as pbar:
        test_data =  pd.read_csv("test_data.txt",sep= ':::',header = None , names = ["serial number ","Title","Description"],engine='python')
        pbar.update(50)
except:
    print("error in loading test data")
    raise
    

loading Test data:   0%|          | 0/50 [00:00<?, ?it/s]

In [12]:
test_data

Unnamed: 0,serial number,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [13]:
# Data Preprocessing for Test Data
x_test = train_data["Description"].astype(str).apply(lambda doc:doc.lower())

In [14]:
# Transform the Test Data using Progress Bar
with tqdm(total = 50 ,desc = "vectorizing test data") as pbar:
    x_test_vector = vectorizer.transform(x_test)
    pbar.update(50)

vectorizing test data:   0%|          | 0/50 [00:00<?, ?it/s]

**Predicting Genre on Test Data**

In [15]:
with tqdm(total = 50 ,desc = "predicting the Test Data")as pbar:
    y_predict = MOC.predict(x_test_vector)
    pbar.update(50)
    

predicting the Test Data:   0%|          | 0/50 [00:00<?, ?it/s]

In [16]:
# Create the Data Frame for test data with movies name and predicted genre
test_movie_name = test_data["Title"]
predicted_genre = mlb.inverse_transform(y_predict)
test_result = pd.DataFrame({"Title":test_movie_name,"Predicted Genre":predicted_genre})
test_result


Unnamed: 0,Title,Predicted Genre
0,Edgar's Lunch (1998),"( comedy , drama )"
1,La guerra de papá (1977),"( horror , thriller )"
2,Off the Beaten Track (2010),"( adult ,)"
3,Meu Amigo Hindu (2015),"( drama , thriller )"
4,Er nu zhai (1955),"( documentary , short )"
...,...,...
54195,"""Tales of Light & Dark"" (2013)","( documentary , war )"
54196,Der letzte Mohikaner (1965),"( drama ,)"
54197,Oliver Twink (2007),"( drama ,)"
54198,Slipstream (1973),"( drama , history )"


In [17]:
# Replacing unpredicted genre with fallback genre
test_result["Predicted Genre"] = test_result["Predicted Genre"].apply(lambda Genre : [fallback_genre] if len(Genre) == 0 else Genre)
test_result

Unnamed: 0,Title,Predicted Genre
0,Edgar's Lunch (1998),"( comedy , drama )"
1,La guerra de papá (1977),"( horror , thriller )"
2,Off the Beaten Track (2010),"( adult ,)"
3,Meu Amigo Hindu (2015),"( drama , thriller )"
4,Er nu zhai (1955),"( documentary , short )"
...,...,...
54195,"""Tales of Light & Dark"" (2013)","( documentary , war )"
54196,Der letzte Mohikaner (1965),"( drama ,)"
54197,Oliver Twink (2007),"( drama ,)"
54198,Slipstream (1973),"( drama , history )"


**Giving output in Txt Format**

In [18]:
# giving output in txt format:
with open("model_evaluation.txt","w",encoding = "utf-8")as output_file:
    for _, row in test_result.iterrows():
        TITLE = row["Title"]
        GENRE = ', '.join(row["Predicted Genre"])
        output_file.write(f"{TITLE}:::{GENRE}\n")


In [19]:
y_train_pred = MOC.predict(x_train_vector)
y_train_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
accuracy = accuracy_score(y_train,y_train_pred)
precision = precision_score(y_train,y_train_pred,average ="micro")
f1_score = f1_score(y_train,y_train_pred,average = "micro")
recall_score =  recall_score(y_train,y_train_pred ,average = "micro")
print(f"Your model accuracy is {accuracy*100:.2f}%\nYour model precision score is {precision}\nYour model f1_score is {f1_score}\nYour model recall score is {recall_score}")

In [21]:
output = pd.read_csv("model_evaluation.txt",sep= ':::',header = None , names = ["Title","Genre"],engine = 'python')
output

Unnamed: 0,Title,Genre
0,Edgar's Lunch (1998),"comedy , drama"
1,La guerra de papá (1977),"horror , thriller"
2,Off the Beaten Track (2010),adult
3,Meu Amigo Hindu (2015),"drama , thriller"
4,Er nu zhai (1955),"documentary , short"
...,...,...
54195,"""Tales of Light & Dark"" (2013)","documentary , war"
54196,Der letzte Mohikaner (1965),drama
54197,Oliver Twink (2007),drama
54198,Slipstream (1973),"drama , history"
