In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
# Load the labelled training records. Fields are delimited by ':::'; a
# multi-character separator is handled by pandas' python parsing engine.
# NOTE(review): the preview's index starts at 1, so the file presumably carries
# a leading ID field that pandas picks up as the row index -- confirm.
train_data = pd.read_csv(
    "train_data.txt",
    sep=':::',
    engine='python',
    names=['Title', 'Genre', 'Description'],
)
train_data.head()

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [2]:
# Load the unlabelled test records (title and description only), using the
# same ':::' delimiter and python parsing engine as the training file.
test_data = pd.read_csv(
    "test_data.txt",
    sep=':::',
    engine='python',
    names=['Title', 'Description'],
)
test_data.head()

Unnamed: 0,Title,Description
1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),Before he was known internationally as a mart...


In [3]:
# Load the ground-truth genres for the test records; the Genre column of this
# frame is what the models are scored against later.
solution = pd.read_csv(
    "test_data_solution.txt",
    sep=':::',
    engine='python',
    names=['Title', 'Genre', 'Description'],
)
solution

Unnamed: 0,Title,Genre,Description
1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),drama,Before he was known internationally as a mart...
...,...,...,...
54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Da..."
54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their...
54198,Oliver Twink (2007),adult,A movie 169 years in the making. Oliver Twist...
54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard..."


In [4]:
# Test features are read from `test_data` while test labels are read from
# `solution`. These are two separate files, so verify they describe the same
# movies in the same order; a silent misalignment would invalidate every score.
assert len(test_data) == len(solution), (
    "test_data and solution have different numbers of rows"
)
# Compare titles with surrounding whitespace removed so the check does not
# depend on how the ':::' delimiter padding was parsed.
assert (
    test_data['Title'].str.strip().to_numpy()
    == solution['Title'].str.strip().to_numpy()
).all(), "test_data and solution rows are not aligned by Title"

# Raw description text is the model input; genre strings are the targets.
X_train = train_data['Description'].values
y_train = train_data['Genre'].values
X_test = test_data['Description'].values
y_test = solution['Genre'].values

In [5]:
# Turn descriptions into TF-IDF vectors: English stop words are dropped and
# the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
# The vocabulary and IDF weights are learned from the training text only;
# the test text is projected with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [6]:
# Map genre strings to integer class ids. The mapping is learned from the
# training labels and then applied unchanged to both label arrays.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [7]:
# Multinomial Naive Bayes on the TF-IDF features (fit returns the estimator,
# so construction and training are chained), then predict the test set.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train_encoded)
y_pred_nb = nb_model.predict(X_test_tfidf)

In [8]:
# Logistic regression on the TF-IDF features, with the solver's iteration
# limit set to 200; train, then predict the test set.
lr_model = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train_encoded)
y_pred_lr = lr_model.predict(X_test_tfidf)

In [9]:
# Support vector classifier with a linear kernel on the TF-IDF features.
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples; with tens of thousands of rows
# this cell is likely the slowest in the notebook -- consider LinearSVC if
# the change in results is acceptable.
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train_encoded)
y_pred_svm = svm_model.predict(X_test_tfidf)

In [20]:
# Evaluate Naive Bayes: overall accuracy plus per-genre precision/recall/F1.
print("Naive Bayes Classifier:")
print(f"Accuracy: {accuracy_score(y_test_encoded, y_pred_nb)}")
# Some genres are never predicted by this model, which makes their precision
# undefined. zero_division=0 reports those scores as 0 explicitly instead of
# emitting an UndefinedMetricWarning for each affected metric.
print(
    classification_report(
        y_test_encoded,
        y_pred_nb,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)

Naive Bayes Classifier:
Accuracy: 0.5238560885608856
               precision    recall  f1-score   support

      action        0.55      0.11      0.18      1314
       adult        0.51      0.06      0.11       590
   adventure        0.81      0.07      0.13       775
   animation        0.00      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.51      0.42      0.46      7446
       crime        0.00      0.00      0.00       505
 documentary        0.57      0.87      0.69     13096
       drama        0.46      0.82      0.59     13612
      family        0.50      0.00      0.00       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.98      0.32      0.48       193
     history        0.00      0.00      0.00       243
      horror        0.69      0.36      0.47      2204
       music        0.74      0.15      0.25       731
     musical        0.00      0.00      0.00       276
     myster

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
# Evaluate Logistic Regression: overall accuracy plus per-genre
# precision/recall/F1.
print("Logistic Regression Classifier:")
print(f"Accuracy: {accuracy_score(y_test_encoded, y_pred_lr)}")
# Some genres are never predicted by this model, which makes their precision
# undefined. zero_division=0 reports those scores as 0 explicitly instead of
# emitting an UndefinedMetricWarning for each affected metric.
print(
    classification_report(
        y_test_encoded,
        y_pred_lr,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)

Logistic Regression Classifier:
Accuracy: 0.5840590405904059
               precision    recall  f1-score   support

      action        0.48      0.29      0.36      1314
       adult        0.60      0.24      0.34       590
   adventure        0.59      0.17      0.26       775
   animation        0.53      0.07      0.12       498
   biography        0.00      0.00      0.00       264
      comedy        0.53      0.58      0.55      7446
       crime        0.37      0.04      0.08       505
 documentary        0.67      0.85      0.75     13096
       drama        0.54      0.77      0.64     13612
      family        0.50      0.09      0.15       783
     fantasy        0.56      0.06      0.10       322
   game-show        0.92      0.51      0.65       193
     history        0.00      0.00      0.00       243
      horror        0.64      0.57      0.60      2204
       music        0.67      0.45      0.54       731
     musical        0.33      0.02      0.04       276
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# Evaluate SVM: overall accuracy plus per-genre precision/recall/F1.
print("Support Vector Machine Classifier:")
print(f"Accuracy: {accuracy_score(y_test_encoded, y_pred_svm)}")
# Some genres are never predicted by this model, which makes their precision
# undefined. zero_division=0 reports those scores as 0 explicitly instead of
# emitting an UndefinedMetricWarning for each affected metric.
print(
    classification_report(
        y_test_encoded,
        y_pred_svm,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)

Support Vector Machine Classifier:
Accuracy: 0.5856273062730627
               precision    recall  f1-score   support

      action        0.42      0.35      0.38      1314
       adult        0.57      0.36      0.44       590
   adventure        0.50      0.21      0.30       775
   animation        0.42      0.13      0.19       498
   biography        0.00      0.00      0.00       264
      comedy        0.52      0.58      0.55      7446
       crime        0.26      0.05      0.09       505
 documentary        0.68      0.84      0.75     13096
       drama        0.55      0.75      0.64     13612
      family        0.44      0.10      0.16       783
     fantasy        0.44      0.11      0.17       322
   game-show        0.83      0.61      0.70       193
     history        0.00      0.00      0.00       243
      horror        0.64      0.59      0.62      2204
       music        0.66      0.49      0.56       731
     musical        0.35      0.05      0.08       276


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
