# MOVIE GENRE CLASSIFICATION

 Create a machine learning model that can predict the genre of a
 movie based on its plot summary or other textual information. You
 can use techniques like TF-IDF or word embeddings with classifiers
 such as Naive Bayes, Logistic Regression, or Support Vector
 Machines.

In [1]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score

In [2]:
## train data
df_train = pd.read_csv("train_data.txt", sep=':::',header=None )
df_train.columns=['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
df_train.dropna(subset=['DESCRIPTION', 'GENRE'], inplace=True)

df_test = pd.read_csv("test_data_solution.txt", sep=':::',header=None )
df_test.columns=['ID', 'TITLE', 'GENRE' , 'DESCRIPTION']


  df_train = pd.read_csv("train_data.txt", sep=':::',header=None )
  df_test = pd.read_csv("test_data_solution.txt", sep=':::',header=None )


In [3]:
## Feature Selection

tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf.fit_transform(df_train['DESCRIPTION'])

# Target labels
y_train = df_train['GENRE']


X_test = tfidf.transform(df_test['DESCRIPTION'])
y_test=df_test['GENRE']


In [None]:
## Model training and metric evaluation

models={
    "Logistic-Regression" : LogisticRegression(max_iter=100),
    "Naive-Bayes" : MultinomialNB(),
    "Support-Vector-Machine" : SVC(kernel='linear')
}


for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    
    
    
    ## Make Prediction
    y_predict_train=model.predict(X_train)
    y_predict_test=model.predict(X_test)
    
    
    ## Train set performance
    y_predict_train_accuracy=accuracy_score(y_train,y_predict_train)
    y_predict_train_f1_score=f1_score(y_train,y_predict_train,average='weighted',zero_division=0)
    y_predict_train_precision=precision_score(y_train,y_predict_train,average='weighted',zero_division=0)
    y_predict_train_recall=recall_score(y_train,y_predict_train,average='weighted',zero_division=0)
    
    ## Test set performance
    y_predict_test_accuracy=accuracy_score(y_test,y_predict_test)
    y_predict_test_f1_score=f1_score(y_test,y_predict_test,average='weighted')
    y_predict_test_precision=precision_score(y_test,y_predict_test,average='weighted')
    y_predict_test_recall=recall_score(y_test,y_predict_test,average='weighted')
    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(y_predict_train_accuracy))
    print('- F1 score: {:.4f}'.format(y_predict_train_f1_score))
    
    print('- Precision: {:.4f}'.format(y_predict_train_precision))
    print('- Recall: {:.4f}'.format(y_predict_train_recall))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(y_predict_test_accuracy))
    print('- F1 score: {:.4f}'.format(y_predict_test_f1_score))
    print('- Precision: {:.4f}'.format(y_predict_test_precision))
    print('- Recall: {:.4f}'.format(y_predict_test_recall))

    
    print('='*35)
    print('\n')
    
    


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Logistic-Regression
Model performance for Training set
- Accuracy: 0.6610
- F1 score: 0.6287
- Precision: 0.6597
- Recall: 0.6610
----------------------------------
Model performance for Test set
- Accuracy: 0.5842
- F1 score: 0.5457
- Precision: 0.5643
- Recall: 0.5842




  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Naive-Bayes
Model performance for Training set
- Accuracy: 0.5528
- F1 score: 0.4837
- Precision: 0.5799
- Recall: 0.5528
----------------------------------
Model performance for Test set
- Accuracy: 0.5239
- F1 score: 0.4484
- Precision: 0.5080
- Recall: 0.5239


