# **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, LancasterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB
import re

# **Loading Data**

In [None]:
# Column names handed to pd.read_csv through `names=` when the raw files are parsed.
columns = ['Title', 'Genre', 'Description']
# Title/Description subset; no cell visible in this notebook reads it.
column = ['Title', 'Description']

In [None]:
# Parse the ':::'-separated training file. A multi-character separator needs
# the python parsing engine; the first column becomes the index.
train_set = pd.read_csv(
    '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt',
    delimiter=':::',
    engine='python',
    names=columns,
    index_col=0,
)

In [None]:
# Parse the ':::'-separated test file (the variant that ships with labels),
# using the same column names and index handling as the training file.
test_set = pd.read_csv(
    '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data_solution.txt',
    delimiter=':::',
    engine='python',
    names=columns,
    index_col=0,
)

# **Data Preprocessing**

In [None]:
# Coerce every description to str and lower-case it, in both frames.
for frame in (train_set, test_set):
    frame.loc[:, 'Description'] = frame['Description'].astype(str).str.lower()

In [None]:
# The title column is removed in place from both frames; only the genre and
# description columns are used by the cells that follow.
for frame in (train_set, test_set):
    frame.drop(columns=['Title'], inplace=True)

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
train_set.head()

In [None]:
# Preview the first rows of the test frame (rendered as the cell output).
test_set.head()

In [None]:
# Print the training frame's column dtypes and non-null counts.
train_set.info()

In [None]:
# Summary statistics for the training frame's columns.
train_set.describe()

In [None]:
# Print the test frame's column dtypes and non-null counts.
test_set.info()

In [None]:
# Summary statistics for the test frame's columns.
test_set.describe()

In [None]:
# Report how many fully duplicated rows each frame holds (train first, then test).
for frame in (train_set, test_set):
    print(frame.duplicated().sum())

In [None]:
# Rebind both names to de-duplicated copies of their frames.
train_set, test_set = train_set.drop_duplicates(), test_set.drop_duplicates()

In [None]:
# Show the (rows, columns) of each frame after de-duplication.
for frame in (train_set, test_set):
    print(frame.shape)

In [None]:
# Bar chart of how many training rows fall into each genre.
plt.figure(figsize=(10, 6))
# Order the bars by frequency, most common genre first.
genre_order = train_set['Genre'].value_counts().index
sns.countplot(data=train_set, x='Genre', order=genre_order)
plt.title('Genre Distribution')
plt.xlabel('Genre')
plt.ylabel('Count')
# Vertical tick labels so the genre names do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Descriptions are the model input, genres are the target, for both splits.
x_train, y_train = train_set['Description'], train_set['Genre']
x_test, y_test = test_set['Description'], test_set['Genre']

In [None]:
# Training inputs and labels should report the same number of rows.
for part in (x_train, y_train):
    print(part.shape)

In [None]:
# Test inputs and labels should report the same number of rows.
for part in (x_test, y_test):
    print(part.shape)

# **Feature Engineering**

In [None]:
# Compiled once: the cleaner runs on every description in both data sets.
_MENTION_RE = re.compile(r'@\S+')
_URL_RE = re.compile(r'http\S+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]+')


def cleaning_data(text):
    """Reduce *text* to ASCII letters separated by single spaces.

    Steps, in order:
      1. remove ``@mention`` tokens (``@`` up to the next whitespace);
      2. remove anything that starts with ``http`` up to the next whitespace;
      3. replace every run of characters other than ``a-z``/``A-Z`` with one
         space (digits, punctuation, whitespace and non-ASCII letters);
      4. strip leading and trailing spaces.

    Args:
        text: The string to clean. Must be a ``str``.

    Returns:
        The cleaned string; ``""`` when nothing but separators remains.
    """
    text = _MENTION_RE.sub('', text)
    text = _URL_RE.sub('', text)
    # The quantifier belongs outside the character class. With '+' inside it,
    # '+' survived this step and was later deleted without leaving a space,
    # so "rock+roll" became "rockroll" instead of "rock roll".
    text = _NON_LETTER_RE.sub(' ', text)
    # Runs were already collapsed to one space above; only the ends need trimming.
    return text.strip()
# Run the text cleaner over every description in both splits.
x_train, x_test = x_train.apply(cleaning_data), x_test.apply(cleaning_data)

In [None]:
# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on all training rows before the
# validation split made further down, so its IDF weights also reflect the
# validation documents — confirm this is acceptable.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
# Learn the vocabulary from the training text only; the test text is merely projected.
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

# **Splitting the Dataset**

In [None]:
# Hold out 20% of the vectorised training rows for validation; the fixed seed
# makes the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# **Model Training**

In [None]:
# Multinomial Naive Bayes with default settings, fitted on the training part of the split.
nb = MultinomialNB()
nb.fit(X_train, Y_train)

# **Model Evaluation**

In [None]:
# Predicted genres for the held-out validation rows.
y_pred = nb.predict(X_val)

In [None]:
# Score the predictions made for the validation split (Y_val vs. y_pred).
# These are validation metrics, so the printed labels say so; they were
# previously labelled "Train" although no training rows are scored here.
accuracy_val = accuracy_score(Y_val, y_pred)
# Old name kept so any later cell that still reads `accuracy_train` keeps working.
accuracy_train = accuracy_val
print("Validation accuracy:", accuracy_val)
print("Validation classification report:")
# zero_division=0 reports 0 for genres that were never predicted.
print(classification_report(Y_val, y_pred, zero_division=0))

In [None]:
# Final check on the separate test file: predict, then report accuracy and
# per-genre precision/recall/F1.
y_pred_test = nb.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("\nTest accuracy:", accuracy_test)
print("Test classification report:")
# zero_division=0 reports 0 for genres that were never predicted.
test_report = classification_report(y_test, y_pred_test, zero_division=0)
print(test_report)