**importing libraries**

In [1]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

**reading data and EDA**

In [2]:
# Load the labelled training records. The path is relative, so it is resolved
# against the kernel's current working directory.
# The separator ':::' is longer than one character, so pandas treats it as a
# regular expression, which is why engine='python' is requested explicitly.
# Column names are supplied via `names`, so the file's first line is read as data.
# NOTE(review): if the raw file pads the delimiter with spaces (' ::: '), the parsed
# strings keep that whitespace — confirm and strip the columns if so.
train_data = pd.read_csv('train_data.txt',sep=':::',names=['ID','TITLE','GENRE','DESCRIPTION'],engine='python')

FileNotFoundError: [Errno 2] No such file or directory: 'train_data.txt'

In [None]:
train_data

In [None]:
train_data.info()

In [None]:
train_data.describe()

In [None]:
train_data.isnull().sum()

In [None]:
# Load the unlabelled test records (no GENRE column). The path is relative, so it
# is resolved against the kernel's current working directory.
# The multi-character separator ':::' is interpreted by pandas as a regular
# expression, which is why engine='python' is requested explicitly.
# NOTE(review): if the raw file pads the delimiter with spaces (' ::: '), the parsed
# strings keep that whitespace — confirm and strip the columns if so.
test_data = pd.read_csv('test_data.txt',sep=':::',names=['ID','TITLE','DESCRIPTION'],engine='python')

In [None]:
test_data

In [None]:
test_data.info()

In [None]:
test_data.describe()

In [None]:
test_data.isnull().sum()

**Visualizing the distribution of genre**

In [None]:
# Vertical bar chart of how many training rows carry each genre label.
# value_counts() orders the genres from most to least frequent.
val_counts = train_data['GENRE'].value_counts()

fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(x=val_counts.index, y=val_counts, color='lightgreen', ax=ax)
ax.set_xlabel("GENRE", fontweight='bold')
ax.set_ylabel("COUNT", fontweight='bold')
ax.set_title("distribution of genre feature", fontweight='bold')
# Genre names are long, so rotate the tick labels; the trailing ';' hides the repr.
plt.xticks(rotation=90, fontweight='bold');

In [None]:
# Horizontal count plot of the same genre frequencies, one bar per genre,
# listed from the most common genre down to the least common.
genre_order = train_data['GENRE'].value_counts().index

plt.figure(figsize=(14, 10))
sns.countplot(
    data=train_data,
    y='GENRE',
    order=genre_order,
    palette='viridis',
)
plt.xlabel('COUNT', fontweight='bold')
plt.ylabel('GENRE', fontweight='bold')

In [None]:
# Stemmer instance; it is not referenced by any other cell visible in this notebook.
stemmer = LancasterStemmer()
# English stop words as a set for fast membership tests in text_cleaning().
# The set is read through `nltk.corpus.stopwords` rather than the bare name
# `stopwords`: this assignment rebinds `stopwords` from the imported corpus
# reader to a plain set, so `stopwords.words(...)` would raise AttributeError
# the second time the cell is executed.
stopwords = set(nltk.corpus.stopwords.words('english'))

**data preprocessing**

In [None]:
def text_cleaning(data):
    """Normalize one raw description into a string of space-separated tokens.

    Steps, in order:
      1. lower-case the text;
      2. drop @mentions, tokens starting with ``http`` and ``pic.<...>`` links;
      3. replace every character other than letters, ``+`` and ``'`` with a space;
      4. drop isolated single letters;
      5. remove all remaining punctuation characters;
      6. tokenize with ``nltk.word_tokenize`` and keep tokens that are longer
         than two characters and not in the module-level ``stopwords`` set;
      7. collapse repeated whitespace and strip the ends.

    Parameters
    ----------
    data : str
        Raw text. Any non-string value (for example a NaN produced by a
        missing field) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives the filters.
    """
    # Missing values arrive as float NaN and have no .lower(); treat them as empty.
    if not isinstance(data, str):
        return ''

    data = data.lower()
    data = re.sub(r'@\S+', '', data)
    data = re.sub(r'http\S+', '', data)
    # The dot must be escaped: an unescaped '.' matches any character, which
    # deleted ordinary words such as "picture" or "epic tale".
    data = re.sub(r'pic\.\S+', '', data)
    data = re.sub(r"[^a-zA-Z+']", ' ', data)
    # The appended space lets a single letter at the very end match as well.
    data = re.sub(r'\s+[a-zA-Z]\s+', ' ', data + ' ')
    data = "".join(ch for ch in data if ch not in string.punctuation)
    words = nltk.word_tokenize(data)
    data = " ".join(w for w in words if w not in stopwords and len(w) > 2)
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    data = re.sub(r"\s[\s]+", " ", data).strip()
    return data

In [None]:
# Store the cleaned description next to the raw one, for both the training
# and the test frame, under the same 'Cleaned_Des' column name.
for frame in (train_data, test_data):
    frame['Cleaned_Des'] = frame['DESCRIPTION'].apply(text_cleaning)

In [None]:
train_data

In [None]:
# Print the frame's shape before and after removing duplicated rows.
# drop_duplicates() without `subset` compares every column, so a row is only
# removed when it matches another row in all of them (ID and Cleaned_Des included).
# NOTE(review): if ID is unique per row this can never drop anything — consider
# subset=['TITLE', 'GENRE', 'DESCRIPTION']; confirm against the data.
print("shape before droping duplicates",train_data.shape)
train_data = train_data.drop_duplicates()
print("shape after droping duplicates",train_data.shape)

**Visualizing the preprocessed description and its length**

In [None]:
# Character count of every cleaned description, kept as its own column.
train_data['length_CleanDes'] = train_data['Cleaned_Des'].apply(len)

# Histogram (20 bins) of those lengths with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(14, 10))
sns.histplot(
    data=train_data,
    x='length_CleanDes',
    bins=20,
    kde=True,
    color='lightgreen',
    ax=ax,
)
ax.set_xlabel('Length', fontweight='bold')
ax.set_ylabel('Frequency', fontweight='bold')
ax.set_title('Distribution of Lengths', fontweight='bold')
plt.show()

In [None]:
# Side-by-side histograms of description length (in characters) before and
# after cleaning. Both panels use 100-character-wide bins starting at 0.
original_lengths = train_data['DESCRIPTION'].apply(len)
cleaned_lengths = train_data['Cleaned_Des'].apply(len)

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = [
    (original_lengths, 'blue', 'Original Text Length'),
    (cleaned_lengths, 'lightgreen', 'Cleaned Text Length'),
]
for ax, (lengths, colour, heading) in zip(axes, panels):
    # Extend the last bin edge past the longest text so it is not cut off.
    ax.hist(lengths, bins=range(0, max(lengths) + 100, 100), color=colour, alpha=0.7)
    ax.set_title(heading)
    ax.set_xlabel('Text Length')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

**Encoding the target genre and splitting the data before vectorizing the text**

In [None]:
# Turn each genre string into an integer id. class_names keeps the genres in
# encoder order, so class_names[i] is the genre that was encoded as i.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['GENRE'])
train_data['Genre_encoded'] = label_encoder.transform(train_data['GENRE'])
class_names = list(label_encoder.classes_)
class_names

In [None]:
train_data['Genre_encoded']

In [None]:
# Features are the cleaned descriptions, targets are the encoded genres.
# 20% of the rows are held out for validation; the fixed random_state makes
# the split repeatable across runs.
x, y = train_data['Cleaned_Des'], train_data['Genre_encoded']
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Using the TF-IDF vectorizer

**Training three classifiers on the TF-IDF features: Multinomial Naive Bayes, Logistic Regression and Linear SVC**

In [None]:
# The vocabulary and IDF weights are learned from the training split only;
# the validation and test descriptions are then projected with that same
# fitted vectorizer.
tfidf = TfidfVectorizer()
x_tf_train = tfidf.fit_transform(x_train)
x_tf_val, x_tf_test = (
    tfidf.transform(docs) for docs in (x_val, test_data['Cleaned_Des'])
)

In [None]:
classifier1=MultinomialNB()
classifier1.fit(x_tf_train,y_train)

In [None]:
y_pred = classifier1.predict(x_tf_val)

In [None]:
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

In [None]:
print(classification_report(y_val, y_pred,zero_division=1))

In [None]:
# Confusion matrix of `y_pred` against the validation labels, drawn as a heatmap.
# `labels` pins one row/column per encoded genre (0 .. len(class_names) - 1) so the
# matrix always has the same size and order as the class_names tick labels, even
# when a genre is absent from both y_val and the predictions.
cm = confusion_matrix(y_val, y_pred, labels=np.arange(len(class_names)))
plt.figure(figsize=(15, 15))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap')
plt.show()

In [None]:
# Logistic regression with scikit-learn's default settings, trained on the
# TF-IDF features of the training split.
# NOTE(review): the default iteration limit may be too low for a vocabulary-sized
# feature space — if a ConvergenceWarning appears, raise max_iter; confirm on a run.
classifier2 = LogisticRegression()
classifier2.fit(x_tf_train,y_train)

In [None]:
y_pred1 = classifier2.predict(x_tf_val)

In [None]:
accuracy = accuracy_score(y_val, y_pred1)
print("Validation Accuracy:", accuracy)

In [None]:
print(classification_report(y_val, y_pred1,zero_division=1))

In [None]:
# Confusion matrix of `y_pred1` against the validation labels, drawn as a heatmap.
# `labels` pins one row/column per encoded genre (0 .. len(class_names) - 1) so the
# matrix always has the same size and order as the class_names tick labels, even
# when a genre is absent from both y_val and the predictions.
cm = confusion_matrix(y_val, y_pred1, labels=np.arange(len(class_names)))
plt.figure(figsize=(15, 15))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap')
plt.show()

In [None]:
classifier3 = LinearSVC()
classifier3.fit(x_tf_train,y_train)

In [None]:
y_pred2 = classifier3.predict(x_tf_val)

In [None]:
accuracy = accuracy_score(y_val, y_pred2)
print("Validation Accuracy:", accuracy)

In [None]:
print(classification_report(y_val, y_pred2,zero_division=1))

In [None]:
# Confusion matrix of `y_pred2` against the validation labels, drawn as a heatmap.
# `labels` pins one row/column per encoded genre (0 .. len(class_names) - 1) so the
# matrix always has the same size and order as the class_names tick labels, even
# when a genre is absent from both y_val and the predictions.
cm = confusion_matrix(y_val, y_pred2, labels=np.arange(len(class_names)))
plt.figure(figsize=(15, 15))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap')
plt.show()

# using CountVectorizer

**Training the same three classifiers on raw token counts: Multinomial Naive Bayes, Logistic Regression and Linear SVC**

In [None]:
# The vocabulary is learned from the training split only; the validation and
# test descriptions are then converted to token counts with that same fitted
# vectorizer.
cv = CountVectorizer()
x_cv_train = cv.fit_transform(x_train)
x_cv_val, x_cv_test = (
    cv.transform(docs) for docs in (x_val, test_data['Cleaned_Des'])
)

In [None]:
classifier4=MultinomialNB()
classifier4.fit(x_cv_train,y_train)

In [None]:
# Validation predictions of the count-based Naive Bayes model (classifier4).
# NOTE: this rebinds `y_pred`, which the TF-IDF section also assigns; re-running an
# earlier evaluation cell after this one will score these predictions instead.
y_pred = classifier4.predict(x_cv_val)

In [None]:
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

In [None]:
print(classification_report(y_val, y_pred,zero_division=1))

In [None]:
# Confusion matrix of `y_pred` against the validation labels, drawn as a heatmap.
# `labels` pins one row/column per encoded genre (0 .. len(class_names) - 1) so the
# matrix always has the same size and order as the class_names tick labels, even
# when a genre is absent from both y_val and the predictions.
cm = confusion_matrix(y_val, y_pred, labels=np.arange(len(class_names)))
plt.figure(figsize=(15, 15))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap')
plt.show()

In [None]:
# Logistic regression with scikit-learn's default settings, trained on the
# raw token counts of the training split.
# NOTE(review): unscaled count features combined with the default iteration limit
# may stop before convergence — if a ConvergenceWarning appears, raise max_iter;
# confirm on a run.
classifier5 = LogisticRegression()
classifier5.fit(x_cv_train,y_train)

In [None]:
# Validation predictions of the count-based logistic regression (classifier5).
# NOTE: this rebinds `y_pred1`, which the TF-IDF section also assigns; re-running an
# earlier evaluation cell after this one will score these predictions instead.
y_pred1 = classifier5.predict(x_cv_val)

In [None]:
accuracy = accuracy_score(y_val, y_pred1)
print("Validation Accuracy:", accuracy)

In [None]:
print(classification_report(y_val, y_pred1,zero_division=1))

In [None]:
# Confusion matrix of `y_pred1` against the validation labels, drawn as a heatmap.
# `labels` pins one row/column per encoded genre (0 .. len(class_names) - 1) so the
# matrix always has the same size and order as the class_names tick labels, even
# when a genre is absent from both y_val and the predictions.
cm = confusion_matrix(y_val, y_pred1, labels=np.arange(len(class_names)))
plt.figure(figsize=(15, 15))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap')
plt.show()

In [None]:
classifier6 = LinearSVC()
classifier6.fit(x_cv_train,y_train)

In [None]:
# Validation predictions of the count-based linear SVC (classifier6).
# NOTE: this rebinds `y_pred2`, which the TF-IDF section also assigns; re-running an
# earlier evaluation cell after this one will score these predictions instead.
y_pred2 = classifier6.predict(x_cv_val)

In [None]:
accuracy = accuracy_score(y_val, y_pred2)
print("Validation Accuracy:", accuracy)

In [None]:
print(classification_report(y_val, y_pred2,zero_division=1))

In [None]:
# Confusion matrix of `y_pred2` against the validation labels, drawn as a heatmap.
# `labels` pins one row/column per encoded genre (0 .. len(class_names) - 1) so the
# matrix always has the same size and order as the class_names tick labels, even
# when a genre is absent from both y_val and the predictions.
cm = confusion_matrix(y_val, y_pred2, labels=np.arange(len(class_names)))
plt.figure(figsize=(15, 15))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap')
plt.show()