IMPORTING LIBRARIES

In [23]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectFromModel


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LOADING DATASET

In [24]:
train_data = pd.read_csv("C:/Users/User/Downloads/Genre Classification Dataset/train_data.txt", sep = ":::", engine = "python", names = ["ID","Title","Genre","Description"])
test_data = pd.read_csv("C:/Users/User/Downloads/Genre Classification Dataset/test_data.txt", sep = ":::", engine = "python", names =["ID","Title","Description"])
train_data.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


CHECKING FOR MISSING DATA

In [25]:
train_data.isnull().sum()
test_data.isnull().sum()
train_data.duplicated().sum()
test_data.duplicated().sum()

np.int64(0)

MAKING GENRES OF THE SAME CHARACTER

In [26]:
train_data["Gnere"] = train_data["Genre"].str.strip().str.lower()
print("Gneres after cleaning: ",train_data["Genre"].unique)

Gneres after cleaning:  <bound method Series.unique of 0               drama 
1            thriller 
2               adult 
3               drama 
4               drama 
             ...      
54209          comedy 
54210          horror 
54211     documentary 
54212          comedy 
54213         history 
Name: Genre, Length: 54214, dtype: object>


CLEANING THE DATASET

In [27]:
stop_words = set(stopwords.words("english"))
def cleaned_set(text):
    if isinstance(text,str):
        text = text.lower()
        text = re.sub(r"\W"," ",text)
        text = re.sub(r"\s+"," ",text).strip()
        text = re.sub(r"\d+"," ",text)
        words = text.split()
        words = [word for word in words if word not in stop_words]
        return " ".join(words)
    return ""
train_data["new_Description"] = train_data["Description"].apply(cleaned_set)
test_data["new_Description"] = test_data["Description"].apply(cleaned_set)

In [28]:
print(train_data.shape)
print(test_data.shape)
print(train_data[["Description","new_Description"]].head())

(54214, 6)
(54200, 4)
                                         Description  \
0   Listening in to a conversation between his do...   
1   A brother and sister with a past incestuous r...   
2   As the bus empties the students for their fie...   
3   To help their unemployed father make ends mee...   
4   The film's title refers not only to the un-re...   

                                     new_Description  
0  listening conversation doctor parents year old...  
1  brother sister past incestuous relationship cu...  
2  bus empties students field trip museum natural...  
3  help unemployed father make ends meet edith tw...  
4  film title refers un recovered bodies ground z...  


In [29]:
print(test_data)

          ID                             Title  \
0          1             Edgar's Lunch (1998)    
1          2         La guerra de papá (1977)    
2          3      Off the Beaten Track (2010)    
3          4           Meu Amigo Hindu (2015)    
4          5                Er nu zhai (1955)    
...      ...                               ...   
54195  54196   "Tales of Light & Dark" (2013)    
54196  54197      Der letzte Mohikaner (1965)    
54197  54198              Oliver Twink (2007)    
54198  54199                Slipstream (1973)    
54199  54200        Curitiba Zero Grau (2010)    

                                             Description  \
0       L.R. Brane loves his life - his car, his apar...   
1       Spain, March 1964: Quico is a very naughty ch...   
2       One year in the life of Albin and his family ...   
3       His father has died, he hasn't spoken with hi...   
4       Before he was known internationally as a mart...   
...                                    

ENCODING GENRE LABEL

In [30]:
train_data['Genre'] = train_data['Genre'].str.strip().str.lower()
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_data["Genre"])
print(y_train[:10])
genre_position = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(genre_position)

[ 8 24  1  8  8  7  5  6 18 13]
{'action': np.int64(0), 'adult': np.int64(1), 'adventure': np.int64(2), 'animation': np.int64(3), 'biography': np.int64(4), 'comedy': np.int64(5), 'crime': np.int64(6), 'documentary': np.int64(7), 'drama': np.int64(8), 'family': np.int64(9), 'fantasy': np.int64(10), 'game-show': np.int64(11), 'history': np.int64(12), 'horror': np.int64(13), 'music': np.int64(14), 'musical': np.int64(15), 'mystery': np.int64(16), 'news': np.int64(17), 'reality-tv': np.int64(18), 'romance': np.int64(19), 'sci-fi': np.int64(20), 'short': np.int64(21), 'sport': np.int64(22), 'talk-show': np.int64(23), 'thriller': np.int64(24), 'war': np.int64(25), 'western': np.int64(26)}


SPLITTING THE DATASET

In [31]:
X = train_data["new_Description"]
y = y_train
X_train_split,X_val,y_train_split,y_val = train_test_split(X,y,train_size = 0.8, test_size = 0.2,random_state =42)
print("Training set shape: ", len(X_train_split),len(y_train_split))
print("Validation test shape: ", len(X_val),len(y_val))
print("Test shape given to us: ",test_data.shape)

Training set shape:  43371 43371
Validation test shape:  10843 10843
Test shape given to us:  (54200, 4)


FEATURE SCALING

In [32]:
ftf = TfidfVectorizer(
token_pattern = r"(?u)\b\w+\b",
lowercase = True,
decode_error= 'ignore',
max_features = 40000,
ngram_range = (1,3),
use_idf = True,
smooth_idf = True,
sublinear_tf = True,

min_df = 1,
max_df = 1.0
)

X_train_trans = ftf.fit_transform(X_train_split)
X_val_trans = ftf.transform(X_val)
X_test_trans = ftf.transform(test_data["new_Description"])



In [33]:
print(X_test_trans.shape)

(54200, 40000)


In [34]:
print("Train tfidf shape: ", X_train_trans.shape)
print("Validation tfidf shape: ", X_val_trans.shape)
print("Test tfidf shape: ", X_test_trans.shape)

Train tfidf shape:  (43371, 40000)
Validation tfidf shape:  (10843, 40000)
Test tfidf shape:  (54200, 40000)


In [35]:
print(test_data["new_Description"].isnull().sum())
print("Sample Features: ", ftf.get_feature_names_out()[:20])

0
Sample Features:  ['_' '_ qv' '_ qv _the' '_the' 'aa' 'aamir' 'aarhus' 'aaron' 'aarti'
 'aatrayee' 'ab' 'aba' 'abandon' 'abandoned' 'abandoned baby'
 'abandoned building' 'abandoned child' 'abandoned children'
 'abandoned family' 'abandoned house']


In [36]:
classes = np.unique(y_train_split)
class_weights = compute_class_weight('balanced',classes = classes, y=y_train_split)
class_weights_dict = dict(zip(classes, class_weights))

LOGISTIC REGRESSION MODEL

In [37]:
model = LogisticRegression(
    max_iter= 15000,  
    random_state= 42,
    tol = 1e-2,
    intercept_scaling = 1,
    C=17.0,
    penalty  = 'l2',
    solver = 'liblinear',
    class_weight= class_weights_dict,
)

model.fit(X_train_trans, y_train_split)  
y_prediction = model.predict(X_val_trans) 

accuracy = accuracy_score(y_val, y_prediction)
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Accuracy: 0.6014


In [38]:
print("Classification Report: ", classification_report(y_val,y_prediction))

Classification Report:                precision    recall  f1-score   support

           0       0.41      0.43      0.42       263
           1       0.66      0.60      0.63       112
           2       0.33      0.29      0.31       139
           3       0.34      0.20      0.25       104
           4       0.12      0.02      0.03        61
           5       0.57      0.63      0.60      1443
           6       0.25      0.17      0.20       107
           7       0.75      0.79      0.77      2659
           8       0.65      0.65      0.65      2697
           9       0.31      0.26      0.28       150
          10       0.27      0.09      0.14        74
          11       0.82      0.70      0.76        40
          12       0.60      0.07      0.12        45
          13       0.62      0.71      0.66       431
          14       0.53      0.68      0.60       144
          15       0.22      0.10      0.14        50
          16       0.27      0.11      0.15        56
   

TEST SET EVALUATION

In [39]:
print(test_data.columns)

Index(['ID', 'Title', 'Description', 'new_Description'], dtype='object')


TRANSFORMING TEST DATA

In [40]:
X_test_tfidf  = ftf.transform(test_data['Description'])
y_test_pred = model.predict(X_test_trans)
y_test_pred_labels = encoder.inverse_transform(y_test_pred)
print(y_test_pred_labels[:20])


['comedy' 'drama' 'documentary' 'drama' 'drama' 'adult' 'drama' 'comedy'
 'documentary' 'drama' 'drama' 'comedy' 'drama' 'documentary' 'short'
 'sport' 'comedy' 'western' 'documentary' 'drama']


In [41]:
print(test_data.columns)

Index(['ID', 'Title', 'Description', 'new_Description'], dtype='object')


In [42]:
submission_df = pd.DataFrame({
    'ID': test_data['ID'],
    'Predicted_Genre': y_test_pred_labels
})

submission_df.to_csv("movie_genre_predictions.csv", index=False)
print("Predictions saved successfully!")

Predictions saved successfully!


In [43]:
df = pd.read_csv("movie_genre_predictions.csv")
print(df.head())

   ID Predicted_Genre
0   1          comedy
1   2           drama
2   3     documentary
3   4           drama
4   5           drama
