In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
import re

# Raw training file: one record per line, fields separated by ":::" in the order
# ID ::: TITLE ::: GENRE ::: DESCRIPTION.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
file_path = '/home/hariom/Downloads/dataset/Genre Classification Dataset/train_data.txt'

data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip the line before splitting: lines read from a file keep their trailing
        # newline, which would otherwise be stored at the end of every DESCRIPTION.
        parts = re.split(r'\s*:::\s*', line.strip(), maxsplit=3)
        if len(parts) == 4:
            data.append(parts)
        else:
            # Report malformed records instead of silently dropping them.
            print(f"Skipped line due to unexpected format: {line}")


In [3]:
# Turn the parsed records (one 4-item list per line) into a labelled table.
train_data = pd.DataFrame(
    data=data,
    columns=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
)

In [4]:
# Preview the first rows to check that the four fields landed in the expected columns.
train_data.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [5]:
# Exclude the record whose ID is '3' — the row that sits at position 2 of the freshly
# loaded frame. Selecting it by ID rather than by position keeps this cell idempotent:
# a positional drop(2) followed by reset_index removes a *different* row on every re-run.
# NOTE(review): the reason this particular record is excluded is not visible here — confirm.
train_data = train_data[train_data['ID'].str.strip() != '3'].reset_index(drop=True)
train_data.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
3,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...
4,6,Quality Control (2011),documentary,Quality Control consists of a series of 16mm s...


In [6]:
# (rows, columns) of the training table after the record removal above.
train_data.shape

(54213, 4)

In [7]:
# Number of records per genre, most frequent first — shows how imbalanced the classes are.
train_data['GENRE'].value_counts()

GENRE
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family           784
adventure        775
music            731
romance          672
sci-fi           647
adult            589
crime            505
animation        498
sport            432
talk-show        391
fantasy          323
mystery          319
musical          277
biography        265
history          243
game-show        194
news             181
war              132
Name: count, dtype: int64

In [8]:
# Learn the genre vocabulary first, then map every row's genre name to its integer code.
label_encode = LabelEncoder().fit(train_data['GENRE'])
labels = label_encode.transform(train_data['GENRE'])

In [9]:
# Swap the genre names for their integer codes, then show the per-code record counts.
train_data = train_data.assign(GENRE=labels)
train_data['GENRE'].value_counts()

GENRE
8     13613
7     13096
5      7447
21     5073
13     2204
24     1591
0      1315
26     1032
18      884
9       784
2       775
14      731
19      672
20      647
1       589
6       505
3       498
22      432
23      391
10      323
16      319
15      277
4       265
12      243
11      194
17      181
25      132
Name: count, dtype: int64

In [10]:
# Balance the classes by undersampling every genre down to the size of the rarest one.
# The target size is derived from the data instead of being hard-coded (it was 132).
genre_sizes = train_data['GENRE'].value_counts()
min_count = int(genre_sizes.min()) if not genre_sizes.empty else 0

# Sample each genre independently (same seed per genre, genres in order of first
# appearance) and concatenate once at the end. Growing a frame with pd.concat inside
# the loop is quadratic, and starting from an empty object-dtype frame can leave the
# GENRE column as object instead of integer.
undersampled_parts = [
    train_data[train_data['GENRE'] == genre].sample(n=min_count, random_state=42, replace=False)
    for genre in train_data['GENRE'].unique()
]
# pd.concat refuses an empty list, so fall back to an empty frame with the same columns.
new_data = pd.concat(undersampled_parts) if undersampled_parts else train_data.iloc[0:0]

# Shuffle the rows so the genres are interleaved, and renumber the index from 0.
new_data = new_data.sample(frac=1, random_state=42).reset_index(drop=True)
print(new_data['GENRE'].value_counts())

GENRE
9     132
24    132
2     132
10    132
17    132
18    132
13    132
19    132
4     132
15    132
1     132
3     132
12    132
26    132
6     132
0     132
7     132
21    132
22    132
8     132
14    132
23    132
25    132
5     132
20    132
11    132
16    132
Name: count, dtype: int64


In [11]:
# (rows, columns) of the balanced, undersampled table.
new_data.shape

(3564, 4)

In [12]:
# Preview the shuffled, balanced table; GENRE now holds the integer codes.
new_data.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,40266,Princess Ha Cha and the Meadow of Bloom (2009),9,This is a story about a five (5) year old litt...
1,21900,Das Tattoo - Tödliche Zeichen (2000),24,Three divers find a centuries-old completely t...
2,22129,Los Buscadores (2017),2,"Los Buscadores follows Manu (Tomás Arredondo),..."
3,6595,"Hol volt, hol nem volt (1987)",10,"Shot in B&W, Gyula Gazdag's film follows the s..."
4,18764,"""Rough Cut LA"" (2005)",17,Rough Cut LA jumps right in the middle of L.A....


In [13]:
# Features: the free-text descriptions. Target: the genre codes, forced to a numeric dtype
# (values that cannot be parsed as numbers become NaN because of errors='coerce').
X = new_data['DESCRIPTION']
Y = pd.to_numeric(new_data['GENRE'], errors='coerce')
print(X)

0       This is a story about a five (5) year old litt...
1       Three divers find a centuries-old completely t...
2       Los Buscadores follows Manu (Tomás Arredondo),...
3       Shot in B&W, Gyula Gazdag's film follows the s...
4       Rough Cut LA jumps right in the middle of L.A....
                              ...                        
3559    A poor amaZulu boy, sold into slavery by his m...
3560    Two kids have a sleepover and wake up over a g...
3561    Suddenly appearing in Florence, an evil seduct...
3562    During the Korean War, a glory-hunting sergean...
3563    Jonathan Ross narrates the global talent show ...
Name: DESCRIPTION, Length: 3564, dtype: object


In [14]:
# Show the target series to confirm the genre codes ended up with a numeric dtype.
print(Y)

0        9
1       24
2        2
3       10
4       17
        ..
3559     3
3560     0
3561    13
3562    25
3563    11
Name: GENRE, Length: 3564, dtype: int64


In [15]:
# Learn the TF-IDF vocabulary/weights from the descriptions and vectorise them in one pass.
# The text is read from new_data directly rather than from X because this cell rebinds X
# to the resulting sparse matrix: with fit(X)/transform(X), re-running the cell would try
# to fit the vectorizer on that matrix instead of on text.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split, so the
# held-out descriptions influence the IDF weights — consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(new_data['DESCRIPTION'])
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 242600 stored elements and shape (3564, 28799)>
  Coords	Values
  (0, 540)	0.058138149735313574
  (0, 1386)	0.044372578559090825
  (0, 2668)	0.05221348496290582
  (0, 2721)	0.07949242669298925
  (0, 2722)	0.08425390790194229
  (0, 3403)	0.09892767049039315
  (0, 3823)	0.1105490470758384
  (0, 3953)	0.15445594803721058
  (0, 3973)	0.1574451664473008
  (0, 4121)	0.06281911719423071
  (0, 4550)	0.3297587815130163
  (0, 6172)	0.15073566030042013
  (0, 6630)	0.07111555762481955
  (0, 9674)	0.13080110327822606
  (0, 9785)	0.08877449781220105
  (0, 10275)	0.07563449870555225
  (0, 10525)	0.1261684871250346
  (0, 10788)	0.07529440600307906
  (0, 11375)	0.3180184244750787
  (0, 11898)	0.09521175922502942
  (0, 11925)	0.21180219102801143
  (0, 11941)	0.09374000006931797
  (0, 12595)	0.3463058853872547
  (0, 12616)	0.15900921223753936
  (0, 12636)	0.16487939075650815
  :	:
  (3563, 13841)	0.16907424167507235
  (3563, 14726)	0.158576097

In [16]:
# Hold out 20% of the rows for testing; stratifying on Y keeps every genre equally
# represented in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [17]:
# Shapes of the four splits, in the order: train features, test features, train labels, test labels.
print(*(split.shape for split in (X_train, X_test, Y_train, Y_test)))

(2851, 28799) (713, 28799) (2851,) (713,)


In [18]:
# Sanity check on the training labels before fitting the classifier.
print(Y_train.unique())  # Check unique values in Y_train
print(Y_train.dtype)     # Check the data type of Y_train

[ 4  6  9  5 15 11  2  8 14 21  7 23  0 18 19 17 10 24  3 12 16 20 26 13
 25  1 22]
int64


In [19]:
# Train a logistic-regression classifier with default settings on the TF-IDF features.
model = LogisticRegression()
model.fit(X_train, Y_train)

In [20]:
# Predict on both splits: the training predictions are used to gauge fit, the test ones generalisation.
Y_train_pred, Y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

In [21]:
# Fraction of correctly predicted labels on each split.
train_accuracy, test_accuracy = (
    accuracy_score(actual, predicted)
    for actual, predicted in ((Y_train, Y_train_pred), (Y_test, Y_test_pred))
)

In [22]:
# Report both accuracies to two decimals; a large gap between them indicates overfitting.
print(f'Training Accuracy: {train_accuracy:.2f}')
print(f'Test Accuracy: {test_accuracy:.2f}')

Training Accuracy: 0.97
Test Accuracy: 0.40


In [23]:
def predict_genre(user_input):
    """Predict the encoded genre label for a single piece of text.

    The text is wrapped in a one-element list, vectorised with the notebook-level
    ``vectorizer`` and classified with the notebook-level ``model``.

    Parameters
    ----------
    user_input : str
        The text to classify.

    Returns
    -------
    Whatever ``model.predict`` returns for the one-row input.
    """
    return model.predict(vectorizer.transform([user_input]))

In [24]:
# Example text that is classified when the user just presses Enter at the prompt.
sample_text = "Sunshine Quiz Wkly Q! Win a top Sony DVD player if u know which country the Algarve is in? Txt ansr to 82277. �1.50 SP:Tyrone,,,"

# The argument of input() is the prompt shown to the user, not the value it returns.
# The sample text used to be passed as that prompt, so it was displayed but never
# classified; it is now the fallback value and the prompt asks for a description.
user_input = input("Enter a description to classify (leave blank to use the sample text): ").strip() or sample_text
movie_genre = predict_genre(user_input)
print(movie_genre)

[17]


In [25]:
# Message to print for each encoded genre label (codes 0-26).
genre_messages = {
    0: "Predicted Genre: Action",
    1: "Predicted Genre: Adult",
    2: "Predicted Genre: Adventure",
    3: "Predicted Genre: Animation",
    4: "Predicted Genre: Biography",
    5: "Predicted Genre: Comedy",
    6: "Predicted Genre: Crime",
    7: "Predicted Genre: Documentary",
    8: "Predicted Genre: Drama",
    9: "Predicted Genre: Family",
    10: "Predicted Genre: Fantasy",
    11: "Predicted Genre: Game Show",
    12: "Predicted Genre: History",
    13: "Predicted Genre: Horror",
    14: "Predicted Genre: Music",
    15: "Predicted Genre: Musical",
    16: "Predicted Genre: Mystery",
    17: "Predicted Genre: News",
    18: "Predicted Genre: Reality TV",
    19: "Predicted Genre: Romance",
    20: "Predicted Genre: Sci-Fi",
    21: "Predicted Genre: Short",
    22: "Predicted Genre: Sport",
    23: "Predicted Genre: Talk Show",
    24: "Predicted Genre: Thriller",
    25: "Predicted Genre: War",
    26: "Predicted Genre: Western",
}

# movie_genre is the result of predict_genre; .item() unwraps a single-element array
# (or scalar) into a plain Python number and raises ValueError for multi-element arrays.
predicted_label = np.asarray(movie_genre).item()
print(genre_messages.get(predicted_label, "Unknown genre label"))

Predicted Genre: News
