In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [5]:
data1 = pd.read_csv('description.txt', delimiter='\t')  # if tab-separated
data2 = pd.read_csv('train_data.txt', delimiter='\t')
data3 = pd.read_csv('test_data.txt', delimiter='\t')
data4 = pd.read_csv('test_data_solution.txt', delimiter='\t')




In [6]:
print(data1.head())
print(data2.head())
print(data3.head())
print(data4.head())

                              Train data:
0  ID ::: TITLE ::: GENRE ::: DESCRIPTION
1  ID ::: TITLE ::: GENRE ::: DESCRIPTION
2  ID ::: TITLE ::: GENRE ::: DESCRIPTION
3  ID ::: TITLE ::: GENRE ::: DESCRIPTION
4                              Test data:
  1 ::: Oscar et la dame rose (2009) ::: drama ::: Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.
0  2 ::: Cupid (1997) ::: thriller ::: A brother ...                                                                                                   

In [7]:
data1_cleaned = data1['Train data:'].str.split(' ::: ', expand=True)
# Rename the columns
data1_cleaned.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
# Drop the rows with missing genre (test data has no genre)
data1_cleaned = data1_cleaned.dropna(subset=['GENRE'])
# Preview cleaned data
print(data1_cleaned.head())

   ID  TITLE        GENRE  DESCRIPTION
0  ID  TITLE        GENRE  DESCRIPTION
1  ID  TITLE        GENRE  DESCRIPTION
2  ID  TITLE        GENRE  DESCRIPTION
3  ID  TITLE        GENRE  DESCRIPTION
5  ID  TITLE  DESCRIPTION         None


In [8]:
data2_cleaned = data2.iloc[:, 0].str.split(' ::: ', expand=True)
data2_cleaned.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

In [9]:
data3_cleaned = data3.iloc[:, 0].str.split(' ::: ', expand=True)

# Assign column names as needed
data3_cleaned.columns = ['ID', 'TITLE', 'DESCRIPTION']  # Test data has no GENRE

# Display the cleaned DataFrame
print(data3_cleaned.head())

  ID                        TITLE  \
0  2     La guerra de papá (1977)   
1  3  Off the Beaten Track (2010)   
2  4       Meu Amigo Hindu (2015)   
3  5            Er nu zhai (1955)   
4  6           Riddle Room (2016)   

                                         DESCRIPTION  
0  Spain, March 1964: Quico is a very naughty chi...  
1  One year in the life of Albin and his family o...  
2  His father has died, he hasn't spoken with his...  
3  Before he was known internationally as a marti...  
4  Emily Burns is being held captive in a room wi...  


In [10]:
data4_cleaned = data4.iloc[:, 0].str.split(' ::: ', expand=True)

# Assign column names as needed
data4_cleaned.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']  # Include the genre for this dataset

# Display the cleaned DataFrame
print(data4_cleaned.head())

  ID                        TITLE        GENRE  \
0  2     La guerra de papá (1977)       comedy   
1  3  Off the Beaten Track (2010)  documentary   
2  4       Meu Amigo Hindu (2015)        drama   
3  5            Er nu zhai (1955)        drama   
4  6           Riddle Room (2016)       horror   

                                         DESCRIPTION  
0  Spain, March 1964: Quico is a very naughty chi...  
1  One year in the life of Albin and his family o...  
2  His father has died, he hasn't spoken with his...  
3  Before he was known internationally as a marti...  
4  Emily Burns is being held captive in a room wi...  


In [11]:
data_train = pd.concat([data1_cleaned, data2_cleaned], ignore_index=True)
data_test = pd.merge(data3_cleaned, data4_cleaned, on=['ID', 'TITLE'], how='left')

In [12]:
print(data_train.head())
print(data_test.head())

   ID  TITLE        GENRE  DESCRIPTION
0  ID  TITLE        GENRE  DESCRIPTION
1  ID  TITLE        GENRE  DESCRIPTION
2  ID  TITLE        GENRE  DESCRIPTION
3  ID  TITLE        GENRE  DESCRIPTION
4  ID  TITLE  DESCRIPTION         None
  ID                        TITLE  \
0  2     La guerra de papá (1977)   
1  3  Off the Beaten Track (2010)   
2  4       Meu Amigo Hindu (2015)   
3  5            Er nu zhai (1955)   
4  6           Riddle Room (2016)   

                                       DESCRIPTION_x        GENRE  \
0  Spain, March 1964: Quico is a very naughty chi...       comedy   
1  One year in the life of Albin and his family o...  documentary   
2  His father has died, he hasn't spoken with his...        drama   
3  Before he was known internationally as a marti...        drama   
4  Emily Burns is being held captive in a room wi...       horror   

                                       DESCRIPTION_y  
0  Spain, March 1964: Quico is a very naughty chi...  
1  One year in the

In [13]:
print(data_train.columns)
print(data_test.columns)

Index(['ID', 'TITLE', 'GENRE', 'DESCRIPTION'], dtype='object')
Index(['ID', 'TITLE', 'DESCRIPTION_x', 'GENRE', 'DESCRIPTION_y'], dtype='object')


In [14]:
data_test.rename(columns={'Description': 'DESCRIPTION'}, inplace=True)


In [15]:
print(data_test[['DESCRIPTION_x', 'DESCRIPTION_y']].head())

                                       DESCRIPTION_x  \
0  Spain, March 1964: Quico is a very naughty chi...   
1  One year in the life of Albin and his family o...   
2  His father has died, he hasn't spoken with his...   
3  Before he was known internationally as a marti...   
4  Emily Burns is being held captive in a room wi...   

                                       DESCRIPTION_y  
0  Spain, March 1964: Quico is a very naughty chi...  
1  One year in the life of Albin and his family o...  
2  His father has died, he hasn't spoken with his...  
3  Before he was known internationally as a marti...  
4  Emily Burns is being held captive in a room wi...  


In [16]:
X_train = data_train['DESCRIPTION']
y_train = data_train['GENRE']

X_test = data_test['DESCRIPTION_x']  # Use the correct column
y_test = data_test['GENRE']



In [17]:
print(X_train.isnull().sum())
print(X_test.isnull().sum())

4
0


In [18]:
X_train.fillna('', inplace=True)
X_test.fillna('', inplace=True)

In [19]:
X_train.dropna(inplace=True)
X_test.dropna(inplace=True)

In [20]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [21]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_nb = nb_model.predict(X_test_tfidf)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.5239026550305356


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes Classification Report:
               precision    recall  f1-score   support

      action       0.55      0.11      0.18      1314
       adult       0.51      0.06      0.11       590
   adventure       0.81      0.07      0.13       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.51      0.42      0.46      7446
       crime       0.00      0.00      0.00       505
 documentary       0.57      0.87      0.69     13096
       drama       0.46      0.82      0.59     13612
      family       0.50      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.98      0.32      0.48       193
     history       0.00      0.00      0.00       243
      horror       0.69      0.36      0.47      2204
       music       0.74      0.15      0.25       731
     musical       0.00      0.00      0.00       276
     mystery       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_lr = lr_model.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.5837561578626912


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression Classification Report:
               precision    recall  f1-score   support

      action       0.48      0.29      0.37      1314
       adult       0.60      0.24      0.34       590
   adventure       0.59      0.17      0.26       775
   animation       0.52      0.06      0.11       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.58      0.55      7446
       crime       0.36      0.04      0.07       505
 documentary       0.67      0.85      0.75     13096
       drama       0.54      0.77      0.64     13612
      family       0.50      0.09      0.16       783
     fantasy       0.55      0.05      0.09       322
   game-show       0.88      0.51      0.65       193
     history       0.00      0.00      0.00       243
      horror       0.64      0.57      0.61      2204
       music       0.67      0.45      0.54       731
     musical       0.32      0.02      0.04       276
     mystery       0.31      0.01    

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_svm = svm_model.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))

In [None]:
def generate_word_cloud(data, genre):
    text = ' '.join(data[data['GENRE'] == genre]['DESCRIPTION'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for Genre: {genre}')
    plt.show()

# Get unique genres
unique_genres = data_train['GENRE'].unique()

# Generate word clouds for each genre
for genre in unique_genres:
    generate_word_cloud(data_train, genre)