In [1]:
import pandas as pd
import re
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv('train_data.txt', sep=':::', engine='python', header=None, 
                    names=['id', 'title', 'genre', 'description'])

test = pd.read_csv('test_data_solution.txt', sep=':::', engine='python', header=None, 
                   names=['id', 'title', 'genre', 'description'])

In [3]:
print("Train Data:")
print(train.head())

Train Data:
   id                               title       genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         description  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  


In [4]:
print("Test Data:")
print(test.head())

Test Data:
   id                          title          genre  \
0   1          Edgar's Lunch (1998)       thriller    
1   2      La guerra de papá (1977)         comedy    
2   3   Off the Beaten Track (2010)    documentary    
3   4        Meu Amigo Hindu (2015)          drama    
4   5             Er nu zhai (1955)          drama    

                                         description  
0   L.R. Brane loves his life - his car, his apar...  
1   Spain, March 1964: Quico is a very naughty ch...  
2   One year in the life of Albin and his family ...  
3   His father has died, he hasn't spoken with hi...  
4   Before he was known internationally as a mart...  


In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    return text


In [6]:
train['clean_description'] = train['description'].apply(clean_text)
test['clean_description'] = test['description'].apply(clean_text)

In [7]:
le = LabelEncoder()
train['genre_encoded'] = le.fit_transform(train['genre'])
test['genre_encoded'] = le.transform(test['genre'])  # same encoding

In [8]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('clf', LogisticRegression(solver='liblinear', max_iter=1000))
])


In [9]:
X_train = train['clean_description']
y_train = train['genre_encoded']
pipeline.fit(X_train, y_train)

In [10]:
X_test = test['clean_description']
y_test = test['genre_encoded']
y_pred = pipeline.predict(X_test)


In [11]:
print("✅ Accuracy Score:", accuracy_score(y_test, y_pred))
print("\n✅ Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=le.classes_))


✅ Accuracy Score: 0.5796494464944649

✅ Classification Report:

               precision    recall  f1-score   support

      action        0.50      0.25      0.33      1314
       adult        0.60      0.19      0.29       590
   adventure        0.62      0.15      0.24       775
   animation        0.54      0.04      0.08       498
   biography        0.00      0.00      0.00       264
      comedy        0.52      0.57      0.55      7446
       crime        0.41      0.03      0.05       505
 documentary        0.66      0.86      0.75     13096
       drama        0.54      0.78      0.64     13612
      family        0.53      0.06      0.11       783
     fantasy        0.58      0.05      0.09       322
   game-show        0.91      0.50      0.64       193
     history        0.00      0.00      0.00       243
      horror        0.65      0.56      0.60      2204
       music        0.69      0.41      0.51       731
     musical        0.25      0.01      0.03       276


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
def predict_genre(text):
    cleaned = clean_text(text)
    pred = pipeline.predict([cleaned])
    return le.inverse_transform(pred)[0]

In [13]:
sample_plot = "A young girl discovers magical creatures and must protect her world from evil forces."
print("\n Predicted Genre for Sample Plot:\n", predict_genre(sample_plot))


 Predicted Genre for Sample Plot:
  fantasy 


In [None]:
user_input = input("\nEnter a movie plot summary: ")
predicted_genre = predict_genre(user_input)

print("Predicted Genre:", predicted_genre)