# 🎬 Movie Genre Prediction Using Machine Learning  
This notebook loads text-based movie plot summaries, trains a classifier using TF-IDF + Logistic Regression, and predicts genres for test data.


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [2]:
def load_train_data(path):
    """Parse the labelled training file into a DataFrame.

    Each usable line has the form ``id ::: title ::: genre ::: description``.
    Lines that do not provide all four fields (blank or incomplete lines)
    are skipped.

    Args:
        path: Location of the UTF-8 encoded training text file.

    Returns:
        pandas.DataFrame with the string columns ``id``, ``title``,
        ``genre`` and ``description``, one row per parsed line, in file order.
    """
    columns = ["id", "title", "genre", "description"]
    rows = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Split at most three times so a description that itself contains
            # the " ::: " separator stays intact instead of being dropped.
            parts = line.strip().split(" ::: ", len(columns) - 1)
            if len(parts) != len(columns):
                continue
            rows.append(parts)

    return pd.DataFrame(rows, columns=columns)


In [3]:
def load_test_data(path):
    """Parse the unlabelled test file into a DataFrame.

    Each usable line has the form ``id ::: title ::: description``.
    Lines that do not provide all three fields (blank or incomplete lines)
    are skipped.

    Args:
        path: Location of the UTF-8 encoded test text file.

    Returns:
        pandas.DataFrame with the string columns ``id``, ``title`` and
        ``description``, one row per parsed line, in file order.
    """
    columns = ["id", "title", "description"]
    rows = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Split at most twice so a description that itself contains the
            # " ::: " separator stays intact; otherwise the row would be
            # dropped and no prediction would be produced for it.
            parts = line.strip().split(" ::: ", len(columns) - 1)
            if len(parts) != len(columns):
                continue
            rows.append(parts)

    return pd.DataFrame(rows, columns=columns)


In [9]:
# Parse the labelled training split from the local dataset folder.
train_df = load_train_data(
    path="/Users/kshitijmudey/Downloads/GenreClassificationDataset/train_data.txt"
)


In [10]:
# Model input is the plot description; the target label is the genre.
X, y = train_df["description"], train_df["genre"]


In [11]:
# Two-stage text classifier: TF-IDF features (English stop words removed)
# feeding a logistic-regression model capped at 3000 solver iterations.
model = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("clf", LogisticRegression(max_iter=3000)),
    ]
)


In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import os

# ------------------------------------------------------
# SET YOUR FOLDER PATH
# ------------------------------------------------------
# Folder holding the dataset files; every other path is derived from it.
base_path = "/Users/kshitijmudey/Downloads/GenreClassificationDataset"

# Training input, test input and prediction output, in that order.
train_path, test_path, output_path = (
    os.path.join(base_path, file_name)
    for file_name in ("train_data.txt", "test_data.txt", "test_data_solution.txt")
)


# ------------------------------------------------------
# FUNCTION TO LOAD TRAIN DATA
# ------------------------------------------------------
def load_train_data(path):
    """Parse the labelled training file into a DataFrame.

    Each usable line has the form ``id ::: title ::: genre ::: description``.
    Lines that do not provide all four fields (blank or incomplete lines)
    are skipped.

    Args:
        path: Location of the UTF-8 encoded training text file.

    Returns:
        pandas.DataFrame with the string columns ``id``, ``title``,
        ``genre`` and ``description``, one row per parsed line, in file order.
    """
    columns = ["id", "title", "genre", "description"]
    rows = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Split at most three times so a description that itself contains
            # the " ::: " separator stays intact instead of being dropped.
            parts = line.strip().split(" ::: ", len(columns) - 1)
            if len(parts) != len(columns):
                continue
            rows.append(parts)

    return pd.DataFrame(rows, columns=columns)


# ------------------------------------------------------
# FUNCTION TO LOAD TEST DATA
# ------------------------------------------------------
def load_test_data(path):
    """Parse the unlabelled test file into a DataFrame.

    Each usable line has the form ``id ::: title ::: description``.
    Lines that do not provide all three fields (blank or incomplete lines)
    are skipped.

    Args:
        path: Location of the UTF-8 encoded test text file.

    Returns:
        pandas.DataFrame with the string columns ``id``, ``title`` and
        ``description``, one row per parsed line, in file order.
    """
    columns = ["id", "title", "description"]
    rows = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Split at most twice so a description that itself contains the
            # " ::: " separator stays intact; otherwise the row would be
            # dropped and no prediction would be produced for it.
            parts = line.strip().split(" ::: ", len(columns) - 1)
            if len(parts) != len(columns):
                continue
            rows.append(parts)

    return pd.DataFrame(rows, columns=columns)


# ------------------------------------------------------
# LOAD TRAIN DATA
# ------------------------------------------------------
print("Loading training data...")
train_df = load_train_data(train_path)
print(train_df.head())


# ------------------------------------------------------
# TRAIN MODEL (TF-IDF + LOGISTIC REGRESSION)
# ------------------------------------------------------
# Plot descriptions are the model input; genres are the target labels.
X, y = train_df["description"], train_df["genre"]

model = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("clf", LogisticRegression(max_iter=3000)),
    ]
)

print("Training model...")
model.fit(X, y)


# ------------------------------------------------------
# LOAD TEST DATA
# ------------------------------------------------------
print("Loading test data...")
test_df = load_test_data(test_path)
print(test_df.head())


# ------------------------------------------------------
# PREDICT GENRES FOR TEST DATA
# ------------------------------------------------------
print("Predicting test genres...")
predictions = model.predict(test_df["description"])


# ------------------------------------------------------
# SAVE OUTPUT FILE (test_data_solution.txt)
# ------------------------------------------------------
print(f"Saving predictions to: {output_path}")

# One "id ::: title ::: genre" line per test row, in the order the rows
# were loaded.
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(
        f"{row_id} ::: {row_title} ::: {row_genre}\n"
        for row_id, row_title, row_genre in zip(
            test_df["id"], test_df["title"], predictions
        )
    )

print("DONE! Output file created.")


Loading training data...
  id                             title     genre  \
0  1      Oscar et la dame rose (2009)     drama   
1  2                      Cupid (1997)  thriller   
2  3  Young, Wild and Wonderful (1980)     adult   
3  4             The Secret Sin (1915)     drama   
4  5            The Unrecovered (2007)     drama   

                                         description  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
Training model...
Loading test data...
  id                        title  \
0  1         Edgar's Lunch (1998)   
1  2     La guerra de papá (1977)   
2  3  Off the Beaten Track (2010)   
3  4       Meu Amigo Hindu (2015)   
4  5            Er nu zhai (1955)   

                                         description  
0  L.R. Brane 

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import os

# ------------------------------------------------------
# CHANGE THIS PATH TO MATCH YOUR FOLDER
# ------------------------------------------------------
# Dataset folder. Defaults to the original local location, but can be
# redirected without editing the notebook by setting the GENRE_DATA_DIR
# environment variable before starting the kernel.
BASE_PATH = os.environ.get(
    "GENRE_DATA_DIR",
    "/Users/kshitijmudey/Downloads/GenreClassificationDataset",
)

# Training input, test input and prediction output, all inside BASE_PATH.
TRAIN_PATH = os.path.join(BASE_PATH, "train_data.txt")
TEST_PATH = os.path.join(BASE_PATH, "test_data.txt")
OUTPUT_PATH = os.path.join(BASE_PATH, "test_data_solution.txt")

# ------------------------------------------------------
# FUNCTION TO LOAD TRAIN DATA
# ------------------------------------------------------
def load_train_data(path):
    """Parse the labelled training file into a DataFrame.

    Each usable line has the form ``id ::: title ::: genre ::: description``.
    Lines that do not provide all four fields (blank or incomplete lines)
    are skipped.

    Args:
        path: Location of the UTF-8 encoded training text file.

    Returns:
        pandas.DataFrame with the string columns ``id``, ``title``,
        ``genre`` and ``description``, one row per parsed line, in file order.
    """
    columns = ["id", "title", "genre", "description"]
    rows = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Split at most three times so a description that itself contains
            # the " ::: " separator stays intact instead of being dropped.
            parts = line.strip().split(" ::: ", len(columns) - 1)
            if len(parts) != len(columns):
                continue
            rows.append(parts)

    return pd.DataFrame(rows, columns=columns)

# ------------------------------------------------------
# FUNCTION TO LOAD TEST DATA
# ------------------------------------------------------
def load_test_data(path):
    """Parse the unlabelled test file into a DataFrame.

    Each usable line has the form ``id ::: title ::: description``.
    Lines that do not provide all three fields (blank or incomplete lines)
    are skipped.

    Args:
        path: Location of the UTF-8 encoded test text file.

    Returns:
        pandas.DataFrame with the string columns ``id``, ``title`` and
        ``description``, one row per parsed line, in file order.
    """
    columns = ["id", "title", "description"]
    rows = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Split at most twice so a description that itself contains the
            # " ::: " separator stays intact; otherwise the row would be
            # dropped and no prediction would be produced for it.
            parts = line.strip().split(" ::: ", len(columns) - 1)
            if len(parts) != len(columns):
                continue
            rows.append(parts)

    return pd.DataFrame(rows, columns=columns)


# ------------------------------------------------------
# LOAD TRAIN DATA
# ------------------------------------------------------
print("Loading training data...")
train_df = load_train_data(TRAIN_PATH)
# The loader skips lines it cannot parse, so an empty frame means nothing
# usable was read; stop with a clear message instead of handing empty input
# to the model.
if train_df.empty:
    raise ValueError(f"No training rows could be parsed from {TRAIN_PATH}")
print(train_df.head())

# ------------------------------------------------------
# TRAIN MODEL
# ------------------------------------------------------
print("Training model...")

# Plot descriptions are the model input; genres are the target labels.
X = train_df["description"]
y = train_df["genre"]

# TF-IDF features (English stop words removed) feeding logistic regression.
model = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", LogisticRegression(max_iter=3000))
])

model.fit(X, y)

# ------------------------------------------------------
# LOAD TEST DATA
# ------------------------------------------------------
print("Loading test data...")
test_df = load_test_data(TEST_PATH)
# Same reasoning as for the training set: no parsed rows means there is
# nothing to predict.
if test_df.empty:
    raise ValueError(f"No test rows could be parsed from {TEST_PATH}")

# ------------------------------------------------------
# PREDICT GENRES
# ------------------------------------------------------
print("Predicting test genres...")
predictions = model.predict(test_df["description"])

# ------------------------------------------------------
# SAVE OUTPUT
# ------------------------------------------------------
print("Saving predictions to:", OUTPUT_PATH)

# One "id ::: title ::: genre" line per test row, in the order the rows
# were loaded.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for id_, title, pred in zip(test_df["id"], test_df["title"], predictions):
        f.write(f"{id_} ::: {title} ::: {pred}\n")

print("\nDone! 🎉")
# Report the actual output file name so the message stays correct if
# OUTPUT_PATH is changed.
print("File saved as", os.path.basename(OUTPUT_PATH))


Loading training data...
  id                             title     genre  \
0  1      Oscar et la dame rose (2009)     drama   
1  2                      Cupid (1997)  thriller   
2  3  Young, Wild and Wonderful (1980)     adult   
3  4             The Secret Sin (1915)     drama   
4  5            The Unrecovered (2007)     drama   

                                         description  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
Training model...
Loading test data...
Predicting test genres...
Saving predictions to: /Users/kshitijmudey/Downloads/GenreClassificationDataset/test_data_solution.txt

Done! 🎉
File saved as test_data_solution.txt
