In [5]:
    import pandas as pd
    import re
    import pickle 
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report
    
    # Print the dataset description so the column layout used below can be
    # checked against it (documentation only; nothing from it is parsed).
    # The encoding is given explicitly so the read does not depend on the
    # platform's default locale codec.
    with open('Dataset/description.txt', 'r', encoding='utf-8') as f:
        print("Dataset Description:\n", f.read())

    # Every data file uses ' ::: ' between fields. A separator longer than one
    # character is only supported by pandas' Python parsing engine.

    # Load Train Data: ID ::: TITLE ::: GENRE ::: DESCRIPTION
    train_df = pd.read_csv(
        'Dataset/train_data.txt',
        sep=' ::: ',
        engine='python',
        names=['ID', 'Title', 'Genre', 'Description'],
        encoding='utf-8',
    )

    # Load Test Data: ID ::: TITLE ::: DESCRIPTION (no labels)
    test_df = pd.read_csv(
        'Dataset/test_data.txt',
        sep=' ::: ',
        engine='python',
        names=['ID', 'Title', 'Description'],
        encoding='utf-8',
    )

    # Load Test Solution (has labels - used only to validate predictions)
    test_sol_df = pd.read_csv(
        'Dataset/test_data_solution.txt',
        sep=' ::: ',
        engine='python',
        names=['ID', 'Title', 'Genre', 'Description'],
        encoding='utf-8',
    )

    print(f"\nTraining Samples: {train_df.shape[0]}")
    print(f"Test Samples (to predict): {test_df.shape[0]}")
    
    # Data Preprocessing
    def clean_text(text):
        """Normalize a description before vectorization.

        Lowercases ``text`` and removes every character that is neither a word
        character (letter, digit, underscore) nor whitespace. Punctuation is
        deleted, not replaced by a space, so ``"well-known"`` becomes
        ``"wellknown"``.

        Args:
            text: Raw description. Non-string values (such as ``None`` or a
                float ``NaN`` standing in for a missing cell) are treated as
                empty text.

        Returns:
            The cleaned string, or ``''`` when ``text`` is not a string.
        """
        if not isinstance(text, str):
            # Calling .lower() on None/NaN would raise AttributeError and abort
            # the whole column transformation.
            return ''
        return re.sub(r'[^\w\s]', '', text.lower())
    
    # Store a normalized copy of every description next to the raw text.
    train_df['cleaned_desc'] = train_df['Description'].map(clean_text)
    test_df['cleaned_desc'] = test_df['Description'].map(clean_text)

    # TF-IDF features: English stop words are dropped and the vocabulary is
    # capped at 5000 terms.
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

    # Labels for the training rows
    y_train = train_df['Genre']

    # Learn vocabulary and IDF weights from the training text only ...
    X_train = tfidf.fit_transform(train_df['cleaned_desc'])

    # ... then project the test text onto that same fitted vocabulary.
    X_test = tfidf.transform(test_df['cleaned_desc'])
    
    # Fit the genre classifier on the TF-IDF training matrix
    print("\nTraining Logistic Regression model...")
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Label every row of the unlabeled test set
    print("Generating predictions for test_data.txt...")
    predicted_genres = model.predict(X_test)
    test_df['Predicted_Genre'] = predicted_genres

    # Show the first five titles with their predicted genre
    print("\nSample Predictions:")
    print(test_df.loc[:, ['Title', 'Predicted_Genre']].head())
    
    # Attach the true genre from the solution file to each prediction by ID.
    # 'ID' is the only column the two frames share, so no suffixes are needed.
    merged_df = test_df.merge(test_sol_df[['ID', 'Genre']], on='ID', how='inner')

    # An inner join drops IDs missing from either side and multiplies rows for
    # duplicated IDs. Report any mismatch so the metrics below are not mistaken
    # for results over the full set of predictions.
    if len(merged_df) != len(test_df):
        print(
            f"\nWarning: evaluating {len(merged_df)} merged rows, "
            f"but {len(test_df)} predictions were made."
        )

    accuracy = accuracy_score(merged_df['Genre'], merged_df['Predicted_Genre'])
    print(f"\nFinal Accuracy on Test Data: {accuracy:.2f}")

    # zero_division=0 reports 0 instead of warning for genres with no
    # predicted or no true samples.
    print("\nClassification Report:\n")
    print(classification_report(merged_df['Genre'], merged_df['Predicted_Genre'], zero_division=0))
    
    # Persist the trained classifier. The success message is printed only after
    # the file has been closed, i.e. once the pickle is fully written.
    with open('predictgenre.pkl', "wb") as f:
        pickle.dump(model, f)
    print("Model saved to predictgenre.pkl")

    # The classifier only accepts vectors produced by the fitted TF-IDF
    # vectorizer, so save that as well; without it the pickled model cannot be
    # applied to new descriptions. It goes to a separate file so the format of
    # predictgenre.pkl is unchanged for existing consumers.
    with open('tfidf_vectorizer.pkl', "wb") as f:
        pickle.dump(tfidf, f)
    print("Vectorizer saved to tfidf_vectorizer.pkl")

Dataset Description:
 Train data:
ID ::: TITLE ::: GENRE ::: DESCRIPTION
ID ::: TITLE ::: GENRE ::: DESCRIPTION
ID ::: TITLE ::: GENRE ::: DESCRIPTION
ID ::: TITLE ::: GENRE ::: DESCRIPTION

Test data:
ID ::: TITLE ::: DESCRIPTION
ID ::: TITLE ::: DESCRIPTION
ID ::: TITLE ::: DESCRIPTION
ID ::: TITLE ::: DESCRIPTION

Source:
ftp://ftp.fu-berlin.de/pub/misc/movies/database/

Training Samples: 54214
Test Samples (to predict): 54200

Training Logistic Regression model...
Generating predictions for test_data.txt...

Sample Predictions:
                         Title Predicted_Genre
0         Edgar's Lunch (1998)           short
1     La guerra de papá (1977)           drama
2  Off the Beaten Track (2010)     documentary
3       Meu Amigo Hindu (2015)           drama
4            Er nu zhai (1955)           drama

Final Accuracy on Test Data: 0.58

Classification Report:

              precision    recall  f1-score   support

      action       0.48      0.28      0.36      1314
       adu