In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Function to parse the train data
def parse_train_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            if len(parts) == 4:
                data.append({
                    'ID': parts[0],
                    'TITLE': parts[1],
                    'GENRE': parts[2],
                    'DESCRIPTION': parts[3]
                })
    return pd.DataFrame(data)

# Function to parse the test data
def parse_test_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            if len(parts) == 3:
                data.append({
                    'ID': parts[0],
                    'TITLE': parts[1],
                    'DESCRIPTION': parts[2]
                })
    return pd.DataFrame(data)

# Function to parse the test solution data correctly
def parse_test_solution(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            if len(parts) == 4:
                data.append({
                    'ID': parts[0],
                    'GENRE': parts[2]
                })
    return pd.DataFrame(data)

# File paths
train_file_path = 'C:\\Users\\ADMIN\\Desktop\\CODSOFT\\CODSOFT\\MOVIE GENRE CLAASIFICATION\\train_data.txt'
test_file_path = 'C:\\Users\\ADMIN\\Desktop\\CODSOFT\\CODSOFT\\MOVIE GENRE CLAASIFICATION\\test_data.txt'
solution_file_path = 'C:\\Users\\ADMIN\\Desktop\\CODSOFT\\CODSOFT\\MOVIE GENRE CLAASIFICATION\\test_data_solution.txt'

# Parse data
train_df = parse_train_data(train_file_path)
test_df = parse_test_data(test_file_path)
solution_df = parse_test_solution(solution_file_path)

# Debugging
print("Columns in solution_df:", solution_df.columns)
print("First few rows of solution_df:\n", solution_df.head())

# Preprocess using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(train_df['DESCRIPTION'])
X_test = vectorizer.transform(test_df['DESCRIPTION'])
y_train = train_df['GENRE']
y_test = solution_df['GENRE']

# Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict genres for the test data
y_pred = model.predict(X_test)

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the model and vectorizer for future use
joblib.dump(model, 'genre_classifier_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


Columns in solution_df: Index(['ID', 'GENRE'], dtype='object')
First few rows of solution_df:
   ID        GENRE
0  1     thriller
1  2       comedy
2  3  documentary
3  4        drama
4  5        drama
Accuracy: 0.5838745387453874


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
               precision    recall  f1-score   support

      action       0.48      0.29      0.36      1314
       adult       0.60      0.24      0.34       590
   adventure       0.58      0.17      0.26       775
   animation       0.54      0.07      0.12       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.58      0.55      7446
       crime       0.38      0.05      0.08       505
 documentary       0.67      0.85      0.75     13096
       drama       0.54      0.77      0.64     13612
      family       0.50      0.09      0.16       783
     fantasy       0.55      0.05      0.10       322
   game-show       0.90      0.51      0.65       193
     history       0.00      0.00      0.00       243
      horror       0.64      0.57      0.61      2204
       music       0.68      0.45      0.54       731
     musical       0.33      0.02      0.04       276
     mystery       0.36      0.02      0.03       318
  

['tfidf_vectorizer.pkl']