In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import joblib
import kaggle

# Download the NLTK resources the preprocessing step relies on
# (stop-word list, WordNet for the lemmatizer, Punkt for word_tokenize).
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)

# Change to the project folder FIRST, so every relative path used by the
# notebook (data/, figures/, models/) resolves inside the project.
# Previously the folders were created before the chdir and therefore ended
# up in whatever directory the kernel happened to start in.
PROJECT_DIR = r"C:\Users\YASH\AI,ML book\CodSoft\Task-1"
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    # Machine-specific path is missing: keep the current directory instead of crashing.
    print(f"Project folder not found ({PROJECT_DIR}); staying in the current directory.")
print("Now working in:", os.getcwd())

# Set up working directories ('models' is needed later when the best model is saved)
for _directory in ('data/raw', 'figures', 'models'):
    os.makedirs(_directory, exist_ok=True)


Now working in: C:\Users\YASH\AI,ML book\CodSoft\Task-1


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\YASH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\YASH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\YASH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Download dataset using Kaggle API.
# Disabled by default: set DOWNLOAD_DATASET = True to fetch the files into data/raw/.
# (A real flag replaces the former triple-quoted string, which was echoed as cell output.)
DOWNLOAD_DATASET = False

if DOWNLOAD_DATASET:
    try:
        kaggle.api.dataset_download_files(
            'hijest/genre-classification-dataset-imdb',
            path='data/raw/',
            unzip=True
        )
        print("Dataset downloaded successfully!")
    except Exception as exc:
        # Surface the actual cause instead of swallowing everything with a bare `except:`.
        print(f"Dataset already exists or error in downloading: {exc}")

'\ntry:\n    kaggle.api.dataset_download_files(\n        \'hijest/genre-classification-dataset-imdb\',\n        path=\'data/raw/\',\n        unzip=True\n    )\n    print("Dataset downloaded successfully!")\nexcept:\n    print("Dataset already exists or error in downloading")\n'

## 1. Data Loading

In [3]:
# Locations of the raw dataset files: training split, test split, and test labels
DATASET_DIR = 'data/raw/Genre Classification Dataset'
train_file = f'{DATASET_DIR}/train_data.txt'
test_file = f'{DATASET_DIR}/test_data.txt'
test_solution_file = f'{DATASET_DIR}/test_data_solution.txt'

# Data loading function
def load_data(file_path):
    """Parse a ' ::: '-delimited text file into a DataFrame.

    Each usable line is split on ' ::: ' after stripping surrounding whitespace:

    * 3 fields  -> ID, Title, Description (Genre is stored as '')
    * 4+ fields -> ID, Title, Genre, Description; any extra fields are
      re-joined with ' ::: ' so a delimiter inside the description is kept.

    Lines without ':::' are ignored. Lines that contain ':::' but yield fewer
    than 3 fields are malformed and are skipped as well (they used to raise
    IndexError).

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID', 'Title', 'Genre', 'Description'; all values are strings.
    """
    columns = ['ID', 'Title', 'Genre', 'Description']
    rows = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if ':::' not in line:
                continue

            parts = line.strip().split(' ::: ')
            if len(parts) < 3:
                # Not enough fields to form a record: skip instead of crashing.
                continue

            if len(parts) == 3:
                # Unlabelled record: no genre field present.
                record_id, title, description = parts
                genre = ''
            else:
                record_id, title, genre = parts[:3]
                description = ' ::: '.join(parts[3:])

            rows.append((record_id, title, genre, description))

    return pd.DataFrame(rows, columns=columns)


## 2. Data Cleaning

In [4]:
# Parse the three raw files into DataFrames: train split, test split, test labels
train_df, test_df, test_solution_df = (
    load_data(path) for path in (train_file, test_file, test_solution_file)
)

for split_label, frame in (("Training", train_df), ("Test", test_df)):
    print(f"{split_label} data shape: {frame.shape}")

Training data shape: (54214, 4)
Test data shape: (54200, 4)


In [5]:
# Preview the parsed training frame; pandas abbreviates the printout to the first and last rows
print(train_df)


          ID                                       Title        Genre  \
0          1                Oscar et la dame rose (2009)        drama   
1          2                                Cupid (1997)     thriller   
2          3            Young, Wild and Wonderful (1980)        adult   
3          4                       The Secret Sin (1915)        drama   
4          5                      The Unrecovered (2007)        drama   
...      ...                                         ...          ...   
54209  54210                             "Bonino" (1953)       comedy   
54210  54211                 Dead Girls Don't Cry (????)       horror   
54211  54212   Ronald Goedemondt: Ze bestaan echt (2008)  documentary   
54212  54213                    Make Your Own Bed (1944)       comedy   
54213  54214  Nature's Fury: Storm of the Century (2006)      history   

                                             Description  
0      Listening in to a conversation between his doc...  
1    

In [6]:
# Preview the parsed test frame; Genre is blank because load_data stores '' for three-field lines
print(test_df)

          ID                           Title Genre  \
0          1            Edgar's Lunch (1998)         
1          2        La guerra de papá (1977)         
2          3     Off the Beaten Track (2010)         
3          4          Meu Amigo Hindu (2015)         
4          5               Er nu zhai (1955)         
...      ...                             ...   ...   
54195  54196  "Tales of Light & Dark" (2013)         
54196  54197     Der letzte Mohikaner (1965)         
54197  54198             Oliver Twink (2007)         
54198  54199               Slipstream (1973)         
54199  54200       Curitiba Zero Grau (2010)         

                                             Description  
0      L.R. Brane loves his life - his car, his apart...  
1      Spain, March 1964: Quico is a very naughty chi...  
2      One year in the life of Albin and his family o...  
3      His father has died, he hasn't spoken with his...  
4      Before he was known internationally as a marti...

In [None]:
# data preprocessing

# Fetch the NLTK stop-word list used by enhanced_text_processing
# (the setup cell at the top of the notebook requests the same resource).
nltk.download('stopwords')

# Lazily built NLTK helpers shared by every call to enhanced_text_processing.
# Building them once avoids re-reading the stop-word corpus and re-creating
# the lemmatizer for each row passed through DataFrame.apply.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def enhanced_text_processing(text):
    """Clean a piece of text for vectorization.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not a letter, digit, whitespace
         or one of ``. , ! ?`` (characters are removed, not replaced by a space);
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop English stop words and any token of one or two characters;
      5. lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase, then strip special characters but keep basic punctuation and numbers
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s\.\,\!\?]', '', text)

    stop_words, lemmatizer = _text_resources()

    # Tokenize, then filter out stop words and very short tokens
    words = [
        word for word in word_tokenize(text)
        if word not in stop_words and len(word) > 2
    ]

    # Lemmatization (better than stemming)
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

print("Applying enhanced text preprocessing...")
for frame in (train_df, test_df):
    # NOTE(review): despite its name, 'clean_combined' is built from the
    # Description column only - the title is not part of it. Confirm intent.
    frame['clean_combined'] = frame['Description'].apply(enhanced_text_processing)

# Combine title and description for analysis
for frame in (train_df, test_df):
    frame['combined_text'] = frame['Title'] + ' ' + frame['Description']

Applying enhanced text preprocessing...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\YASH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Show the first rows of the training frame, including the text columns added by the preprocessing cell
train_df.head()

In [None]:
# Show the first rows of the test frame, including the text columns added by the preprocessing cell
test_df.head()

In [None]:
# Column dtypes and non-null counts for both splits.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
print("\nBasic Info:")
train_df.info()
print(f"\n{'='*10}test data info{'='*10}")
test_df.info()

In [None]:
# Per-column count of null entries, first for the training split, then for the test split
print("\nMissing Values:", train_df.isnull().sum(), sep="\n")

print(f"\n{'='*10}test data missing values{'='*10}", test_df.isnull().sum(), sep="\n")

In [None]:
# Number of fully duplicated rows in each split
print("\nDuplicate Values:", train_df.duplicated().sum(), sep="\n")
print(f"\n{'='*10}test data duplicate values{'='*10}", test_df.duplicated().sum(), sep="\n")

In [None]:
# Genre distribution analysis

# Number of training samples per genre, most frequent first (value_counts default order)
genre_counts = train_df['Genre'].value_counts()
print("\nGenre Distribution:", genre_counts, sep="\n")

## 4. EDA (Exploratory Data Analysis)

In [None]:
# Visualize genre distribution

# Horizontal bar chart: one bar per genre, bar length = number of training samples
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=genre_counts.values, y=genre_counts.index, palette='viridis', ax=ax)
ax.set_title('Distribution of Movie Genres', fontsize=16, fontweight='bold')
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Genre', fontsize=12)
fig.tight_layout()
fig.savefig('figures/genre_distribution.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# 3. Text Length Analysis
# Character count and whitespace-separated token count of the cleaned text
cleaned_text = train_df['clean_combined']
train_df['text_length'] = cleaned_text.str.len()
train_df['word_count'] = cleaned_text.str.split().str.len()

avg_chars = train_df['text_length'].mean()
avg_words = train_df['word_count'].mean()
print(f"\nAverage Text Length: {avg_chars:.0f} characters")
print(f"Average Word Count: {avg_words:.0f} words")

In [None]:
# Handle class imbalance by keeping only major genres
# (a genre is "major" when it has more than 1000 training samples)
is_major = genre_counts > 1000
major_genres = genre_counts[is_major].index
train_df_filtered = train_df.loc[train_df['Genre'].isin(major_genres)]

print(f"\nGenres after filtering: {len(major_genres)}")
print(f"Training data shape after filtering: {train_df_filtered.shape}")

# Align test data with test solution for major genres:
# keep the labelled rows whose genre is major, then keep the test rows with those IDs
solution_is_major = test_solution_df['Genre'].isin(major_genres)
test_solution_filtered = test_solution_df.loc[solution_is_major]
test_ids_with_solutions = test_solution_filtered['ID']
test_df_filtered = test_df.loc[test_df['ID'].isin(test_ids_with_solutions)]

print(f"Test data shape after filtering: {test_df_filtered.shape}")
print(f"Test solution shape after filtering: {test_solution_filtered.shape}")

In [None]:
# Visualize Description length by genre
# Box plot of the cleaned-text word count, one box per genre.
# The axes are created explicitly and handed to pandas: DataFrame.boxplot(by=...)
# otherwise opens its own figure, which left the plt.figure() created here
# empty and rendered as a blank extra plot.
fig, ax = plt.subplots(figsize=(12, 6))
train_df.boxplot(column='word_count', by='Genre', ax=ax)
ax.set_title('Word Count Distribution by Genre')
fig.suptitle('')  # clear the automatic super-title pandas adds for grouped box plots
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('figures/word_count_by_genre.png', dpi=300, bbox_inches='tight')
plt.show()


## 5. Model Building

In [None]:
# prepare the data

# Features are the cleaned text; targets are the genre labels
X, y = train_df_filtered['clean_combined'], train_df_filtered['Genre']

# NOTE(review): X_test and y_test come from two different frames and are
# paired by position only - this presumes both files list IDs in the same order.
X_test, y_test = test_df_filtered['clean_combined'], test_solution_filtered['Genre']

# Split the data: stratified 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

for split_label, part in (("\nTraining", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_label} set size: {part.shape[0]}")

In [None]:
## 6. Model Training with Hyperparameter Tuning

# Calculate class weights for handling imbalance ('balanced' heuristic on the training labels)
genre_labels = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=genre_labels, y=y_train)
class_weight_dict = dict(zip(genre_labels, class_weights))

# TF-IDF search space shared by every candidate model
tfidf_grid = {
    'tfidf__max_features': [5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
}
# Regularization strengths tried for the two linear models
regularization_grid = {'model__C': [0.1, 1, 10]}

# Define models with hyperparameter grids
# ('model' is the estimator, 'params' the grid for the tfidf -> model pipeline)
models = {
    'LogisticRegression': {
        'model': LogisticRegression(
            random_state=42,
            class_weight=class_weight_dict,
            max_iter=1000,
        ),
        'params': {**tfidf_grid, **regularization_grid},
    },
    'LinearSVC': {
        'model': LinearSVC(
            random_state=42,
            class_weight=class_weight_dict,
            max_iter=10000,
        ),
        'params': {**tfidf_grid, **regularization_grid},
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {**tfidf_grid, 'model__alpha': [0.1, 0.5, 1.0]},
    },
}


In [None]:
print("\n" + "="*50)
print("TRAINING MODELS")
print("="*50)

# Train and evaluate models.
# All "best so far" trackers are reset together so that re-running this cell
# never reports a stale best_model_name and the summary below cannot hit an
# undefined name when no candidate beats the initial accuracy.
best_model = None
best_model_name = None
best_accuracy = 0
results = {}

print("\nTraining models with hyperparameter tuning...")

for name, model_info in models.items():
    print(f"\nTraining {name}...")

    # Create pipeline: TF-IDF features followed by the candidate classifier
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', lowercase=True)),
        ('model', model_info['model'])
    ])

    # Grid search with 3-fold cross-validation over this model's parameter grid
    grid_search = GridSearchCV(
        pipeline,
        model_info['params'],
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    # Best model from grid search
    best_pipeline = grid_search.best_estimator_

    # Validate on the held-out validation split
    val_pred = best_pipeline.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_pred)

    results[name] = {
        'model': best_pipeline,
        'val_accuracy': val_accuracy,
        'best_params': grid_search.best_params_
    }

    print(f"{name} Validation Accuracy: {val_accuracy:.4f}")
    print(f"Best parameters: {grid_search.best_params_}")

    # Update best model (selected by validation accuracy)
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model = best_pipeline
        best_model_name = name

print(f"\nBest model: {best_model_name} with accuracy: {best_accuracy:.4f}")

In [None]:
## 7. Final Model Evaluation

print("\nEvaluating best model on test set...")

# Verify test data alignment (features and labels are paired by position)
print(f"Test features shape: {X_test.shape}")
print(f"Test target shape: {y_test.shape}")

if X_test.shape[0] == y_test.shape[0]:
    # Test the best model
    test_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_pred)

    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_test, test_pred))

    # Confusion Matrix
    # Pass the label order explicitly so the rows/columns always match the
    # tick labels, even if a class is absent from both y_test and test_pred.
    class_labels = best_model.classes_
    plt.figure(figsize=(12, 8))
    cm = confusion_matrix(y_test, test_pred, labels=class_labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels)
    plt.title('Confusion Matrix - Test Set', fontsize=16, fontweight='bold')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig('figures/confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

else:
    # Keep test_accuracy defined so the later results summary prints 'nan'
    # instead of failing with a NameError.
    test_accuracy = float('nan')
    print(f"Error: Test data misaligned! Features: {X_test.shape[0]}, Target: {y_test.shape[0]}")


In [None]:
## 8. Model Persistence

# joblib.dump does not create missing folders, so make sure 'models/' exists
# before writing (no other cell creates it).
os.makedirs('models', exist_ok=True)

# Save the best model (the full tfidf -> classifier pipeline)
model_filename = f'models/best_genre_classifier_{best_model_name}.pkl'
joblib.dump(best_model, model_filename)
print(f"\nBest model saved as: {model_filename}")

# Save the TF-IDF vectorizer separately for potential reuse
# (it is also contained in the pipeline saved above)
vectorizer_filename = 'models/tfidf_vectorizer.pkl'
if hasattr(best_model, 'named_steps'):
    vectorizer = best_model.named_steps['tfidf']
    joblib.dump(vectorizer, vectorizer_filename)
    print(f"TF-IDF vectorizer saved as: {vectorizer_filename}")

In [None]:
## 9. Model Comparison and Results

banner = "=" * 50
print("\n" + banner)
print("MODEL COMPARISON RESULTS")
print(banner)

# One entry per trained model: validation accuracy and the winning grid-search parameters
for model_name, summary in results.items():
    print(f"{model_name}:")
    print(f"  Validation Accuracy: {summary['val_accuracy']:.4f}")
    print(f"  Best Parameters: {summary['best_params']}")

# Overall winner and its scores
print("\nFINAL RESULTS:")
print(f"Best Model: {best_model_name}")
print(f"Validation Accuracy: {best_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
## 10. Prediction Function for New Data

def predict_genre(text, model=best_model):
    """Predict the genre of a new piece of text.

    The text is first cleaned with ``enhanced_text_processing`` - the same
    cleaning that produced the 'clean_combined' column the models were fitted
    on - so the features seen at prediction time match the training features.

    Parameters
    ----------
    text : str
        Raw description to classify.
    model : estimator, optional
        Fitted model exposing ``predict`` (and optionally ``predict_proba``
        and ``classes_``). Defaults to the ``best_model`` that existed when
        this function was defined.

    Returns
    -------
    dict
        ``{'predicted_genre', 'confidence', 'all_probabilities'}`` on success.
        Without ``predict_proba`` the confidence is the placeholder ``1.0``
        and ``all_probabilities`` is empty. If ``model`` is ``None`` or has no
        ``predict`` method, ``{"error": ...}`` is returned instead.
    """
    if model is None or not hasattr(model, 'predict'):
        return {"error": "Model not available for predictions"}

    # Apply the training-time preprocessing to the raw input
    cleaned_text = enhanced_text_processing(text)
    prediction = model.predict([cleaned_text])[0]

    # Get probabilities if available
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba([cleaned_text])[0]
        confidence = max(probabilities)
        all_probs = dict(zip(model.classes_, probabilities))
    else:
        # No probability estimates for this model: placeholder values
        confidence = 1.0
        all_probs = {}

    return {
        'predicted_genre': prediction,
        'confidence': confidence,
        'all_probabilities': all_probs
    }

In [None]:
# sample
# Run the prediction helper on one hand-written description and report the result
sample_text = "A group of friends go on an adventure to find hidden treasure in the mountains."
prediction = predict_genre(sample_text)

print("\nExample Prediction:")
print(f"Text: {sample_text}")
predicted_genre, confidence = prediction['predicted_genre'], prediction['confidence']
print(f"Predicted Genre: {predicted_genre}")
print(f"Confidence: {confidence:.4f}")

print("\nModel training and evaluation completed successfully!")