In [12]:
import pandas as pd

movies = pd.read_csv('../input/movies/movies.csv')  # Assuming you're using the MovieLens dataset
links = pd.read_csv('../input/movies/links.csv')  # Optional, for connecting to external sources like IMDb
tags = pd.read_csv('../input/movies/tags.csv')    # Tags for textual descriptions (Optional)
ratings = pd.read_csv('../input/movies/ratings.csv')  # User ratings data

# Display movie data
print(movies.head())


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [13]:
# Since I couldn't find the file containing description of movies, I am creating a textual description column by combining `title`, `genres`, and `tags`
movies['description'] = movies['title'] + ' ' + movies['genres']

# If `tags` exist, join them to `description`
if 'tags' in locals():
    tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
    movies = movies.merge(tags_grouped, on='movieId', how='left')
    movies['description'] = movies['description'] + ' ' + movies['tag']

# Fill missing values in description
movies['description'].fillna('', inplace=True)

# Display the combined dataset
print(movies[['movieId', 'description']].head())


   movieId                                        description
0        1  Toy Story (1995) Adventure|Animation|Children|...
1        2  Jumanji (1995) Adventure|Children|Fantasy fant...
2        3   Grumpier Old Men (1995) Comedy|Romance moldy old
3        4                                                   
4        5  Father of the Bride Part II (1995) Comedy preg...


In [18]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional: Required for some WordNet functionalities


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [20]:
import os

folder_path = './nltk_data' 
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    print(f"Folder '{folder_path}' created successfully.")
else:
    print(f"Folder '{folder_path}' already exists.")

Folder './nltk_data' created successfully.


In [21]:
import nltk
nltk.data.path.append('./nltk_data')  # Add the path to NLTK data location
nltk.download('stopwords', download_dir='./nltk_data')
nltk.download('wordnet', download_dir='./nltk_data')
nltk.download('omw-1.4', download_dir='./nltk_data')

[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to ./nltk_data...
[nltk_data] Downloading package omw-1.4 to ./nltk_data...


True

In [23]:
import nltk
nltk.download('stopwords', download_dir='./nltk_data')
nltk.download('wordnet', download_dir='./nltk_data')
nltk.download('omw-1.4', download_dir='./nltk_data')


[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ./nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to ./nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [25]:
import os


os.listdir('.')

for root, dirs, files in os.walk("."):
    print(f"Directory: {root}")
    for file in files:
        print(f"  - {file}")


Directory: .
Directory: ./.virtual_documents
Directory: ./nltk_data
Directory: ./nltk_data/corpora
  - wordnet.zip
  - omw-1.4.zip
  - stopwords.zip
Directory: ./nltk_data/corpora/stopwords
  - catalan
  - danish
  - indonesian
  - dutch
  - bengali
  - hinglish
  - tajik
  - italian
  - slovene
  - finnish
  - french
  - english
  - portuguese
  - swedish
  - hungarian
  - basque
  - arabic
  - turkish
  - romanian
  - kazakh
  - chinese
  - norwegian
  - README
  - russian
  - nepali
  - hebrew
  - german
  - greek
  - azerbaijani
  - spanish


In [26]:
import zipfile
import os


nltk_data_dir = './nltk_data/corpora/'

zip_files = ['wordnet.zip', 'omw-1.4.zip', 'stopwords.zip']

for zip_file in zip_files:
    with zipfile.ZipFile(os.path.join(nltk_data_dir, zip_file), 'r') as zip_ref:
        zip_ref.extractall(os.path.join(nltk_data_dir))

print("Unzipped all the files successfully.")


Unzipped all the files successfully.


In [27]:
import nltk

# Check if WordNet is available
print("WordNet available:", nltk.data.find("corpora/wordnet"))

# Check if stopwords are available
print("Stopwords available:", nltk.data.find("corpora/stopwords"))


WordNet available: /kaggle/working/nltk_data/corpora/wordnet
Stopwords available: /usr/share/nltk_data/corpora/stopwords


In [28]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load NLTK resources
nltk.download('stopwords', download_dir='./nltk_data')
nltk.download('wordnet', download_dir='./nltk_data')
nltk.download('omw-1.4', download_dir='./nltk_data')

# Preprocessing function
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Tokenize and remove stopwords, then lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to the 'description' column
if "description" in movies.columns:
    movies['cleaned_description'] = movies['description'].apply(preprocess_text)
    print(movies[['description', 'cleaned_description']].head())
else:
    print("Description column not found in the dataframe.")


[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ./nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to ./nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
                                         description  \
0  Toy Story (1995) Adventure|Animation|Children|...   
1  Jumanji (1995) Adventure|Children|Fantasy fant...   
2   Grumpier Old Men (1995) Comedy|Romance moldy old   
3                                                      
4  Father of the Bride Part II (1995) Comedy preg...   

                                 cleaned_description  
0  Toy Story (1995) Adventure|Animation|Children|...  
1  Jumanji (1995) Adventure|Children|Fantasy fant...  
2   Grumpier Old Men (1995) Comedy|Romance moldy old  
3                                                     
4  Father Bride Part II (1995) Comedy pregnancy r..

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Use top 5000 features

# Transform descriptions into TF-IDF feature matrix
X = tfidf.fit_transform(movies['cleaned_description']).toarray()

# Print the feature matrix
print(X.shape)


(9742, 3787)


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'genre' is the column containing the categories (update based on your dataset)
labels = movies['genres']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Initialize Logistic Regression Model
model = LogisticRegression(max_iter=500)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.14520266803488968
Classification Report:
                                                                 precision    recall  f1-score   support

                                            (no genres listed)       0.00      0.00      0.00         7
                                                        Action       0.00      0.00      0.00        10
                                              Action|Adventure       0.00      0.00      0.00         5
                                    Action|Adventure|Animation       0.00      0.00      0.00         1
                           Action|Adventure|Animation|Children       0.00      0.00      0.00         1
                    Action|Adventure|Animation|Children|Comedy       0.00      0.00      0.00         3
             Action|Adventure|Animation|Children|Comedy|Sci-Fi       0.00      0.00      0.00         1
            Action|Adventure|Animation|Children|Fantasy|Sci-Fi       0.00      0.00      0.00         2
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
def recommend_category(description):
   
    cleaned_description = preprocess_text(description)

   
    vectorized_description = tfidf.transform([cleaned_description]).toarray()

    # Predict the category using the trained model
    predicted_category = model.predict(vectorized_description)
    
    return predicted_category[0]

# Example
new_movie_description = "A young programmer is hired to protect a company from hackers."
category = recommend_category(new_movie_description)
print(f"Recommended Category: {category}")


Recommended Category: Drama


In [33]:
def recommend_category_from_dataset():
 
    movies['cleaned_description'] = movies['description'].apply(preprocess_text)
    vectorized_descriptions = tfidf.transform(movies['cleaned_description']).toarray()


    predicted_categories = model.predict(vectorized_descriptions)
    
 
    movies['predicted_category'] = predicted_categories
    
    return movies[['movieId', 'title', 'predicted_category']]


predicted_movies = recommend_category_from_dataset()


print(predicted_movies.head())


   movieId                               title predicted_category
0        1                    Toy Story (1995)             Comedy
1        2                      Jumanji (1995)              Drama
2        3             Grumpier Old Men (1995)     Comedy|Romance
3        4            Waiting to Exhale (1995)             Comedy
4        5  Father of the Bride Part II (1995)             Comedy
