In [None]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
df_movies = pd.read_csv("df_movies.csv")

# Data cleaning required after importing of CSV.
# The "genres" column appears to hold the text of a Python list (e.g. "['Drama', 'Comedy']"),
# so strip spaces, brackets and quotes, then split on commas to get a real list per movie.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the brackets and quotes in place.
df_movies["genres"] = (
  df_movies["genres"]
  .str.replace(r"[ \[\]']", "", regex=True)
  .str.split(",")
  # Splitting "" yields [""] and a missing value stays NaN; normalize both to a clean list
  # so later list concatenation and `in` checks never see an empty label or a float.
  .apply(lambda genres: [g for g in genres if g] if isinstance(genres, list) else [])
)

# Rows without an overview cannot be vectorized. Drop them and renumber the rows 0..n-1,
# because later cells look movies up by position. reset_index() keeps the previous row
# labels in an "index" column.
df_movies = df_movies.dropna(subset=["overview"]).reset_index()

In [None]:
# Keep the overview text in its own Series; find_genre and get_proportion read this global
descriptions = df_movies.loc[:, "overview"]
descriptions

0       Taisto Kasurinen is a Finnish coal miner whose...
1       An episode in the life of Nikander, a garbage ...
2       While racing to a boxing match, Frank, Mike, J...
3       Timo Novotny labels his new project an experim...
4       Princess Leia is captured and held hostage by ...
                              ...                        
2563    Set in 1977, back when sex was safe, pleasure ...
2564    A family spends three summer days in a beautif...
2565    On the afternoon of July 1, 1981, Los Angeles ...
2566    A nurse from the Ukraine searches for a better...
2567    Anthony Richmond schemes to get the fortune of...
Name: overview, Length: 2568, dtype: object

In [None]:
# Import necessary packages for prediction
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Defining a function to predict the genre given a description and many other parameters
def find_genre(desc, vectorization, ngrams, num_neighbors, original):
  """Predict a genre for `desc` by majority vote among the most similar overviews.

  The global `descriptions` Series is vectorized, `desc` is projected into the same
  vocabulary, and the genres of the movies whose overviews have the highest cosine
  similarity to `desc` are tallied.

  Args:
    desc: Text of the description to classify.
    vectorization: "count" for raw term counts or "tfidf" for un-normalized TF-IDF.
    ngrams: (min_n, max_n) tuple forwarded as the vectorizer's `ngram_range`.
    num_neighbors: Size of the neighborhood counted *including* the top-ranked row,
      so `num_neighbors - 1` movies actually vote in both modes.
    original: True when `desc` is new text; the top `num_neighbors - 1` rows vote.
      False when `desc` comes from `descriptions`; the top-ranked row is assumed to
      be the movie itself and is skipped.

  Returns:
    The most frequent genre among the voting movies, or None if they carry no genres.

  Raises:
    ValueError: If `vectorization` is neither "count" nor "tfidf".
  """

  # Use the vectorization method of choice
  if vectorization == "count":
    vec = CountVectorizer(ngram_range=ngrams)
  elif vectorization == "tfidf":
    vec = TfidfVectorizer(norm=None, ngram_range=ngrams)
  else:
    # Fail loudly: returning None here would silently score every prediction as wrong
    raise ValueError(f"Invalid Vectorization: {vectorization!r}")

  # Learn the vocabulary and build the sparse term matrix of every overview in one pass
  tf_sparse = vec.fit_transform(descriptions)

  # Project the given description into the same vocabulary (a single-row sparse matrix)
  desc_tf_sparse = vec.transform([desc])

  # One column of similarities (row label = movie position), most similar first
  cos_sim = pd.DataFrame(cosine_similarity(desc_tf_sparse, tf_sparse)).T.sort_values(by=0, ascending=False)

  # Pick the voters. Both branches yield num_neighbors - 1 movies.
  if original:
    index_list = cos_sim.iloc[0:num_neighbors - 1].index
  else:
    # NOTE: assumes the movie's own overview ranks first; exact duplicates or ties
    # elsewhere in the data could break that assumption.
    index_list = cos_sim.iloc[1:num_neighbors].index

  # Tally the genres of every voter, ignoring missing genre lists and empty labels
  genre_column = df_movies["genres"]
  genre_list = []
  for index in index_list:
    movie_genres = genre_column.iloc[index]
    if isinstance(movie_genres, (list, tuple)):
      genre_list.extend(genre for genre in movie_genres if genre)

  # Return the most common genre (None when there was nothing to count)
  winner = Counter(genre_list).most_common(1)
  return winner[0][0] if winner else None

In [None]:
# Row-wise mapper: was the predicted genre one of the movie's listed genres?
def prediction_correct(row):
  """Return True when `row.predicted_genre` is contained in `row.genres`."""
  guess = row.predicted_genre
  listed_genres = row.genres
  return guess in listed_genres

In [None]:
# Define a function to calculate the proportion of genres that were correctly predicted in df_movies
def get_proportion(vectorization, ngrams, num_neighbors):
  """Score one model configuration against every movie in `df_movies`.

  Each movie's description is passed to `find_genre` with `original=False`, and the
  prediction counts as correct when it appears in that movie's `genres`.

  Side effects: (over)writes the "predicted_genre" and "prediction_correct" columns
  of the global `df_movies`.

  Args:
    vectorization: Vectorization method name forwarded to `find_genre`.
    ngrams: N-gram range forwarded to `find_genre`.
    num_neighbors: Neighborhood size forwarded to `find_genre`.

  Returns:
    The mean of the 0/1 "prediction_correct" column, i.e. the share of correct predictions.
  """
  movie_count = df_movies.shape[0]

  # One prediction per movie, looked up by its label in `descriptions`
  predictions = [
    find_genre(descriptions[position], vectorization, ngrams, num_neighbors, False)
    for position in range(movie_count)
  ]

  # Store the predictions next to the true genres
  df_movies["predicted_genre"] = predictions

  # Flag each row 1/0 depending on whether its prediction was right
  hits = df_movies.apply(prediction_correct, axis=1)
  df_movies["prediction_correct"] = hits.astype(int)

  # The mean of a 0/1 column is the proportion of correct predictions
  return df_movies["prediction_correct"].mean()

In [None]:
# Define a function to find the prediction correctness for various combinations of vectorization, ngram range, and num neighbors
def optimize_model(frequency_list, ngram_list, neighbor_list):
  """Evaluate every combination of the given settings and tabulate the accuracy.

  Args:
    frequency_list: Vectorization method names to try (passed to `get_proportion`).
    ngram_list: N-gram ranges to try.
    neighbor_list: Neighbor counts to try.

  Returns:
    A DataFrame with one row per (frequency, ngram, neighbor) combination, in nested-loop
    order, and the columns "Frequency Method", "Ngram Range", "Number Neighbors" and
    "Proportion Correct". It is empty (but keeps those columns) when any input list is empty.
  """
  columns = ["Frequency Method", "Ngram Range", "Number Neighbors", "Proportion Correct"]

  # Collect all rows first and build the frame once: DataFrame.append was removed in
  # pandas 2.0, and growing a frame row by row copies it on every iteration.
  records = [
    dict(zip(columns, (frequency, ngram, neighbor, get_proportion(frequency, ngram, neighbor))))
    for frequency in frequency_list
    for ngram in ngram_list
    for neighbor in neighbor_list
  ]

  # Passing `columns` fixes the column order and keeps the header for an empty grid
  return pd.DataFrame(records, columns=columns)

In [None]:
# Compare the two vectorization methods (TF-IDF vs. raw counts) over a small grid
df_count = optimize_model(
  frequency_list=["tfidf", "count"],
  ngram_list=[(1, 1), (1, 2)],
  neighbor_list=[9, 10, 11],
)

In [None]:
# Compare n-gram ranges with the vectorizer (TF-IDF) and neighbor count (10) held fixed
df_ngram = optimize_model(
  frequency_list=["tfidf"],
  ngram_list=[(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
  neighbor_list=[10],
)

In [None]:
# Compare neighbor counts 5 through 20 with TF-IDF unigrams held fixed
df_neighbors = optimize_model(
  frequency_list=["tfidf"],
  ngram_list=[(1, 1)],
  neighbor_list=list(range(5, 21)),
)

In [None]:
# Search the grid around the best settings to find the overall optimal model
df_overall = optimize_model(
  frequency_list=["tfidf"],
  ngram_list=[(1, 1), (1, 2), (1, 3)],
  neighbor_list=[9, 10, 11],
)

In [None]:
from google.colab import files

# Download all the dataframes we just created
exports = [
  ('overall.csv', df_overall),
  ('ngram.csv', df_ngram),
  ('count.csv', df_count),
  ('neighbors.csv', df_neighbors),
]

for csv_name, frame in exports:
  # Write the CSV first, then ask Colab to download that file
  frame.to_csv(csv_name)
  files.download(csv_name)


# For Presentation:

In [None]:
def find_genre(desc, vectorization="tfidf", ngrams=(1, 1), num_neighbors=10, original=True):
  """Predict a genre for `desc` by majority vote among the most similar overviews.

  With only `desc` given, this uses the presentation settings: un-normalized TF-IDF
  on unigrams, with the 9 most similar movies voting. The extra parameters mirror the
  earlier five-argument definition of `find_genre`, so redefining the function here
  no longer breaks `get_proportion`, which calls it with five positional arguments.

  Args:
    desc: Text of the description to classify.
    vectorization: "count" for raw term counts or "tfidf" for un-normalized TF-IDF.
    ngrams: (min_n, max_n) tuple forwarded as the vectorizer's `ngram_range`.
    num_neighbors: Size of the neighborhood counted *including* the top-ranked row,
      so `num_neighbors - 1` movies actually vote.
    original: True when `desc` is new text; the top `num_neighbors - 1` rows vote.
      False when `desc` comes from `descriptions`; the top-ranked row is assumed to
      be the movie itself and is skipped.

  Returns:
    The most frequent genre among the voting movies, or None if they carry no genres.

  Raises:
    ValueError: If `vectorization` is neither "count" nor "tfidf".
  """
  if vectorization == "count":
    vec = CountVectorizer(ngram_range=ngrams)
  elif vectorization == "tfidf":
    vec = TfidfVectorizer(norm=None, ngram_range=ngrams)
  else:
    raise ValueError(f"Invalid Vectorization: {vectorization!r}")

  # Learn the vocabulary and vectorize every overview, then project `desc` into it
  tf_sparse = vec.fit_transform(descriptions)
  desc_tf_sparse = vec.transform([desc])

  # One column of similarities (row label = movie position), most similar first
  cos_sim = pd.DataFrame(cosine_similarity(desc_tf_sparse, tf_sparse)).T.sort_values(by=0, ascending=False)

  # Both branches yield num_neighbors - 1 voters; the defaults give iloc[0:9]
  if original:
    index_list = cos_sim.iloc[0:num_neighbors - 1].index
  else:
    index_list = cos_sim.iloc[1:num_neighbors].index

  # Tally the genres of every voter, ignoring missing genre lists and empty labels
  genre_column = df_movies["genres"]
  genre_list = []
  for index in index_list:
    movie_genres = genre_column.iloc[index]
    if isinstance(movie_genres, (list, tuple)):
      genre_list.extend(genre for genre in movie_genres if genre)

  # None instead of an IndexError when there was nothing to count
  winner = Counter(genre_list).most_common(1)
  return winner[0][0] if winner else None

# Quirky Predictions

In [None]:
# Free-text prompt with space vocabulary
find_genre(desc="Space odyssey to the stars")

'ScienceFiction'

In [None]:
# Free-text prompt built around the word "funny"
find_genre(desc="A flock of funny flapjacks")

'Comedy'

In [None]:
# Free-text prompt that is not a movie description at all
find_genre(desc="The FitnessGram PACER Test is a multistage aerobic capacity test that progressively gets more difficult as it continues.")

'Drama'