In [124]:
import numpy as np
import os
import pandas as pd
import requests
import time
import zipfile

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import clear_output
from openai import AzureOpenAI

pd.set_option('display.max_colwidth', None)

## List all datasets

In [106]:
!kaggle datasets list

ref                                                             title                                              size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
rahulvyasm/netflix-movies-and-tv-shows                          Netflix Movies and TV Shows                         1MB  2024-04-10 09:48:38          12918        274  1.0              
sahirmaharajj/school-student-daily-attendance                   School Student Daily Attendance                     2MB  2024-04-29 19:29:56           1679         40  1.0              
jaidalmotra/pokemon-dataset                                     Pokemon Dataset                                    19KB  2024-04-30 10:38:36           1116         36  1.0              
fahadrehman07/retail-transaction-dataset                        Retail

## Downloading a dataset

In [6]:
!kaggle datasets download -d "rahulvyasm/netflix-movies-and-tv-shows"

Dataset URL: https://www.kaggle.com/datasets/rahulvyasm/netflix-movies-and-tv-shows
License(s): CC0-1.0
Downloading netflix-movies-and-tv-shows.zip to /Users/madelonhulsebos/Documents/data_search/HITS-system-implementation/data
 74%|████████████████████████████▏         | 1.00M/1.35M [00:00<00:00, 1.88MB/s]
100%|██████████████████████████████████████| 1.35M/1.35M [00:00<00:00, 2.36MB/s]


In [8]:
archive = zipfile.ZipFile('netflix-movies-and-tv-shows.zip', 'r')

In [98]:
# Unfortunately only the dataset is available, not the metadata
archive.namelist()

['netflix_titles.csv']

In [99]:
archive.extract('netflix_titles.csv')

'/Users/madelonhulsebos/Documents/data_search/HITS-system-implementation/data/netflix_titles.csv'

In [104]:
dataset = pd.read_csv("netflix_titles.csv", encoding='unicode_escape').dropna(axis=1)

In [105]:
dataset

Unnamed: 0,show_id,type,title,release_year,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,2020,Documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
1,s2,TV Show,Blood & Water,2021,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."
2,s3,TV Show,Ganglands,2021,"Crime TV Shows, International TV Shows, TV Action & Adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
3,s4,TV Show,Jailbirds New Orleans,2021,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series."
4,s5,TV Show,Kota Factory,2021,"International TV Shows, Romantic TV Shows, TV Comedies","In a city of coaching centers known to train Indiaâs finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life."
...,...,...,...,...,...,...
8804,s8805,Movie,Zombieland,2009,"Comedies, Horror Movies","Looking to survive in a world taken over by zombies, a dorky college student teams with an urban roughneck and a pair of grifter sisters."
8805,s8806,Movie,Zoom,2006,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero must train a new crop of youthful saviors when the military preps for an attack by a familiar villain."
8806,s8807,Movie,Zubaan,2015,"Dramas, International Movies, Music & Musicals","A scrappy but poor boy worms his way into a tycoon's dysfunctional family, while facing his fear of music and the truth about his past."
8807,s8808,TV Show,Parasyte: The Grey,2024,"Sci-fi, Horror, Action","A new breed of parasitic aliens arrive on Earth and infiltrate human bodies, leaving a trail of terror in their wake."


## Get the metadata

As only the dataset itself is available (we should check if a dataset, or set of datasets) is a CSV, if so, we can scrape the page for metadata.

In [15]:
r = requests.get("https://www.kaggle.com/datasets/rahulvyasm/netflix-movies-and-tv-shows")

In [39]:
bs = BeautifulSoup(r.text)

In [53]:
bs


<!DOCTYPE html>

<html lang="en">
<head>
<title>Netflix Movies and TV Shows | Kaggle</title>
<meta charset="utf-8"/>
<meta content="index, follow" name="robots"/>
<meta content="Exploring the Depths of Netflix: A Comprehensive Dataset of Movies and TV Shows" name="description"/>
<meta content="no-cache" name="turbolinks-cache-control"/>
<meta content="arts and entertainment,movies and tv shows,data visualization,exploratory data analysis,artificial intelligence" name="keywords"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=5.0, minimum-scale=1.0" name="viewport"/>
<meta content="#008ABC" name="theme-color">
<script nonce="wXHGw2J4q0APeqx8Ri51bw==" type="text/javascript">
    window["pageRequestStartTime"] = 1715004753223;
    window["pageRequestEndTime"] = 1715004753337;
    window["initialPageLoadStartTime"] = new Date().getTime();
  </script>
<script async="" defer="" id="gsi-client" nonce="wXHGw2J4q0APeqx8Ri51bw==" src="https://accounts.google.com/gsi/client

In [95]:
dataset_metadata = []
for script_element in bs.select('script'):
    try:
        metadata = json.loads(script_element.string)
        dataset_metadata.append(metadata)
    except Exception as e:
        continue

In [88]:
dataset_metadata

[{'@context': 'http://schema.org/',
  '@type': 'Dataset',
  'name': 'Netflix Movies and TV Shows',
  'description': "Netflix stands as a leading force in the realm of media and video streaming. With a staggering array of over 8,000 movies and TV shows accessible on their platform, as of mid-2021, their global subscriber count exceeds 200 million. This tabulated dataset comprehensively catalogues all offerings on Netflix, including vital details such as cast, directors, ratings, release year, duration, and more.\n\n# Dataset Overview:\nThe Netflix Titles dataset is a comprehensive compilation of movies and TV shows available on Netflix, covering various aspects such as the title type, director, cast, country of production, release year, rating, duration, genres (listed in), and a brief description. This dataset is instrumental for analyzing trends in Netflix content, understanding genre popularity, and examining the distribution of content across different regions and time periods.\n\n#

In [90]:
dataset_metadata[0].keys()

dict_keys(['@context', '@type', 'name', 'description', 'url', 'version', 'keywords', 'license', 'identifier', 'includedInDataCatalog', 'creator', 'distribution', 'commentCount', 'dateModified', 'discussionUrl', 'alternateName', 'isAccessibleForFree', 'thumbnailUrl', 'interactionStatistic'])

In [96]:
dataset_metadata[0]["name"]

'Netflix Movies and TV Shows'

In [92]:
dataset_metadata[0]["description"]

"Netflix stands as a leading force in the realm of media and video streaming. With a staggering array of over 8,000 movies and TV shows accessible on their platform, as of mid-2021, their global subscriber count exceeds 200 million. This tabulated dataset comprehensively catalogues all offerings on Netflix, including vital details such as cast, directors, ratings, release year, duration, and more.\n\n# Dataset Overview:\nThe Netflix Titles dataset is a comprehensive compilation of movies and TV shows available on Netflix, covering various aspects such as the title type, director, cast, country of production, release year, rating, duration, genres (listed in), and a brief description. This dataset is instrumental for analyzing trends in Netflix content, understanding genre popularity, and examining the distribution of content across different regions and time periods.\n\n## Key Details:\n- **Total Entries:** The dataset contains 8,809 entries, each representing a unique movie or TV show

In [93]:
dataset_metadata[0]["interactionStatistic"]

[{'@type': 'InteractionCounter',
  'interactionType': 'http://schema.org/CommentAction',
  'userInteractionCount': 1},
 {'@type': 'InteractionCounter',
  'interactionType': 'http://schema.org/DownloadAction',
  'userInteractionCount': 12892},
 {'@type': 'InteractionCounter',
  'interactionType': 'http://schema.org/ViewAction',
  'userInteractionCount': 48737},
 {'@type': 'InteractionCounter',
  'interactionType': 'http://schema.org/LikeAction',
  'userInteractionCount': 271}]

In [None]:
dataset_metadata[0]["interactionStatistic"]

In [94]:
dataset_metadata[0]["keywords"]

['subject, arts and entertainment',
 'subject, arts and entertainment, movies and tv shows',
 'technique, data visualization',
 'technique, exploratory data analysis',
 'subject, science and technology, computer science, artificial intelligence']

In [None]:
## Extract metadata 

In [134]:
load_dotenv()

True

In [135]:
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

In [144]:
def generate_task_keyword_queries(text: str):
    response = client.chat.completions.create(
        model="gpt-4-infer-model",
        messages = [
          {
              "role":"system",
              "content":
              """
              For the provided text, extract a dictionary with two keys "keywords" and "tasks" with the following content:
              keywords: a list of 5 keywords reflecting the contents of the dataset (if a dataset description is provided),
              tasks: a list of 3 analytics tasks that can be performed with the described dataset, e.g. develop ML model to predict XYZ.
              """
          },
          {
              "role":"user",
               "content":f"The dataset description string is: {text}"
          }
        ],
        temperature=0,
        max_tokens=200,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

    task_keyword_queries = eval(response.to_dict()["choices"][0]["message"]["content"])

    return task_keyword_queries

In [145]:
generate_task_keyword_queries(dataset_metadata[0]["description"])

{'keywords': ['Netflix Titles dataset',
  'content analysis',
  'recommendation systems',
  'market analysis',
  'streaming entertainment'],
 'tasks': ['Analyze genre popularity over time',
  'Develop a recommendation system based on content similarity and user preference',
  "Analyze Netflix's content strategy focusing on international markets, genre diversification, and investment in original content"]}