In [3]:
import pandas as pd
from transformers import AutoTokenizer, GPT2LMHeadModel, pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from collections import defaultdict

In [4]:
!pip install datasets
!pip install langchain
!pip install ctransformers
!pip install nltk
!pip install faiss-cpu
!pip install faiss-gpu

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [6]:
from datasets import load_dataset
dataset = load_dataset("hugginglearners/netflix-shows", split="train")
df = pd.DataFrame(dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.40M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
df = pd.DataFrame(dataset)
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [8]:
# Feature extraction
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(df['description'])

In [9]:
# Clustering with KMeans
num_clusters = 15
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster_label'] = kmeans.fit_predict(X_tfidf)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df[['description', 'cluster_label']], df['listed_in'], test_size=0.2, random_state=42)

# Enhance feature space with cluster labels
X_train_tfidf = vectorizer.transform(X_train['description'])
X_test_tfidf = vectorizer.transform(X_test['description'])



In [17]:
#Method 1: Generating decriptions using tokenization and then determining whether they fit the model
from transformers import AutoTokenizer, GPT2LMHeadModel, pipeline
from transformers import GPT2Tokenizer

# Logistic Regression to refine clustering
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)
df['predicted_genre'] = clf.predict(vectorizer.transform(df['description']))

# Generate descriptions using GPT-2
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [19]:
#Method 2: Generating buckets, printing decriptions, then finding accuracies of all the buckets
# Create genre buckets
genre_lists = df['listed_in'].str.split(", ")
all_genres = set()
for genres in genre_lists:
    all_genres.update(genres)
bucket_queue = {genre: [] for genre in all_genres}

In [20]:
def generate_descriptions(genres, num_descriptions=3):
    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

    genre_descriptions = {}
    for genre in genres:
        prompt = f"The genre is {genre}. Describe a typical {genre} TV show: "
        descriptions = generator(prompt, max_length=50, num_return_sequences=num_descriptions)
        genre_descriptions[genre] = [desc['generated_text'] for desc in descriptions]

    return genre_descriptions

genre_descriptions = generate_descriptions(all_genres)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for 

In [39]:
def print_genre_descriptions(genre_descriptions):
    print("Generated Descriptions by Genre:\n")
    for genre, descriptions in genre_descriptions.items():
        print(f"Genre: {genre}")
        for i, description in enumerate(descriptions, start=1):
            print(f"  Description {i}: {description.strip()}")
        print("\n")
print_genre_descriptions(genre_descriptions)


Generated Descriptions by Genre:

Genre: TV Dramas
  Description 1: The genre is TV Dramas. Describe a typical TV Dramas TV show:  (a) A group of very young teenagers that come together in a city that has seen an awful lot of riots.  (b) A mob
  Description 2: The genre is TV Dramas. Describe a typical TV Dramas TV show:  Most of its shows are written by people who've never watched a TV show before, so they're familiar with a lot of tropes from the original shows (
  Description 3: The genre is TV Dramas. Describe a typical TV Dramas TV show:  episode 1 (9 episodes).  2 (12 episodes) In-House TV Episode (9 episodes)
Video Games:  P


Genre: Crime TV Shows
  Description 1: The genre is Crime TV Shows. Describe a typical Crime TV Shows TV show:  You'll also appreciate the different genres including  Crime   TV,  Casual TV,  Arrested Development,  Action,  TV Originals,
  Description 2: The genre is Crime TV Shows. Describe a typical Crime TV Shows TV show:  Criminal Minds, Crime Action-A

In [27]:
!pip install fuzzywuzzy


[31mERROR: Could not find a version that satisfies the requirement as (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for as[0m[31m
[0m

In [24]:
def determine_closest_genre(desc, genre_desc_map):
    print("Determining closest genre...")
    highest_similarity = 0
    closest_genre = None
    for genre, descs in genre_desc_map.items():
        for gen_desc in descs:
            sim_score = fuzz.token_sort_ratio(desc.lower(), gen_desc.lower())
            if sim_score > highest_similarity:
                highest_similarity = sim_score
                closest_genre = genre
    return closest_genre


In [34]:
for idx, row in df.iterrows():
    assigned_genre = determine_closest_genre(row['description'], genre_descriptions)
    df.at[idx, 'generated_genre'] = assigned_genre

Determining closest genre...
Determining closest genre...
Determining closest genre...
Determining closest genre...
Determining closest genre...
Determining closest genre...
Determining closest genre...
Determining closest genre...
Determining closest genre...
Determining closest genre...
Determining closest genre...


KeyboardInterrupt: 

In [36]:
print("Evaluating accuracy...")
accuracy_dict = defaultdict(int)
count_dict = defaultdict(int)

for idx, row in df.iterrows():
    real_genres = row['listed_in'].split(", ")
    if row['generated_genre'] in real_genres:
        accuracy_dict[row['generated_genre']] += 1
    for genre in real_genres:
        count_dict[genre] += 1

for genre in count_dict:
    if count_dict[genre] > 0:
        print(f"Accuracy for {genre}: {accuracy_dict[genre] / count_dict[genre]:.2f}")

Evaluating accuracy...
Accuracy for Documentaries: 0.02
Accuracy for International TV Shows: 0.00
Accuracy for TV Dramas: 0.04
Accuracy for TV Mysteries: 0.00
Accuracy for Crime TV Shows: 0.01
Accuracy for TV Action & Adventure: 0.01
Accuracy for Docuseries: 0.09
Accuracy for Reality TV: 0.01
Accuracy for Romantic TV Shows: 0.00
Accuracy for TV Comedies: 0.08
Accuracy for TV Horror: 0.05
Accuracy for Children & Family Movies: 0.00
Accuracy for Dramas: 0.02
Accuracy for Independent Movies: 0.01
Accuracy for International Movies: 0.00
Accuracy for British TV Shows: 0.00
Accuracy for Comedies: 0.02
Accuracy for Spanish-Language TV Shows: 0.02
Accuracy for Thrillers: 0.04
Accuracy for Romantic Movies: 0.04
Accuracy for Music & Musicals: 0.02
Accuracy for Horror Movies: 0.04
Accuracy for Sci-Fi & Fantasy: 0.02
Accuracy for TV Thrillers: 0.00
Accuracy for Kids' TV: 0.05
Accuracy for Action & Adventure: 0.01
Accuracy for TV Sci-Fi & Fantasy: 0.04
Accuracy for Classic Movies: 0.01
Accuracy for

In [37]:
#print("Re-assigning genres based on primary genre match...")
for idx, row in df.iterrows():
    main_genre = row['listed_in'].split(', ')[0]
    if row['generated_genre'] != main_genre:
        df.at[idx, 'corrected_genre'] = main_genre
    else:
        df.at[idx, 'corrected_genre'] = row['generated_genre']