In [24]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter


def fetch_imdb_reviews(url, timeout=10):
    """Download a reviews page and return the text of every review on it.

    Args:
        url: Address of the reviews page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; defaults to 10.

    Returns:
        A list of strings, one per ``<div>`` whose class attribute is
        ``text show-more__control``, with surrounding whitespace stripped.
        The list is empty when the page contains no such element.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The connection failed or timed out.
    """
    # requests waits indefinitely unless a timeout is given, which would
    # freeze the notebook kernel on a stalled connection.
    response = requests.get(url, timeout=timeout)

    # Surface HTTP errors here instead of parsing an error page and
    # silently returning an empty list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    review_elements = soup.find_all('div', class_='text show-more__control')
    return [review_element.text.strip() for review_element in review_elements]

# IMDb URL for reviews
imdb_url = 'https://www.imdb.com/title/tt0126029/reviews'

# Fetch IMDb reviews
imdb_reviews = fetch_imdb_reviews(imdb_url)

# Stop early with a readable message: an empty scrape would otherwise only
# show up as an error raised from inside the vectorizer below.
if not imdb_reviews:
    raise RuntimeError(
        f"No reviews were scraped from {imdb_url}; "
        "the page layout may have changed or the request was blocked."
    )

# Initialize TF-IDF vectorizer
# max_df=0.8 ignores terms found in more than 80% of the reviews, min_df=5
# ignores terms found in fewer than 5 reviews, and English stop words are
# removed before weighting.
vectorizer = TfidfVectorizer(max_df=0.8, min_df=5, stop_words='english')

# Fit and transform the reviews
X = vectorizer.fit_transform(imdb_reviews)

# Apply K-means clustering
# K-means starts from random centroids; a fixed seed and an explicit number
# of restarts make the clusters (and the terms printed from them) repeatable
# across kernel restarts and scikit-learn versions.
RANDOM_SEED = 42
num_clusters = 15  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=RANDOM_SEED)
kmeans.fit(X)

# Report the ten highest-weighted terms of every cluster centroid
print("Top terms per cluster:")
# argsort is ascending, so reversing each row puts the largest weights first.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
top_words_all_clusters = []  # flat list of every cluster's top terms
for cluster_id in range(num_clusters):
    leading_indices = order_centroids[cluster_id, :10]
    top_words_cluster = [terms[feature_idx] for feature_idx in leading_indices]
    print(f"Cluster {cluster_id}:")
    print(", ".join(top_words_cluster))
    top_words_all_clusters += top_words_cluster

# Tally how often each term made a cluster's top ten and keep the ten most
# frequent ones.
term_tally = Counter(top_words_all_clusters)
top_words_combined = term_tally.most_common(10)
print("\nTop 10 words across all clusters:")
for term, occurrences in top_words_combined:
    print(f"{term}: {occurrences}")

# Echo every scraped review, numbered from 1.
for number, review in enumerate(imdb_reviews, start=1):
    print(f'Review {number}: {review}')


Top terms per cluster:
Cluster 0:
fun, tale, fairy, great, humor, animated, does, time, film, gingerbread
Cluster 1:
shrek, farquaad, swamp, lord, fiona, especially, does, ogre, john, eddie
Cluster 2:
shrek, voiced, fairytale, film, just, wonderful, story, audience, fun, 10
Cluster 3:
funny, wonderful, real, computer, movies, way, good, characters, story, great
Cluster 4:
movie, good, shrek, movies, really, like, animated, dragon, characters, plot
Cluster 5:
film, great, movies, animation, story, say, head, ll, better, plot
Cluster 6:
shrek, donkey, story, jokes, love, tale, fiona, does, know, animation
Cluster 7:
love, shows, audience, doesn, really, funny, characters, story, movie, great
Cluster 8:
ll, head, world, know, little, make, fun, say, kind, way
Cluster 9:
10, dragon, especially, fairy, fairytale, farquaad, film, films, fiona, fun
Cluster 10:
perfect, way, voice, really, fiona, cameron, diaz, movie, shrek, beautiful
Cluster 11:
princess, shrek, beautiful, perfect, doesn, ani

In [28]:
# Echo every scraped review, numbered from 1.
for number, review in enumerate(imdb_reviews, start=1):
    print(f'Review {number}: {review}')

Review 1: For younger viewers, they might not understand the impact of "Shrek" on the film industry and CGI movies. However, being an older guy, I remember how groundbreaking and amazing the computer animation was when the film debuted. It was head and shoulders better than the few CGI movies that had come out in the previous decade...and it finally gave Pixar a serious run for the money. In addition, the plot was very adult...and the film's appeal was much greater than a typical CGI story. Great animation, great story telling and great voice acting, while the movie isn't quite impressive to see today, it still holds up very well and is well worth your time.I could easily say more but this film has over a thousand reviews....so I'll just end it here!
Review 2: Shrek is a wonderful parody of every fairytale story that you can think of, for the kids it has charm, humour and tons of fun, and is a delightful film for them to enjoy. In addition, Shrek has plenty of subtle innuendo and more 

In [None]:
# Echo every scraped review, numbered from 1.
for number, review in enumerate(imdb_reviews, start=1):
    print(f'Review {number}: {review}')
