# Part 1 – CLUSTER ANALYSIS.



In [4]:
import pandas as pd
import numpy as np
import os
from scipy.cluster import hierarchy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

In [5]:
# Read dataset and print first 5 rows
data = pd.read_csv("imdb_dataset.csv", encoding="latin")
df=data
data.head()


Unnamed: 0.1,Unnamed: 0,title,title_type,genre,runtime,mpaa_rating,studio,thtr_rel_year,thtr_rel_month,thtr_rel_day,...,best_dir_win,top200_box,director,actor1,actor2,actor3,actor4,actor5,imdb_url,rt_url
0,1,Filly Brown,Feature Film,Drama,80.0,R,Indomina Media Inc.,2013,4,19,...,no,no,Michael D. Olmos,Gina Rodriguez,Jenni Rivera,Lou Diamond Phillips,Emilio Rivera,Joseph Julian Soria,http://www.imdb.com/title/tt1869425/,//www.rottentomatoes.com/m/filly_brown_2012/
1,2,The Dish,Feature Film,Drama,101.0,PG-13,Warner Bros. Pictures,2001,3,14,...,no,no,Rob Sitch,Sam Neill,Kevin Harrington,Patrick Warburton,Tom Long,Genevieve Mooy,http://www.imdb.com/title/tt0205873/,//www.rottentomatoes.com/m/dish/
2,3,Waiting for Guffman,Feature Film,Comedy,84.0,R,Sony Pictures Classics,1996,8,21,...,no,no,Christopher Guest,Christopher Guest,Catherine O'Hara,Parker Posey,Eugene Levy,Bob Balaban,http://www.imdb.com/title/tt0118111/,//www.rottentomatoes.com/m/waiting_for_guffman/
3,4,The Age of Innocence,Feature Film,Drama,139.0,PG,Columbia Pictures,1993,10,1,...,yes,no,Martin Scorsese,Daniel Day-Lewis,Michelle Pfeiffer,Winona Ryder,Richard E. Grant,Alec McCowen,http://www.imdb.com/title/tt0106226/,//www.rottentomatoes.com/m/age_of_innocence/
4,5,Malevolence,Feature Film,Horror,90.0,R,Anchor Bay Entertainment,2004,9,10,...,no,no,Stevan Mena,Samantha Dark,R. Brandon Johnson,Brandon Johnson,Heather Magee,Richard Glover,http://www.imdb.com/title/tt0388230/,//www.rottentomatoes.com/m/10004684-malevolence/


In [6]:
# Remove unneeded columns and print the new modified DF
dropped = ['Unnamed', 'thtr_rel_month', 'thtr_rel_day', 'best_dir_win', 'top200_box']
df = df.drop(columns=[col for col in dropped if col in df.columns])
print(df)


     Unnamed: 0                        title    title_type  \
0             1                  Filly Brown  Feature Film   
1             2                     The Dish  Feature Film   
2             3          Waiting for Guffman  Feature Film   
3             4         The Age of Innocence  Feature Film   
4             5                  Malevolence  Feature Film   
..          ...                          ...           ...   
646         647           Death Defying Acts  Feature Film   
647         648                   Half Baked  Feature Film   
648         649            Dance of the Dead  Feature Film   
649         650  Around the World in 80 Days  Feature Film   
650         651                          LOL  Feature Film   

                  genre  runtime mpaa_rating                    studio  \
0                 Drama     80.0           R       Indomina Media Inc.   
1                 Drama    101.0       PG-13     Warner Bros. Pictures   
2                Comedy     84.0 

In [None]:
# Use 'title' column for text analysis
titles = data['title']

# Convert text data to TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(titles)

# Determine the optimal number of clusters using the elbow method
num_clusters_range = range(1, 11)
sse_values = [] # SEE = Sum of Squared Errors

# Loop over the range to calculate SSE for each number of clusters
for num_clusters in num_clusters_range:
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(tfidf_matrix)
    sse_values.append(kmeans.inertia_)

# Plot the elbow graph
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(num_clusters_range, sse_values, marker='o')
plt.title('Optimal K using Elbow Method')
plt.xlabel('# of Clusters')
plt.ylabel('Sum of Squared Errors (SSE)')

# Analyze SSE specific number of clusters
specific_num_clusters = [1, 2, 3, 4, 5, 6]
specific_sse_values = []

for num_clusters in specific_num_clusters:
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(tfidf_matrix)
    specific_sse_values.append(kmeans.inertia_)

plt.subplot(1, 2, 2)
plt.plot(specific_num_clusters, specific_sse_values, marker='o', color='orange')
plt.title('SSE for Specific Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Errors (SSE)')

plt.tight_layout()
plt.show()

In [None]:
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_tfidf)



#1.4. For hierarchical clustering, apply single, complete and average link and display the
#dendogram (the plot that visualizes the hierarchy).

linkage_methods = ['single', 'complete', 'average']
for linkage_method in linkage_methods:
    # Create linkage matrix and perform hierarchical clustering
    linkage_matrix = linkage(X_tfidf.toarray(), method=linkage_method)
    plt.figure(figsize=(12, 8))
    dendrogram(linkage_matrix, orientation='top', labels=X.tolist(), distance_sort='descending', show_leaf_counts=True)
    plt.title(f'Hierarchical Clustering Dendrogram ({linkage_method} Link)')
    plt.xlabel('Movie Index')
    plt.ylabel('Distance')
    plt.show()

# Adding KMeans cluster labels to the dataframe & print first few rows    
df['KMeans_Cluster'] = kmeans_labels
print("KMeans  Results:")
print(df[['title', 'KMeans_Cluster']].head())

In [None]:
# Preprocessing numerical columns for hierarchical clustering
numCol = df.select_dtypes(include=['number'])
numCol = numCol.fillna(numCol.mean())
numCol.replace([np.inf, -np.inf], np.nan, inplace=True)
numCol = numCol.dropna()

# Performing hierarchical clustering using 'single' linkage and plotting dendrogram
Z = hierarchy.linkage(numCol.values, 'single')
dn = hierarchy.dendrogram(Z, labels=df['title'].tolist(), orientation='right', truncate_mode='lastp', p=30)
plt.show()

In [None]:
# Performing hierarchical clustering using 'complete' linkage and plotting dendrogram
Z = hierarchy.linkage(numCol.values, 'complete')
dn = hierarchy.dendrogram(Z, labels=df['title'].tolist(), orientation='right', truncate_mode='lastp', p=30)

# Performing hierarchical clustering using 'average' linkage and plotting dendrogram
Z = hierarchy.linkage(numCol.values, 'average')
dn = hierarchy.dendrogram(Z, labels=df['title'].tolist(), orientation='right', truncate_mode='lastp', p=30)

# Part 2 – TEXT MINING

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Create DF from sample text data for text mining
text_data = [
    'Now for manners use has company believe parlors.',
    'Least nor party who wrote while did. Excuse formed as is agreed admire so on result parish.',
    'Put use set uncommonly announcing and travelling. Allowance sweetness direction to as necessary.',
    'Principle oh explained excellent do my suspected conveying in.',
    'Excellent you did therefore perfectly supposing described. ',
    'Its had resolving otherwise she contented therefore.',
    'Afford relied warmth out sir hearts sister use garden.',
    'Men day warmth formed admire former simple.',
    'Humanity declared vicinity continue supplied no an. He hastened am no property exercise of.',
    'Dissimilar comparison no terminated devonshire no literature on. Say most yet head room such just easy.'
]

df = pd.DataFrame({'Text': text_data})

# 2.3. Your task is to create a count vector and a tfidf vector on the given data (refer to 2.
#In the resources below)

# Initializing a CountVectorizer and transforming the text data into count vectors
count_vectorizer = CountVectorizer()
count_vector = count_vectorizer.fit_transform(text_data)
tfidf_vectorizer = TfidfVectorizer()
tfidf_vector = tfidf_vectorizer.fit_transform(text_data)

# 2.4. Display the count vector and tfidf vector and explain the usage of tfidf.
print("Count V:")
print(pd.DataFrame(count_vector.toarray(), columns=count_vectorizer.get_feature_names_out()))
print("\nTF-IDF V")
print(pd.DataFrame(tfidf_vector.toarray(), columns=tfidf_vectorizer.get_feature_names_out()))


2.4  explain the usage of tfidf.

Term Frequency-Inverse Document Frequency, measures a word's significance in a document compared to a corpus. It combines term frequency (word occurrence) and inverse document frequency (uniqueness across the corpus). Widely used in text mining and information retrieval, TF-IDF helps identify important words in documents for various natural language processing (NLP) applications.

# Part 3 - ANN 

# Part 4 - Report w/ Summarized Findings
### [In Progress] Halfway done, waiting for all parts to be done to summarize

In [5]:
# Opening and displaying the PDF to the report (instructions say to make PDF)
from IPython.display import IFrame
pdf_path = 'A4 Part 4.pdf'
IFrame(pdf_path, width=800, height=600)
