In [9]:
import numpy as np
import pandas as pd
import tkinter as tk
from tkinter import filedialog, messagebox


In [10]:

data = pd.read_csv("imdb_top_2000_movies.csv")
data.drop(columns=["Release Year", "Metascore", "Votes", "Genre", "Gross", "Cast", "Director"], inplace=True)


*Removing Outliers Before Start K-Mean Because It's Sensentive To Outliers*

In [11]:
def Outliers_IQR(data):
    Q1 = np.percentile(data[ "Duration"], 25)
    Q3 = np.percentile(data[ "Duration"], 75)
    IQR = Q3 - Q1
    LowerBound = Q1 - 1.5 * IQR
    UpperBound = Q3 + 1.5 * IQR
    OutliersIQR = (data[ "Duration"] < LowerBound) | (data[ "Duration"] > UpperBound)
    Outliers = data[OutliersIQR]
    print(F"({len(Outliers)} movies)")
    Data_Without_Outliers = data[~OutliersIQR]
    return Data_Without_Outliers,Outliers
    

**Calculate Distance**

In [12]:
def euclidean_distance(point, centroid):
    return np.sqrt(np.sum((point - centroid) ** 2))


**Get The Initial CENTROIDS**

In [13]:
def Initialize_Centroids(data, num_clusters):
    imdb_ratings = data[ "Duration"]
    movie_names = data['Movie Name']
    
    centroids_idx = np.random.choice(len(imdb_ratings), size=num_clusters, replace=False)
    centroids = imdb_ratings.iloc[centroids_idx].values
    centroid_movie_names = movie_names.iloc[centroids_idx].values
    
    return centroids, centroid_movie_names

*Get The Nearest Movie To The Centroid*

In [14]:
def assign_to_clusters(data, centroids):
    movie_names = []
    imdb_ratings = []
    cluster_assignments = []

    for index, row in data.iterrows():
        movie = row['Movie Name']
        rating = row['Duration']

        movie_names.append(movie)
        imdb_ratings.append(rating)

        distances = [euclidean_distance(rating, centroid) for centroid in centroids]

        nearest_centroid_index = np.argmin(distances)

        cluster_assignments.append(nearest_centroid_index)

    df = pd.DataFrame({
        'Movie Name': movie_names,
        'Duration': imdb_ratings,
        'Cluster': cluster_assignments
    })

    return df


**K-MEAN ALGORITHM**

In [15]:
def k_means(data, num_clusters, max_iterations=100):
    centroids, _ = Initialize_Centroids(data, num_clusters)
    
    for iteration in range(max_iterations):
        assigned_clusters = assign_to_clusters(data, centroids)
        
        new_centroids = []
        cluster_movie_names = [[] for _ in range(num_clusters)] 
        
        for i in range(num_clusters):
            cluster_data = assigned_clusters[assigned_clusters['Cluster'] == i]
            if len(cluster_data) > 0:
                new_centroid = cluster_data[ "Duration"].mean()
                cluster_movie_names[i] = cluster_data['Movie Name'].tolist()  
            else:
                new_centroid = centroids[i]
            new_centroids.append(new_centroid)
        
        new_centroids = np.array(new_centroids)
        
        if np.array_equal(centroids, new_centroids):
            break
        
        print(f"Iteration {iteration + 1}:")
        print("Old Centroids:", centroids)
        print("New Centroids:", new_centroids)
        
        centroids = new_centroids
    
    assigned_clusters['Movie Names'] = [cluster_movie_names[c] for c in assigned_clusters['Cluster']]
    
    return centroids, assigned_clusters


In [16]:
data_without_outliers, _ = Outliers_IQR(data)
num_clusters = 3  # Set the number of clusters
final_centroids, final_clusters = k_means(data_without_outliers, num_clusters)
InitializeCentroids, _ =Initialize_Centroids(data_without_outliers, num_clusters)
print(InitializeCentroids,_)
print("Final Centroids:")
print(final_centroids)
print("\nFinal Clusters:")
print(final_clusters)




(72 movies)
Iteration 1:
Old Centroids: [ 90  88 112]
New Centroids: [ 95.42043222  83.5862069  120.77708006]
Iteration 2:
Old Centroids: [ 95.42043222  83.5862069  120.77708006]
New Centroids: [ 99.2242268   84.35502959 125.43743642]
Iteration 3:
Old Centroids: [ 99.2242268   84.35502959 125.43743642]
New Centroids: [101.86492891  86.20987654 127.98573127]
Iteration 4:
Old Centroids: [101.86492891  86.20987654 127.98573127]
New Centroids: [104.25647349  88.39215686 129.54078947]
Iteration 5:
Old Centroids: [104.25647349  88.39215686 129.54078947]
New Centroids: [106.1625      89.75227273 131.01453488]
Iteration 6:
Old Centroids: [106.1625      89.75227273 131.01453488]
New Centroids: [107.62181818  90.42474227 132.54854369]
Iteration 7:
Old Centroids: [107.62181818  90.42474227 132.54854369]
New Centroids: [109.39778325  91.51871658 134.02882883]
Iteration 8:
Old Centroids: [109.39778325  91.51871658 134.02882883]
New Centroids: [110.43125     92.16144975 134.87907869]
Iteration 9:
Ol

In [17]:
def load_and_process_data(filepath, percentage):
    try:
        data = pd.read_csv(filepath)
        num_records = int(len(data) * (percentage / 100))
        data = data.head(num_records)

        # Data Preprocessing
        data.drop(columns=["Release Year", "Metascore", "Votes", "Genre", "Gross", "Cast", "Director"], inplace=True)

        # Removing Outliers
        data, outliers = Outliers_IQR(data)

        return data, outliers
    except Exception as e:
        messagebox.showerror("Error", str(e))
        return None, None

def load_data():
    filepath = filedialog.askopenfilename(title="Select File")
    if filepath:
        data_entry.delete(0, tk.END)
        data_entry.insert(0, filepath)

def run_clustering():
    filepath = data_entry.get()
    percentage = percentage_entry.get()
    num_clusters = int(clusters_entry.get())

    try:
        percentage = float(percentage)
        if percentage <= 0 or percentage > 100:
            raise ValueError("Percentage must be between 0 and 100")
    except ValueError:
        messagebox.showerror("Error", "Invalid percentage value")
        return

    data, outliers = load_and_process_data(filepath, percentage)

    if data is None:
        return

    # Use the preprocessed data without outliers
    data_without_outliers, _ = Outliers_IQR(data)
    # print("Data without outliers:")
    # print(data_without_outliers)

    initial_centroids, _ = Initialize_Centroids(data_without_outliers, num_clusters)

    result_text.delete("1.0", tk.END)
    result_text.insert(tk.END, "Initial centroids:\n")
    for centroid in initial_centroids:
        result_text.insert(tk.END, f"- Centroid: {centroid}\n")
    result_text.insert(tk.END, "\n")

    final_centroids, clusters = k_means(data_without_outliers, num_clusters)

    result_text.insert(tk.END, "Final centroids:\n")
    for i, centroid in enumerate(final_centroids):
        result_text.insert(tk.END, f"- Centroid {i+1}: {centroid}\n")
    result_text.insert(tk.END, "\n\n")

    for i, (_, cluster) in enumerate(clusters.groupby('Cluster')):
            movie_list = "\n - ".join(cluster['Movie Names'].iloc[0])
            result_text.insert(tk.END, f"\n\nCluster {i+1} Movies ({len(cluster)} movies) : \n - {movie_list}\n")

    result_text.insert(tk.END, "\nOutliers:\n")
    for index, row in outliers.iterrows():
            result_text.insert(tk.END, f"- {row['Movie Name']}: {row[ "Duration"]}\n")
   
    # total_movies_in_clusters = sum(len(cluster) for _, cluster in clusters.groupby('Cluster'))

    # for i, (_, cluster) in enumerate(clusters.groupby('Cluster')):
    #     movie_list = "\n - ".join(cluster['Movie Names'].iloc[0])
    #     result_text.insert(tk.END, f"Cluster {i+1} Movies ({len(cluster)} movies) : \n - {movie_list}\n")

    # result_text.insert(tk.END, f"\nTotal movies in all clusters: {total_movies_in_clusters}")



# GUI setup
root = tk.Tk()
root.title("K-Means Clustering")

# File Selection
file_frame = tk.Frame(root)
file_frame.pack(pady=10)

tk.Label(file_frame, text="Select File:").pack(side=tk.LEFT)
data_entry = tk.Entry(file_frame, width=50)
data_entry.pack(side=tk.LEFT, padx=10)
tk.Button(file_frame, text="Browse", command=load_data).pack(side=tk.LEFT)

# Percentage of Data
percentage_frame = tk.Frame(root)
percentage_frame.pack(pady=10)

tk.Label(percentage_frame, text="Percentage of Data to Read:").pack(side=tk.LEFT)
percentage_entry = tk.Entry(percentage_frame, width=10)
percentage_entry.pack(side=tk.LEFT, padx=10)
tk.Label(percentage_frame, text="%").pack(side=tk.LEFT)

# Number of Clusters
clusters_frame = tk.Frame(root)
clusters_frame.pack(pady=10)

tk.Label(clusters_frame, text="Number of Clusters (K):").pack(side=tk.LEFT)
clusters_entry = tk.Entry(clusters_frame, width=10)
clusters_entry.pack(side=tk.LEFT, padx=10)

# Run Button
run_button = tk.Button(root, text="Run Clustering", command=run_clustering)
run_button.pack(pady=10)

# Results Display
result_text = tk.Text(root, height=20, width=80)
result_text.pack()

root.mainloop()


