## Section 1: Importing Libraries and Loading Data

We will start by importing the necessary libraries and loading our dataset. This dataset contains information about chemistry YouTube videos.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from umap import UMAP
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import MinMaxScaler
import openai
from dotenv import load_dotenv
import os
from scipy.spatial import ConvexHull, distance
from sklearn.metrics.pairwise import euclidean_distances

# Read the OpenAI credential from the environment (a local .env file is
# honoured) so the key never has to be written into the notebook itself.
load_dotenv()
openai.api_key = api_key = os.getenv("apikey")

# Read the dataset from disk
data = pd.read_csv('chemistry2.csv')

# Preview the top of the table
print("First few rows of the dataset:", data.head(), sep="\n")


## Section 2: Embedding Generation

We will use OpenAI's API to generate embeddings for the text data. Embeddings are vector representations of text that capture semantic relationships.


In [None]:
def get_embeddings_batch(inputs, model="text-embedding-3-large"):
    """Return one embedding vector per input text.

    Each text is sent to ``openai.Embedding.create`` in its own request and
    the first embedding of every response is collected.

    NOTE(review): ``openai.Embedding`` looks like the pre-1.0 client
    interface -- confirm against the installed ``openai`` version.

    Args:
        inputs: Iterable of texts to embed.
        model: Name of the embedding model passed to the API. Defaults to
            the model this notebook was written against.

    Returns:
        A NumPy array built from the per-text embedding vectors, in input
        order. An empty ``inputs`` yields an empty array.
    """
    return np.array([
        np.array(
            openai.Embedding.create(input=text, model=model)['data'][0]['embedding']
        )
        for text in inputs
    ])

# Generate embeddings for the text data.
# Missing cells are replaced with empty strings first: concatenating NaN with
# a string yields NaN, so a single missing field would otherwise turn the
# whole row's text into NaN before it is sent to the embeddings API.
text_columns = data[['Title', 'Description', 'Transcript']].fillna('').astype(str)
final_array = (
    text_columns['Title'] + " " + text_columns['Description'] + " " + text_columns['Transcript']
)
embeddings = get_embeddings_batch(final_array.tolist())


## Section 3: Cosine Similarity

Cosine similarity is used to measure the similarity between two vectors. It is commonly used in text analysis to compare document embeddings.


In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The similarity is the dot product of the two vectors divided by the
    product of their Euclidean norms.

    Args:
        a: First vector (array-like).
        b: Second vector (array-like), same length as ``a``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm
        (the ratio is undefined there and would otherwise evaluate to NaN).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the 0/0 case: a zero vector has no direction to compare.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

# Demo: compare the first two embedding vectors with each other
first_vec = embeddings[0]
second_vec = embeddings[1]
similarity = cosine_similarity(first_vec, second_vec)
print(f"Cosine Similarity between first two embeddings: {similarity}")


## Section 4: Data Normalization

We will min-max scale the view counts to the range 5–60 and the transcript lengths (measured in words) to the range 1–10, so that each quantity is mapped onto a bounded scale.


In [None]:
# Normalize view counts.
# Going through ``str`` first lets this cell run whether the column holds
# comma-formatted strings or already-parsed numbers (e.g. when the cell is
# re-run after ``ViewCount`` has been overwritten with floats), where the
# ``.str`` accessor would otherwise raise.
data['ViewCount'] = (
    data['ViewCount'].astype(str).str.replace(',', '', regex=False).astype(float)
)
scaler_view = MinMaxScaler(feature_range=(5, 60))
data['NormalizedViewCount'] = scaler_view.fit_transform(data[['ViewCount']])

# Normalize transcript lengths (length = number of whitespace-separated words).
# A missing transcript counts as 0 words instead of being stringified to
# "nan" and counted as a single word.
data['TranscriptLength'] = data['Transcript'].apply(
    lambda text: 0 if pd.isna(text) else len(str(text).split())
)
scaler_transcript = MinMaxScaler(feature_range=(1, 10))
data['NormalizedTranscriptLength'] = scaler_transcript.fit_transform(data[['TranscriptLength']])

# Summary of normalized values
print("Summary of Normalized View Counts:")
print(data['NormalizedViewCount'].describe())
print("\nSummary of Normalized Transcript Lengths:")
print(data['NormalizedTranscriptLength'].describe())


## Section 5: UMAP Visualization

UMAP (Uniform Manifold Approximation and Projection) is a technique used for dimensionality reduction and visualization. We will use UMAP to visualize the embeddings in 2D space.


In [None]:
# Project the embeddings onto a 2D plane with UMAP (cosine metric)
umap_model = UMAP(metric='cosine', n_components=2, n_neighbors=15, min_dist=0.1)
umap_coords = umap_model.fit_transform(embeddings)

# Keep the two projected axes alongside the rest of the video data
data['x'], data['y'] = umap_coords[:, 0], umap_coords[:, 1]

# Scatter plot of the projection, coloured by normalized view count
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(
    data['x'],
    data['y'],
    c=data['NormalizedViewCount'],
    cmap='viridis',
    s=50,
    alpha=0.7,
)
fig.colorbar(points, ax=ax, label='Normalized View Count')
ax.set_title('UMAP Projection of Embeddings')
ax.set_xlabel('UMAP x-coordinate')
ax.set_ylabel('UMAP y-coordinate')
plt.show()


## Section 6: K-means Clustering

K-means clustering is an unsupervised learning algorithm used to group data points into clusters. We will apply K-means clustering to the embeddings and visualize the results.


In [None]:
# Fit K-means (5 clusters, fixed seed) on the embeddings and keep each
# row's cluster label in the dataframe
kmeans = KMeans(random_state=42, n_clusters=5).fit(embeddings)
data['cluster'] = kmeans.labels_

# Redraw the UMAP projection, this time coloured by cluster label
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(
    data['x'],
    data['y'],
    c=data['cluster'],
    cmap='tab10',
    s=50,
    alpha=0.7,
)
fig.colorbar(points, ax=ax, label='Cluster')
ax.set_title('UMAP Projection with K-means Clusters')
ax.set_xlabel('UMAP x-coordinate')
ax.set_ylabel('UMAP y-coordinate')
plt.show()


## Section 7: Minimum Spanning Tree (MST)

A Minimum Spanning Tree (MST) connects all points in a graph with the minimum possible total edge weight. We will build an MST for each cluster from the 2D UMAP coordinates and draw it on top of the projection.


In [None]:
# Create MST for each cluster
def create_minimum_spanning_tree(cluster_data):
    """Build the minimum spanning tree over one cluster's 2D points.

    Edge weights are the pairwise Euclidean distances between the rows'
    ``x``/``y`` coordinates.

    Args:
        cluster_data: DataFrame with ``x`` and ``y`` columns.

    Returns:
        The ``nonzero()`` result of the MST matrix (row indices and column
        indices of its edges, as positions within ``cluster_data``), or
        ``None`` when there are fewer than two points.
    """
    points = cluster_data[['x', 'y']].values
    # A tree needs at least two nodes; nothing to connect otherwise.
    if len(points) <= 1:
        return None
    return minimum_spanning_tree(euclidean_distances(points)).nonzero()

# Draw every cluster's points and its MST edges on a single UMAP figure
plt.figure(figsize=(15, 10))
unique_clusters = data['cluster'].unique()
colors = plt.cm.tab10(np.linspace(0, 1, len(unique_clusters)))

for cluster_id, color in zip(unique_clusters, colors):
    cluster_data = data[data['cluster'] == cluster_id]
    # Pull the coordinates out once; MST edge endpoints are positional
    # indices into these arrays.
    xs = cluster_data['x'].to_numpy()
    ys = cluster_data['y'].to_numpy()
    plt.scatter(xs, ys, c=[color], s=50, alpha=0.7, label=f'Cluster {cluster_id}')

    edges = create_minimum_spanning_tree(cluster_data)
    if edges is None:
        # Fewer than two points in this cluster: no edges to draw
        continue
    for start, end in zip(*edges):
        plt.plot(
            [xs[start], xs[end]],
            [ys[start], ys[end]],
            c=color,
            alpha=0.5,
            linewidth=1.5,
        )

plt.title('UMAP Projection with K-means Clusters and MST')
plt.xlabel('UMAP x-coordinate')
plt.ylabel('UMAP y-coordinate')
plt.legend()
plt.show()
