### Dataset and imports

In [None]:
import pandas as pd
import numpy as np
import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources used below. 'punkt_tab' is what word_tokenize loads
# on NLTK >= 3.8.2; 'punkt' is kept for older releases. A resource unknown to
# the installed NLTK version is only reported by nltk.download, it does not raise.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
stop_words = stopwords.words('english')

from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
full_df = pd.read_csv(os.path.join('datasets', 'balancedLength_lexFridman_resto.csv'))
# Concatenate every intervention of a speaker into one string. A space separator
# keeps the last word of one intervention from fusing with the first word of the
# next, which would otherwise produce tokens the tokenizer/model cannot match.
speaker_series = full_df.groupby('speaker')['text'].apply(lambda texts: ' '.join(texts))

In [None]:
# Pull each speaker's concatenated text out of the per-speaker series.
corpus_fridman, corpus_resto = (
    speaker_series[speaker_key] for speaker_key in ("LEXFRIDMAN", "RESTO")
)

In [None]:
# Number of rows (interventions) tagged with each speaker label.
for speaker_label in ('LEXFRIDMAN', 'RESTO'):
    print((full_df['speaker'] == speaker_label).sum())

### COSINE SIMILARITY

Generate random sentences from the dataset

In [None]:
# Draw one random sentence per speaker. Re-run this cell until satisfied with the picks.
fridman_sentences = full_df.loc[full_df['speaker'] == 'LEXFRIDMAN', 'text'].to_list()
resto_sentences = full_df.loc[full_df['speaker'] == 'RESTO', 'text'].to_list()

random_sent_fridman = np.random.choice(fridman_sentences)
random_sent_resto = np.random.choice(resto_sentences)

print("LEXFRIDMAN:", random_sent_fridman)
print("RESTO:", random_sent_resto)

#### Get similarity with Word2Vec

In [None]:
# Preprocessing (the model is assumed to be trained on lowercased text, so
# tokens are lowercased before the stop-word filter and vocabulary lookup)
def preprocess(text):
    """
    Tokenize a text and keep only lowercased, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    text: str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lowercased tokens that are purely alphabetic and not in `stop_words`.
    """
    # The stop-word list is lowercase; a set makes each membership test O(1).
    stop_word_set = set(stop_words)
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    return [
        token for token in lowered_tokens
        if token.isalpha() and token not in stop_word_set
    ]

In [None]:
from gensim.models import Word2Vec

# Pre-trained Word2Vec model, read from a path relative to the working directory.
word2vec_model_path = 'podcast_Word2Vec'
model = Word2Vec.load(word2vec_model_path)

In [None]:
# Turn a text into a single embedding vector
def text_to_vector(text):
    """
    Embed a text as the mean of the Word2Vec vectors of its in-vocabulary tokens.

    Tokens come from `preprocess`; tokens missing from `model.wv` are skipped.
    When no token is in the vocabulary, a zero vector of length
    `model.vector_size` is returned instead.
    """
    keyed_vectors = model.wv
    known_vectors = []
    for token in preprocess(text):
        if token in keyed_vectors:
            known_vectors.append(keyed_vectors[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed each speaker's whole corpus once; these vectors are the reference
# points that individual sentences are compared against.
corpus_fridman_vec, corpus_resto_vec = (
    text_to_vector(speaker_corpus) for speaker_corpus in (corpus_fridman, corpus_resto)
)

In [None]:
def who_said_this(sentence : str, corpus_fridman_vec, corpus_resto_vec):
    """
    Attribute a sentence to the speaker whose corpus embedding it is closest to.

    The sentence is embedded with `text_to_vector` and compared, by cosine
    similarity, against both corpus vectors. Returns 'LEXFRIDMAN' only when the
    Fridman similarity is strictly greater; ties fall through to 'RESTO'.
    """
    # cosine_similarity expects 2-D inputs, hence the (1, n_features) reshapes.
    query_vec = text_to_vector(sentence).reshape(1, -1)

    # Each score is a 1x1 array; comparing them yields a single boolean.
    fridman_score = cosine_similarity(corpus_fridman_vec.reshape(1, -1), query_vec)
    resto_score = cosine_similarity(corpus_resto_vec.reshape(1, -1), query_vec)

    if fridman_score > resto_score:
        return 'LEXFRIDMAN'
    return 'RESTO'

In [None]:
# Sanity check on the two sampled sentences: the first one was said by Fridman
# (expected output LEXFRIDMAN), the second by one of his guests (expected RESTO).
for sampled_sentence in (random_sent_fridman, random_sent_resto):
    print(who_said_this(sampled_sentence, corpus_fridman_vec, corpus_resto_vec))

In [None]:
def measure_predictive_capability(
        full_df: pd.DataFrame, 
        corpus_fridman_vec: np.ndarray, 
        corpus_resto_vec: np.ndarray,
        labels=('LEXFRIDMAN', 'RESTO')
):

    """
    Create a confusion matrix to determine how good the predictions are with cosine similarity.

    Every row's text is classified with `who_said_this` and compared with the
    row's tagged speaker.

    Parameters
    -------------------------------------------------------------------------------------------
    full_df: pd.DataFrame
        The original dataframe with tagged speaker interventions
        (reads the 'text' and 'speaker' columns)
    corpus_fridman_vec: np.ndarray
        Embedding vector representing all the text said by Lex Fridman
    corpus_resto_vec: np.ndarray
        Embedding vector representing all the text said by Fridman's guests
    labels: sequence of str
        Row/column order of the returned matrix. Fixed by default so the
        orientation does not depend on which speaker happens to appear first
        in `full_df`.

    Returns
    -------------------------------------------------------------------------------------------
    np.ndarray
        Confusion matrix with true labels on the rows and predicted labels on
        the columns, both ordered as `labels`.
    """

    true_labels = full_df['speaker'].tolist()

    # Iterate over the text column directly; iterrows() builds a Series per row.
    predicted_labels = [
        who_said_this(text, corpus_fridman_vec, corpus_resto_vec)
        for text in full_df['text']
    ]

    return confusion_matrix(true_labels, predicted_labels, labels=list(labels))

In [None]:
def calculate_metrics(confusion_matrix: np.ndarray):

    """
    Function that calculates various metrics given a binary confusion matrix

    Parameters
    -----------------------------------------------------------------
    confusion_matrix: np.ndarray
        2x2 confusion matrix (any array-like is accepted). Its flattened,
        row-major cells are read as TN, FP, FN, TP.

    Returns
    -----------------------------------------------------------------
    tuple
        (precision, recall, accuracy, f1_score). Precision, recall and
        accuracy are percentages (0-100); f1_score stays on the 0-1 scale.
        Any metric whose denominator is zero is reported as 0.

    Raises
    -----------------------------------------------------------------
    ValueError
        If the matrix does not contain exactly four cells.
    """

    matrix = np.asarray(confusion_matrix)
    if matrix.size != 4:
        raise ValueError(
            f"Expected a 2x2 confusion matrix, got an array of shape {matrix.shape}"
        )

    # Unpack the confusion matrix
    TN, FP, FN, TP = matrix.ravel()
    total = TN + FP + FN + TP

    # Metrics
    # important: careful dividing by zero!
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    # An all-zero matrix would otherwise yield NaN and a runtime warning.
    accuracy = (TP + TN) / total if total != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    return precision * 100, recall * 100, accuracy * 100, f1_score

In [None]:
# Build the confusion matrix over the whole dataset and show it
conf_matrix = measure_predictive_capability(full_df, corpus_fridman_vec, corpus_resto_vec)
print(conf_matrix)

print('----------------------')

# Derive the summary metrics from the matrix
precision, recall, accuracy, f1_score = calculate_metrics(conf_matrix)

# Precision, recall and accuracy are percentages; F1 is reported on its own scale
percentage_metrics = (("Precision", precision), ("Recall", recall), ("Accuracy", accuracy))
for metric_name, metric_value in percentage_metrics:
    print(f"{metric_name}: {metric_value:.2f}%")
print(f"F1 Score: {f1_score:.2f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tick labels for the heatmap; they must be in the same order as the rows and
# columns of conf_matrix.
labels = ['LEXFRIDMAN', 'RESTO']
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', cbar=False, xticklabels=labels, yticklabels=labels)
# sklearn's confusion matrix has true labels on the rows (y axis) and predicted
# labels on the columns (x axis); label the axes so the figure stands alone.
plt.xlabel('Predicted speaker')
plt.ylabel('True speaker')
plt.title('Confusion Matrix', fontsize=15)
plt.show()
