# Summarization of Podcast Transcript

In [None]:
# Input text for the notebook. The Google Drive loading code below is kept commented
# out and a short inline sample is assigned to `text` instead.
# NOTE(review): later cells still open files under /content/drive/MyDrive/ (the GloVe
# vectors and the reference summary) — presumably Drive must be mounted first; confirm.
# NOTE(review): the saved outputs further down do not contain sentences from this
# sample, so they appear to come from a run on the full podcast transcript — confirm.
#from google.colab import drive
#drive.mount('/content/drive')
#with open("/content/drive/MyDrive/podcast_text.txt", "r") as file:
 #   text = file.read()
#print(len(text))
text = "Introducing yourself to someone in a professional setting can be tricky, especially when facing an interview. Irrespective of your qualifications and experience, your way of self-introduction during an interview carries much weight when it comes to making a strong impression.As soon as you enter the room, exchange pleasantries and introduce yourself by saying your name. Keep this introduction short and concise before you go into detail when the interview starts."
text

'Introducing yourself to someone in a professional setting can be tricky, especially when facing an interview. Irrespective of your qualifications and experience, your way of self-introduction during an interview carries much weight when it comes to making a strong impression.As soon as you enter the room, exchange pleasantries and introduce yourself by saying your name. Keep this introduction short and concise before you go into detail when the interview starts.'

In [None]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.probability import FreqDist
from nltk.cluster.util import cosine_distance

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## TF-IDF Word Embedding

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
def preprocess_text(text):
  """Lower-case and tokenize `text`, keeping only alphabetic non-stopword tokens.

  Args:
    text: Raw input string.

  Returns:
    List of tokens, in their original order, that consist solely of
    alphabetic characters and are not in NLTK's English stopword list.
  """
  english_stopwords = set(stopwords.words('english'))
  kept = []
  for word in word_tokenize(text.lower()):
    # Punctuation and numeric tokens fail isalpha() and are dropped here.
    if word.isalpha() and word not in english_stopwords:
      kept.append(word)
  return kept

In [None]:
def get_tfidf_scores(text):
  """Map each term of `text` to its TF-IDF weight.

  The vectorizer is fitted on `text` alone, i.e. on a one-document corpus.

  Args:
    text: String to vectorize.

  Returns:
    Dict of term -> weight taken from the single row of the fitted matrix,
    or an empty dict when vectorizing raises ValueError (scikit-learn does
    this, for example, when no usable term is left in the text).
  """
  tfidf = TfidfVectorizer(min_df=1)
  try:
    weights = tfidf.fit_transform([text]).toarray()[0]
    terms = tfidf.get_feature_names_out()
    return {term: weight for term, weight in zip(terms, weights)}
  except ValueError:
    return {}

In [None]:
def summarize(text, num_sentences):
    """Build an extractive summary from the highest-scoring sentences of `text`.

    The text is split on '.', each sentence is preprocessed and scored with the
    mean of its per-term weights from `get_tfidf_scores`, and the best
    `num_sentences` sentences are joined with ". ".

    Args:
        text: Document to summarize.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences in descending score order, joined by ". "
        (no trailing period is appended). Empty string if nothing is scorable.
    """
    # NOTE(review): get_tfidf_scores fits its vectorizer on one sentence at a
    # time, so no cross-sentence IDF is involved; the mean weight presumably
    # favours very short sentences — consider scoring against all sentences.
    scored_sentences = []
    for raw_sentence in text.split("."):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        term_scores = get_tfidf_scores(" ".join(preprocess_text(sentence)))
        if not term_scores:
            # Nothing scorable: drop the sentence together with its (missing)
            # score so that sentences and scores can never get out of step.
            continue
        mean_score = sum(term_scores.values()) / len(term_scores)
        scored_sentences.append((sentence, mean_score))

    top_sentences = sorted(
        scored_sentences, key=lambda pair: pair[1], reverse=True
    )[:num_sentences]
    return ". ".join(sentence for sentence, _ in top_sentences)

In [None]:
# TF-IDF summary: keep at most 20 of the highest-scoring sentences.
summary = summarize(text, num_sentences=20)
summary

" A delight to be here.  You're just overwhelmed by all of this.  You just give up.  You don't have any time at all.  And it's 80% shorter.  And to me, that's an example.  I expect that.  Yeah.  Right.  Yeah.  Yeah.  At 8 o'clock exactly, I was in there and I was out of there by 815.  We all know this.  You had to wait.  Think of that.  Exactly.  Yeah.  Beautiful.  Yeah.  Right"

## Word2Vec Word Embedding

In [None]:
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Train a small Word2Vec model on the document's own tokens.
preprocessed_text = preprocess_text(text)
word2vec_model = Word2Vec(
    [preprocessed_text],  # the whole document is passed as a single training sentence
    min_count=1,          # keep every token, even those that occur only once
    # Fixed seed + single worker so repeated runs give the same vectors (and
    # therefore the same summary). Reproducibility across interpreter launches
    # additionally requires PYTHONHASHSEED to be set, per the gensim docs.
    workers=1,
    seed=42,
)

In [None]:
def sentence_similarity(sentence1, sentence2):
    """Cosine similarity between the mean Word2Vec vectors of two texts.

    Both inputs are preprocessed with `preprocess_text`; tokens missing from
    `word2vec_model`'s vocabulary are ignored on both sides.

    NOTE(review): a later cell defines another `sentence_similarity` (GloVe
    based) that replaces this one once it runs — consider distinct names.

    Args:
        sentence1: First text.
        sentence2: Second text.

    Returns:
        The cosine similarity, or 0.0 when either text has no token with a
        vector (instead of failing with a division by zero or a KeyError).
    """
    def _mean_vector(sentence):
        # Average of the vectors of all in-vocabulary tokens, or None if none.
        vectors = [
            word2vec_model.wv[word]
            for word in preprocess_text(sentence)
            if word in word2vec_model.wv
        ]
        if not vectors:
            return None
        return sum(vectors) / len(vectors)

    s1_vector = _mean_vector(sentence1)
    s2_vector = _mean_vector(sentence2)
    if s1_vector is None or s2_vector is None:
        return 0.0
    return cosine_similarity([s1_vector], [s2_vector])[0][0]

In [None]:
def extract_text_summary(text, num_sentences):
    """Pick the sentences of `text` most similar to the text as a whole.

    Args:
        text: Document to summarize.
        num_sentences: Maximum number of sentences to return.

    Returns:
        List of up to `num_sentences` stripped sentences, ordered by
        descending `sentence_similarity(text, sentence)`.
    """
    # Raw string: in a normal literal '\.' is an invalid escape sequence, which
    # newer Python versions warn about. The pattern itself is unchanged.
    # NOTE(review): '?' and '!' are not delimiters, so questions stay attached
    # to the following sentence — confirm whether that is intended.
    fragments = re.split(r'\.|;|\n', text)
    sentences = [fragment.strip() for fragment in fragments if fragment.strip()]
    scores = [(sentence, sentence_similarity(text, sentence)) for sentence in sentences]
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return [sentence for sentence, _ in scores[:num_sentences]]

In [None]:
# Word2Vec summary: the 10 best-ranked sentences, joined with single spaces.
summary = extract_text_summary(text, 10)
summary1 = " ".join(summary)
summary1

"And that's one of the things that we like to think about friction is where can you put in good friction to stop bad friction? And, you know, so those solutions sort of adding one question, limiting it to four interviews, but still having four, not one, seem like they're finding this perfect balance of just enough friction, right? You know, you're not adding too much so that it becomes bad The minor but key wrinkle I'd like to touch on briefly, Alison, is if you're a middle manager, if you're the average Joe, even the top manager, it's easy to kind of think of the task as where do I take bad friction out? Instead, what we find is it's much better to focus on what the consequences and the consequences, giving employees the gift of time And to us, there's sort of a duality of friction fixers work 5 million people a year who want things like health insurance and food and stuff like that What about the role of the middle manager? You know, do you see adding and subtracting friction as a bi

## GloVe Word Embedding

In [None]:
import numpy as np
from gensim.models import KeyedVectors

In [None]:
def load_glove_model(file_path):
    """Load word vectors from a whitespace-delimited text file into a dict.

    Each usable line holds a token followed by its vector components, all
    separated by whitespace.

    Args:
        file_path: Path to the UTF-8 encoded vectors file.

    Returns:
        Dict mapping each token to a float32 numpy array of its components.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    model = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Skip blank or vector-less lines (e.g. a stray empty line) rather
            # than failing on values[0] or storing an empty vector.
            if len(values) < 2:
                continue
            word = values[0]
            model[word] = np.array(values[1:], dtype='float32')
    return model

# Load the GloVe vectors into a dict of word -> vector.
# NOTE(review): this path is under /content/drive, but the drive.mount call at the
# top of the notebook is commented out — presumably Drive must be mounted first; confirm.
glove_model = load_glove_model(r"/content/drive/MyDrive/glove.6B.50d.txt")


In [None]:
def sentence_embedding(sentence):
    """Average the GloVe vectors of a sentence's preprocessed tokens.

    Args:
        sentence: Text to embed.

    Returns:
        Element-wise mean of the vectors of all tokens found in `glove_model`,
        or None when no token has a vector.
    """
    vectors = []
    for token in preprocess_text(sentence):
        if token in glove_model:
            vectors.append(glove_model[token])
    return np.mean(vectors, axis=0) if vectors else None

In [None]:
def sentence_similarity(sentence1, sentence2):
    """Cosine similarity between the GloVe embeddings of two texts.

    NOTE(review): this reuses the name of the Word2Vec-based function defined
    in an earlier cell and replaces it once this cell runs.

    Args:
        sentence1: First text.
        sentence2: Second text.

    Returns:
        The cosine similarity of the two `sentence_embedding` vectors, or 0.0
        if either text has no embedding.
    """
    first = sentence_embedding(sentence1)
    second = sentence_embedding(sentence2)
    # Guard clause: without both embeddings there is nothing to compare.
    if first is None or second is None:
        return 0.0
    return cosine_similarity([first], [second])[0][0]

In [None]:
def extract_text_summary1(text, num_sentences):
    """Pick the sentences of `text` most similar to the text as a whole.

    Args:
        text: Document to summarize.
        num_sentences: Maximum number of sentences to return.

    Returns:
        List of up to `num_sentences` stripped sentences, ordered by
        descending `sentence_similarity(text, sentence)`.
    """
    # Raw string: in a normal literal '\.' is an invalid escape sequence, which
    # newer Python versions warn about. The pattern itself is unchanged.
    fragments = re.split(r'\.|;|\n', text)
    sentences = [fragment.strip() for fragment in fragments if fragment.strip()]
    scores = [(sentence, sentence_similarity(text, sentence)) for sentence in sentences]
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return [sentence for sentence, _ in scores[:num_sentences]]

In [None]:
# GloVe summary: the 10 best-ranked sentences, joined with single spaces.
summary = extract_text_summary1(text, 10)
summary2 = " ".join(summary)
summary2

"The minor but key wrinkle I'd like to touch on briefly, Alison, is if you're a middle manager, if you're the average Joe, even the top manager, it's easy to kind of think of the task as where do I take bad friction out? Instead, what we find is it's much better to focus on what the consequences and the consequences, giving employees the gift of time And that's one of the things that we like to think about friction is where can you put in good friction to stop bad friction? And, you know, so those solutions sort of adding one question, limiting it to four interviews, but still having four, not one, seem like they're finding this perfect balance of just enough friction, right? You know, you're not adding too much so that it becomes bad And this might have made sense when they were hiring the first 100 people, 200 people, the company they were going to build them with What about the role of the middle manager? You know, do you see adding and subtracting friction as a big part of that job

In [None]:
# Scorer that reports the 'rouge1' and 'rougeL' metrics; use_stemmer=True is
# passed so that matching is done on stemmed tokens (per the rouge_score docs).
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

In [None]:
# Read the reference summary that the ROUGE cell compares against, and print
# its length in characters.
# NOTE(review): the path is under /content/drive — presumably requires Drive
# to be mounted; confirm.
with open("/content/drive/MyDrive/reff.txt", "r") as file:
    ref = file.read()
print(len(ref))

3819


ROUGE score - GloVe Embedding

In [None]:
# Score the GloVe summary (`summary2`) against the reference text.
# RougeScorer.score takes the reference (target) first and the candidate
# (prediction) second; the order decides which side precision and recall
# are measured against.
scores = scorer.score(ref, summary2)
scores

{'rouge1': Score(precision=0.38173652694610777, recall=0.6891891891891891, fmeasure=0.49132947976878605),
 'rougeL': Score(precision=0.15269461077844312, recall=0.2756756756756757, fmeasure=0.19653179190751446)}