In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from time import time

In [None]:
# Directory (relative to this notebook) that holds the processed dataset.
INPUT_DATA_DIR = '../../data/processed/'
# Name of the pickle file inside INPUT_DATA_DIR that is read with pd.read_pickle.
INPUT_FILE_NAME = 'squashed_processed_data.pkl'

In [None]:
# Load the processed dataset.
# NOTE: unpickling can execute arbitrary code -- only read pickle files that
# come from a trusted source.
df = pd.read_pickle(INPUT_DATA_DIR + INPUT_FILE_NAME)
# Preview the first rows as a quick sanity check of the load.
df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features of the transcripts: English stop words removed, unigrams
# only, and terms present in more than 30% of the documents dropped.
# min_df is left at its default (a 0.05 cut-off, i.e. dropping words present
# in fewer than 5% of the documents, is not applied).
vectorizer = TfidfVectorizer(
    stop_words="english",
    use_idf=True,
    ngram_range=(1, 1),
    max_df=0.3,
)

# Time the fit so the cost of this cell is visible in its output.
t0 = time()
tfidf = vectorizer.fit_transform(df['transcript'])
print("done in {:.3f}s.".format(time() - t0))

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 30
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)

# Fit the topic model and keep the document-topic matrix it returns.
topics = lda.fit_transform(tfidf)

# Number of strongest words kept per topic.
top_n_words = 5

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older installations.
# The vocabulary is fetched once instead of once per looked-up word.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# t_words[topic_id]        -> the top_n_words strongest words, strongest first
# word_strengths[topic_id] -> the matching weights taken from lda.components_
t_words, word_strengths = {}, {}
for t_id, t in enumerate(lda.components_):
    # Indices of the largest weights in descending order (argsort is ascending).
    top_idx = t.argsort()[:-top_n_words - 1:-1]
    t_words[t_id] = [str(feature_names[i]) for i in top_idx]
    word_strengths[t_id] = t[top_idx]
t_words

In [None]:
# Bar charts of the strongest words of the first ten topics on a 5 x 2 grid.
fig, ax = plt.subplots(figsize=(7, 15), ncols=2, nrows=5)
plt.subplots_adjust(wspace=0.5, hspace=0.5)

# ax.flat walks the grid row by row, i.e. in the same order as nested
# row/column loops would.
for topic_id, panel in enumerate(ax.flat):
    if topic_id not in t_words:
        # Fewer topics than panels: hide the spare axes instead of raising KeyError.
        panel.set_visible(False)
        continue
    sns.barplot(x=word_strengths[topic_id], y=t_words[topic_id], color="red", ax=panel)
    # Label every panel so it can be matched with the topic ids used in t_words.
    panel.set_title("Topic %d" % topic_id)
    panel.set_xlabel("LDA component weight")
plt.show()

In [None]:
from sklearn.pipeline import Pipeline

# Chain the fitted vectorizer and the fitted LDA model so that a document can
# be turned into its topic weights with a single transform() call.
pipe = Pipeline(steps=[
    ('tfidf', vectorizer),
    ('lda', lda),
])

def takeSecond(elem):
    """Sort key: return the item at index 1 of ``elem``."""
    second = elem[1]
    return second

In [None]:
def predict_best_topics_processed_string(string):
    """Rank the topics that are relevant to a single piece of text.

    The text is passed through the notebook-level ``pipe``; every topic whose
    weight is greater than 0.01 is kept.

    Parameters
    ----------
    string : the text to score (presumably preprocessed the same way as the
        transcripts the pipeline was fitted on -- TODO confirm).

    Returns
    -------
    list of ``(topic_id, weight)`` pairs ordered by descending weight.
    """
    # transform() is given exactly one document, so only row 0 exists.
    doc_weights = pipe.transform([string])[0]
    kept_ids = np.flatnonzero(doc_weights > 0.01)

    ranked = sorted(
        zip(kept_ids, doc_weights[kept_ids]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked

def predict_best_topics_doc(df, did):
    """Rank the relevant topics for the transcript at row position ``did`` of ``df``.

    The transcript is read positionally (``.iloc``) from the ``transcript``
    column and handed to ``predict_best_topics_processed_string``, whose
    result is returned unchanged.
    """
    return predict_best_topics_processed_string(df['transcript'].iloc[did])

In [None]:
def show_best_topics_doc(did):
    """Print the relevant topics, a transcript excerpt and the tags of one document.

    Parameters
    ----------
    did : positional row number of the document in the notebook-level ``df``.

    Prints the top words of every relevant topic (most relevant first), the
    first 500 characters of the transcript and the document's ``tags`` entry.
    Returns nothing.
    """
    print('For document #'+str(did)+'...')

    # predict_best_topics_doc needs the frame as well as the row position.
    rev_topscore = predict_best_topics_doc(df, did)

    print('\nAll relevant topics (in order of descending relevance): \n')
    for entry in rev_topscore:
        # entry is a (topic_id, weight) pair; show that topic's top words.
        print(t_words[entry[0]])

    # Use the function argument; no other document id is defined here.
    print('\nTranscript:\n',df['transcript'].iloc[did][:500],'...')
    print('\nTrue tags from ted_main.csv: \n',df['tags'].iloc[did])
    
def show_best_topics_string(string):
    """Print the top words of every topic that is relevant to ``string``.

    Topics come from ``predict_best_topics_processed_string`` and are printed
    in the order it returns them (most relevant first). Returns nothing.
    """
    ranked_topics = predict_best_topics_processed_string(string)

    print('\nAll relevant topics (in order of descending relevance): \n')
    for entry in ranked_topics:
        # entry[0] is the topic id; t_words maps it to that topic's top words.
        print(t_words[entry[0]])

In [None]:
def predict_tags(rev_topscore, threshold):
    """Turn ranked topics into a list of tag words.

    Every top word of every listed topic is scored as
    ``word weight * topic weight``; words are ordered by descending score and
    those scoring above ``threshold`` are returned.

    Parameters
    ----------
    rev_topscore : list of ``(topic_id, topic_weight)`` pairs.
    threshold : a word is kept only if its combined score is strictly greater.

    Returns
    -------
    list of words (a word may repeat when it belongs to several topics), or
    ``None`` -- after printing a message -- when ``rev_topscore`` is ``[]``.
    """
    if rev_topscore == []:
        print('No relevant topics, unable to predict tags')
        return None

    scored_words = []
    for entry in rev_topscore:
        topic_id, topic_weight = entry[0], entry[1]
        # Pair each top word of the topic with its weight scaled by the
        # relevance of the topic to the document.
        scored_words.extend(
            (word, weight * topic_weight)
            for word, weight in zip(t_words[topic_id], word_strengths[topic_id])
        )

    ranked = sorted(scored_words, key=lambda pair: pair[1], reverse=True)
    return [word for word, combined in ranked if combined > threshold]

In [None]:
def predict_accuracy(df, threshold=0, squash_tags=None):
    """Print how well the predicted tags match the reference tags of ``df``.

    Rows are visited by position. For each row the tags predicted from the
    transcript are compared with those entries of the row's ``tags`` cell that
    belong to the reference vocabulary.

    Parameters
    ----------
    df : frame with ``transcript`` and ``tags`` columns. Each ``tags`` cell is
        iterated label by label (assumed to be a list of strings -- TODO
        confirm against the preprocessing step).
    threshold : minimum combined score forwarded to ``predict_tags``.
        Defaults to 0, the value that was previously hard-coded.
    squash_tags : iterable of reference tags to score against. Defaults to the
        notebook-level ``squash_list``, which must then already be defined.

    Prints
    ------
    * a progress counter every 100 rows,
    * whether every reference tag occurs at least once in ``df``,
    * "Taggable": rows with at least one correctly predicted tag,
    * "Tag count": reference tags found across all rows,
    * "Percentage tagged": Taggable / number of rows,
    * "Percentage correct (tag_count)": correct predictions / Tag count.

    Ratios are reported as ``nan`` when their denominator is zero.
    Returns nothing.
    """
    if squash_tags is None:
        squash_tags = squash_list
    # A set gives O(1) membership tests and never mutates the caller's list.
    vocabulary = set(squash_tags)

    total = len(df.index)
    correct = 0
    tag_count = 0
    taggable = 0
    seen_tags = set()

    for tr in range(total):
        if tr % 100 == 0:
            print(tr)  # coarse progress indicator

        predicted_tags = predict_tags(predict_best_topics_doc(df, tr), threshold)
        # Positional access, matching the .iloc lookup used for the transcript;
        # label-based df['tags'][tr] breaks when the index is not 0..n-1.
        given_tags = df['tags'].iloc[tr]

        # Reference tags of this row that are part of the vocabulary.
        squash = [label for label in given_tags if label in vocabulary]
        tag_count += len(squash)
        seen_tags.update(squash)

        # predict_tags returns None when no topic is relevant. Deduplicate so
        # a word emitted by several topics is not counted as several hits.
        hits = set(predicted_tags or ()) & set(squash)
        correct += len(hits)
        if hits:
            taggable += 1

    # list.sort() returns None, so comparing two .sort() calls is always True
    # (and reorders the lists); compare the sets of tags instead.
    print(seen_tags == vocabulary)
    print('Taggable:', taggable)
    print('Tag count:', tag_count)
    print('Percentage tagged:', taggable / total if total else float('nan'))
    percentage_correct_tag_count = correct / tag_count if tag_count else float('nan')
    print('Percentage correct (tag_count):', percentage_correct_tag_count)


In [None]:
def predict_tags_input(threshold):
    """Read one line from standard input and print the tags predicted for it.

    ``threshold`` is forwarded to ``predict_tags``. Returns nothing.
    """
    ranked_topics = predict_best_topics_processed_string(input())
    print(predict_tags(ranked_topics, threshold))

In [None]:
# Interactive demo: read one line of text from the user and print the top
# words of every topic that is relevant to it.
your_input = input()
show_best_topics_string(your_input)