In [1]:
import sklearn
import pandas as pd
import numpy as np
import nltk
import wordcloud
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.ensemble import  VotingClassifier
import pickle
from Python_Functions.NLP_Model_Functions import stopwords_generator, lemmatize_stemming, TF_whole_text, TF_IDF_text_doc, topic_extract, \
                          lemmatize_preprocess, text_cleaning_sub, text_cleaning, stop_words

## Apply Supervised Learning on the Topics Modeling - Main Topic Classification

1. Word freq under a certain topics: calculate the ratio of a words appears among all the words in a documents of topics

2. Topic Classification with Niave bayes


In [2]:
refer_model = pickle.load(open("models/trained_models/NaiveBeyasClassifier_ReferenceText.pckl","rb"))

In [3]:
classfier_mdl = refer_model[0]
vectorizer_train = refer_model[1]

In [4]:
df = pd.read_csv('ready_for_model_dataset/df_test.csv').drop(['Unnamed: 0'], axis = 1)
df.head()


Unnamed: 0,Title,Video_ID,Category,Age_Restricted,final_corrected_version_sentences_txt,final_corrected_version_txt,sentence_level_timstamp_min_sec,sentence_level_timstamp_max_sec,sentence_level_timstamp_min_minute,sentence_level_timstamp_max_minute,duration,Length_(min),Views_(thous),main_topics,reference_text,reference_text_cleaned,coverage_quality
0,Children Learn About The Ant,cXUCUvcscXs,Ant,False,welcome to my tree house come and look,welcome to my tree house come and look. at thi...,0.56,4.08,0.009333,0.068,3.52,5.333333,921.491,insect_ant,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium
1,Children Learn About The Ant,cXUCUvcscXs,Ant,False,at this,welcome to my tree house come and look. at thi...,2.56,6.08,0.042667,0.101333,3.52,5.333333,921.491,insect_ant,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium
2,Children Learn About The Ant,cXUCUvcscXs,Ant,False,this is my ant farm,welcome to my tree house come and look. at thi...,4.08,8.639,0.068,0.143983,4.559,5.333333,921.491,insect_ant,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium
3,Children Learn About The Ant,cXUCUvcscXs,Ant,False,ants are amazing creatures lots of,welcome to my tree house come and look. at thi...,6.08,10.559,0.101333,0.175983,4.479,5.333333,921.491,insect_ant,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium
4,Children Learn About The Ant,cXUCUvcscXs,Ant,False,insects live by themselves and have to,welcome to my tree house come and look. at thi...,8.639,12.88,0.143983,0.214667,4.241,5.333333,921.491,insect_ant,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium


In [5]:
# encode the classes to predicted
df['main_topics_coded'] = df.main_topics.map({'bird_tailorbird' : 0, 'insect_ant': 1, 'bird_peacock' :2, 'insect_beetle' : 3,
       'bird_flamingo': 4})

Transcript_text = df[['final_corrected_version_txt', 'main_topics_coded', 'coverage_quality']].drop_duplicates().reset_index(drop = True)

In [6]:
Transcript_text.head()

Unnamed: 0,final_corrected_version_txt,main_topics_coded,coverage_quality
0,welcome to my tree house come and look. at thi...,1,medium
1,riddle i bet you don't know who i am. do you. ...,1,low
2,hey my friends how you guys do it I am. so exc...,1,medium
3,[Music]. hi[Music]. did you find an anthill Wo...,1,
4,"The ants go marching one by one. Hurrah, hurra...",1,bad


In [7]:
X_video_transc_txt = Transcript_text['final_corrected_version_txt'].values
y_video_transc_txt = Transcript_text['main_topics_coded'].values
X_sentences_video_transc_txt = df['final_corrected_version_sentences_txt'].values
y_sentences_video_transc_txt = df['main_topics_coded'].values

In [8]:
X_test_dtm_txt = vectorizer_train.transform(X_video_transc_txt)
X_test_dtm_txt

<80x2757 sparse matrix of type '<class 'numpy.int64'>'
	with 14871 stored elements in Compressed Sparse Row format>

In [9]:
X_test_dtm_sentence = vectorizer_train.transform(X_sentences_video_transc_txt)
X_test_dtm_sentence

<37675x2757 sparse matrix of type '<class 'numpy.int64'>'
	with 177907 stored elements in Compressed Sparse Row format>

In [10]:
classfier_mdl.predict(X_test_dtm_txt)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 2, 2,
       2, 2, 1, 2, 1, 1, 0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 2, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 2, 3, 4, 1, 1, 3, 4])

In [11]:
y_pred_txt = classfier_mdl.predict(X_test_dtm_txt)
y_pred_proba_txt = classfier_mdl.predict_proba(X_test_dtm_txt)

y_pred_sentence =  classfier_mdl.predict(X_test_dtm_sentence)
y_pred_proba_sentence =  classfier_mdl.predict_proba(X_test_dtm_sentence)

In [12]:
class_proba_df_txt = pd.DataFrame(y_pred_proba_txt, columns = ['txt_0_proba','txt_1_proba','txt_2_proba','txt_3_proba','txt_4_proba'])
Transcript_text = pd.concat([Transcript_text, class_proba_df_txt], axis = 1)
Transcript_text['classified_topic_coded_txt'] = y_pred_txt
Transcript_text.head(10)

Unnamed: 0,final_corrected_version_txt,main_topics_coded,coverage_quality,txt_0_proba,txt_1_proba,txt_2_proba,txt_3_proba,txt_4_proba,classified_topic_coded_txt
0,welcome to my tree house come and look. at thi...,1,medium,0.0,1.0,0.0,0.0,0.0,1
1,riddle i bet you don't know who i am. do you. ...,1,low,0.0,1.0,0.0,0.0,0.0,1
2,hey my friends how you guys do it I am. so exc...,1,medium,0.0,1.0,0.0,0.0,0.0,1
3,[Music]. hi[Music]. did you find an anthill Wo...,1,,0.0,1.0,0.0,0.0,0.0,1
4,"The ants go marching one by one. Hurrah, hurra...",1,bad,4.75616e-291,1.0,1.782647e-216,5.613159000000001e-165,0.0,1
5,NARRATOR: AND NOW IT'S\n TIME FOR SOME MORE......,1,medium,0.0,1.0,0.0,0.0,0.0,1
6,[Music]. imagine a day. full of negative thoug...,1,low,1.141386e-310,1.0,0.0,0.0,0.0,1
7,the life cycle of an ant. ants are social inse...,1,low,1.28236e-297,1.0,0.0,0.0,0.0,1
8,hey little ant written by Phillip and. Hannah ...,1,low,7.346046e-308,1.0,2.703875e-264,0.0,0.0,1
9,MALE NARRATOR:\nHave you ever wondered. how ph...,1,medium,6.779747e-222,1.0,1.4294499999999999e-235,2.5883219999999997e-278,0.0,1


In [13]:
class_proba_df_sentence = pd.DataFrame(y_pred_proba_sentence, columns = ['sentence_0_proba','sentence_1_proba','sentence_2_proba','sentence_3_proba','sentence_4_proba'])
df = pd.concat([df, class_proba_df_sentence], axis = 1)
df['classified_topic_coded_sentences'] = y_pred_sentence
df.head(5)

Unnamed: 0,Title,Video_ID,Category,Age_Restricted,final_corrected_version_sentences_txt,final_corrected_version_txt,sentence_level_timstamp_min_sec,sentence_level_timstamp_max_sec,sentence_level_timstamp_min_minute,sentence_level_timstamp_max_minute,...,reference_text,reference_text_cleaned,coverage_quality,main_topics_coded,sentence_0_proba,sentence_1_proba,sentence_2_proba,sentence_3_proba,sentence_4_proba,classified_topic_coded_sentences
0,Children Learn About The Ant,cXUCUvcscXs,Ant,False,welcome to my tree house come and look,welcome to my tree house come and look. at thi...,0.56,4.08,0.009333,0.068,...,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium,1,5.326544e-07,8.696503e-09,7.97637e-13,0.9999995,4.8279169999999994e-34,3
1,Children Learn About The Ant,cXUCUvcscXs,Ant,False,at this,welcome to my tree house come and look. at thi...,2.56,6.08,0.042667,0.101333,...,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium,1,0.1740426,0.4629116,0.1370454,0.01256022,0.2134402,1
2,Children Learn About The Ant,cXUCUvcscXs,Ant,False,this is my ant farm,welcome to my tree house come and look. at thi...,4.08,8.639,0.068,0.143983,...,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium,1,1.641765e-11,0.9999988,1.096464e-11,1.23837e-06,3.798267e-14,1
3,Children Learn About The Ant,cXUCUvcscXs,Ant,False,ants are amazing creatures lots of,welcome to my tree house come and look. at thi...,6.08,10.559,0.101333,0.175983,...,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium,1,4.227957e-08,0.3066765,1.181662e-12,0.6933235,1.806161e-16,3
4,Children Learn About The Ant,cXUCUvcscXs,Ant,False,insects live by themselves and have to,welcome to my tree house come and look. at thi...,8.639,12.88,0.143983,0.214667,...,['Ants are one of the most common insects that...,Ants are one of the most common insects that l...,medium,1,9.797236e-07,0.9619711,0.03802791,3.715481e-10,8.027377e-10,1


In [14]:
print(classification_report(y_video_transc_txt, y_pred_txt))

              precision    recall  f1-score   support

           0       0.62      0.36      0.45        14
           1       0.30      1.00      0.47        17
           2       0.90      0.50      0.64        18
           3       1.00      0.11      0.20        18
           4       1.00      0.31      0.47        13

    accuracy                           0.46        80
   macro avg       0.77      0.46      0.45        80
weighted avg       0.76      0.46      0.44        80



In [15]:
print(classification_report(y_pred_sentence, y_sentences_video_transc_txt))

              precision    recall  f1-score   support

           0       0.20      0.11      0.14      5661
           1       0.66      0.47      0.55     20343
           2       0.36      0.52      0.42      7442
           3       0.11      0.53      0.19      1488
           4       0.16      0.14      0.15      2741

    accuracy                           0.40     37675
   macro avg       0.30      0.35      0.29     37675
weighted avg       0.48      0.40      0.42     37675



In [16]:
# merge the txt level nd sentences level

df_rank = pd.merge(df, Transcript_text, 'left', 'final_corrected_version_txt')
df_rank.head()

Unnamed: 0,Title,Video_ID,Category,Age_Restricted,final_corrected_version_sentences_txt,final_corrected_version_txt,sentence_level_timstamp_min_sec,sentence_level_timstamp_max_sec,sentence_level_timstamp_min_minute,sentence_level_timstamp_max_minute,...,sentence_4_proba,classified_topic_coded_sentences,main_topics_coded_y,coverage_quality_y,txt_0_proba,txt_1_proba,txt_2_proba,txt_3_proba,txt_4_proba,classified_topic_coded_txt
0,Children Learn About The Ant,cXUCUvcscXs,Ant,False,welcome to my tree house come and look,welcome to my tree house come and look. at thi...,0.56,4.08,0.009333,0.068,...,4.8279169999999994e-34,3,1,medium,0.0,1.0,0.0,0.0,0.0,1
1,Children Learn About The Ant,cXUCUvcscXs,Ant,False,at this,welcome to my tree house come and look. at thi...,2.56,6.08,0.042667,0.101333,...,0.2134402,1,1,medium,0.0,1.0,0.0,0.0,0.0,1
2,Children Learn About The Ant,cXUCUvcscXs,Ant,False,this is my ant farm,welcome to my tree house come and look. at thi...,4.08,8.639,0.068,0.143983,...,3.798267e-14,1,1,medium,0.0,1.0,0.0,0.0,0.0,1
3,Children Learn About The Ant,cXUCUvcscXs,Ant,False,ants are amazing creatures lots of,welcome to my tree house come and look. at thi...,6.08,10.559,0.101333,0.175983,...,1.806161e-16,3,1,medium,0.0,1.0,0.0,0.0,0.0,1
4,Children Learn About The Ant,cXUCUvcscXs,Ant,False,insects live by themselves and have to,welcome to my tree house come and look. at thi...,8.639,12.88,0.143983,0.214667,...,8.027377e-10,1,1,medium,0.0,1.0,0.0,0.0,0.0,1


In [17]:
df_rank.columns

Index(['Title', 'Video_ID', 'Category', 'Age_Restricted',
       'final_corrected_version_sentences_txt', 'final_corrected_version_txt',
       'sentence_level_timstamp_min_sec', 'sentence_level_timstamp_max_sec',
       'sentence_level_timstamp_min_minute',
       'sentence_level_timstamp_max_minute', 'duration', 'Length_(min)',
       'Views_(thous)', 'main_topics', 'reference_text',
       'reference_text_cleaned', 'coverage_quality_x', 'main_topics_coded_x',
       'sentence_0_proba', 'sentence_1_proba', 'sentence_2_proba',
       'sentence_3_proba', 'sentence_4_proba',
       'classified_topic_coded_sentences', 'main_topics_coded_y',
       'coverage_quality_y', 'txt_0_proba', 'txt_1_proba', 'txt_2_proba',
       'txt_3_proba', 'txt_4_proba', 'classified_topic_coded_txt'],
      dtype='object')

### Implement the extraction of the Insights from the classifier model 

- first pick out the correctly classified rows and label it with a new columns "Correctly classified" (done)
- correctly classified text/sentences will be helped ranked later from the topic extraction algorothm (HOPFULLY) (pending ......)
- compare the highsest probability of the classified class vs the actual class probaility 
  * the difference between them, the samller the better
  * the smaller or appraoch to 0 ==> better text quality under a specific class
  * the bigger the probability difference ==> the worse on transcript quality under a specific class



#### Lesson: what is important to have a better trained classifed model from the refernece text

- do we use perfect text to train the classiifer model? or we can add more variance of text to make the model more generlelziable? 

- the purposes of the classifier model is to give an estimate on if the transcript text belongs to a class?

- assumption: the classifier label from the transcript are assumed in the first place, our purpose is to figure out a way to help us measure the legitimacy of the main class from each video transcript 


In [18]:
df_rank['if_classified_correctly_txt'] = df_rank['classified_topic_coded_txt'] == df_rank['main_topics_coded_x']
df_rank['if_classified_correctly_sentences'] = df_rank['classified_topic_coded_sentences'] == df_rank['main_topics_coded_x']

df_rank['max_proba_sentence'] = df_rank[['sentence_0_proba', 'sentence_1_proba', 'sentence_2_proba','sentence_3_proba', 'sentence_4_proba']].max(axis=1)

df_rank['max_proba_txt'] = df_rank[['txt_0_proba', 'txt_1_proba', 'txt_2_proba', 'txt_3_proba','txt_4_proba']].max(axis=1)


In [19]:
gaps_sentences = []
sentence_actual_main_topic_classified_proba = []
proba_col_lst = ['sentence_0_proba', 'sentence_1_proba', 'sentence_2_proba','sentence_3_proba', 'sentence_4_proba']

for index, row in df_rank.iterrows():
    
    if row['if_classified_correctly_sentences'] == False:
        actual_lbl = str(row['main_topics_coded_x'])
        actual_target_col = [i for i in proba_col_lst if actual_lbl in i][0]
        
        gaps_sentences.append(np.round(1 - row[actual_target_col] , 6))
        sentence_actual_main_topic_classified_proba.append(row[actual_target_col])

    else:
        gaps_sentences.append(np.round(1 - row['max_proba_sentence'], 6)) 
        sentence_actual_main_topic_classified_proba.append(row['max_proba_sentence'])
        
gaps_txt = []
txt_actual_main_topic_classified_proba = []
proba_col_lst = ['txt_0_proba', 'txt_1_proba', 'txt_2_proba', 'txt_3_proba','txt_4_proba']

for index, row in df_rank.iterrows():
    
    if row['if_classified_correctly_txt'] == False:
        actual_lbl = str(row['main_topics_coded_x'])
        actual_target_col = [i for i in proba_col_lst if actual_lbl in i][0]
        
#         gaps_txt.append(np.round(np.abs(row['max_proba_txt'] - row[actual_target_col] ), 6))
        gaps_txt.append(np.round(np.abs(1 - row[actual_target_col] ), 6))
        txt_actual_main_topic_classified_proba.append(row[actual_target_col])

    else:
        gaps_txt.append(np.round(1 - row['max_proba_txt'], 6)) 
        txt_actual_main_topic_classified_proba.append(row['max_proba_txt'])
        
df_rank['sentence_classified_vs_actual_gap'] = gaps_sentences
df_rank['txt_classified_vs_actual_gap'] = gaps_txt

df_rank['sentence_main_topic_classified_proba'] = sentence_actual_main_topic_classified_proba
df_rank['txt_main_topic_classified_proba'] = txt_actual_main_topic_classified_proba
        

### Implement the extraction of the Insights from the classifier model 

- first pick out the correctly classified rows and label it with a new columns "Correctly classified" (done)
- correctly classified text/sentences will be helped ranked later from the topic extraction algorothm (HOPFULLY) (pending ......)
- compare the highsest probability of the classified class vs the actual class probaility 
  * the difference between them, the samller the better
  * the smaller or appraoch to 0 ==> better text quality under a specific class
  * the bigger the probability difference ==> the worse on transcript quality under a specific class



#### Lesson: what is important to have a better trained classiifed model from the refernece text

- do we use perfect text to train the classiifer model? or we can add more variance of text to make the model more generlelziable? 

- the purposes of the classifier model is to give an estimate on if the transcript text belongs to a class?

- assumption: the classifier label from the transcript are assumed in the first place, our purpose is to figure out a way to help us measure the legitimacy of the main class from each video transcript 


In [20]:
df_rank['if_classified_correctly_txt'] = df_rank['classified_topic_coded_txt'] == df_rank['main_topics_coded_x']
df_rank['if_classified_correctly_sentences'] = df_rank['classified_topic_coded_sentences'] == df_rank['main_topics_coded_x']

df_rank['max_proba_sentence'] = df_rank[['sentence_0_proba', 'sentence_1_proba', 'sentence_2_proba','sentence_3_proba', 'sentence_4_proba']].max(axis=1)

df_rank['max_proba_txt'] = df_rank[['txt_0_proba', 'txt_1_proba', 'txt_2_proba', 'txt_3_proba','txt_4_proba']].max(axis=1)



In [21]:
gaps_sentences = []
sentence_actual_main_topic_classified_proba = []
proba_col_lst = ['sentence_0_proba', 'sentence_1_proba', 'sentence_2_proba','sentence_3_proba', 'sentence_4_proba']

for index, row in df_rank.iterrows():
    
    if row['if_classified_correctly_sentences'] == False:
        actual_lbl = str(row['main_topics_coded_x'])
        actual_target_col = [i for i in proba_col_lst if actual_lbl in i][0]
        
        gaps_sentences.append(np.round(1 - row[actual_target_col] , 6))
        sentence_actual_main_topic_classified_proba.append(row[actual_target_col])

    else:
        gaps_sentences.append(np.round(1 - row['max_proba_sentence'], 6)) 
        sentence_actual_main_topic_classified_proba.append(row['max_proba_sentence'])
        
gaps_txt = []
txt_actual_main_topic_classified_proba = []
proba_col_lst = ['txt_0_proba', 'txt_1_proba', 'txt_2_proba', 'txt_3_proba','txt_4_proba']

for index, row in df_rank.iterrows():
    
    if row['if_classified_correctly_txt'] == False:
        actual_lbl = str(row['main_topics_coded_x'])
        actual_target_col = [i for i in proba_col_lst if actual_lbl in i][0]
        
#         gaps_txt.append(np.round(np.abs(row['max_proba_txt'] - row[actual_target_col] ), 6))
        gaps_txt.append(np.round(np.abs(1 - row[actual_target_col] ), 6))
        txt_actual_main_topic_classified_proba.append(row[actual_target_col])

    else:
        gaps_txt.append(np.round(1 - row['max_proba_txt'], 6)) 
        txt_actual_main_topic_classified_proba.append(row['max_proba_txt'])
        
df_rank['sentence_classified_vs_actual_gap'] = gaps_sentences
df_rank['txt_classified_vs_actual_gap'] = gaps_txt

df_rank['sentence_main_topic_classified_proba'] = sentence_actual_main_topic_classified_proba
df_rank['txt_main_topic_classified_proba'] = txt_actual_main_topic_classified_proba
        

In [22]:
df_rank.head()

Unnamed: 0,Title,Video_ID,Category,Age_Restricted,final_corrected_version_sentences_txt,final_corrected_version_txt,sentence_level_timstamp_min_sec,sentence_level_timstamp_max_sec,sentence_level_timstamp_min_minute,sentence_level_timstamp_max_minute,...,txt_4_proba,classified_topic_coded_txt,if_classified_correctly_txt,if_classified_correctly_sentences,max_proba_sentence,max_proba_txt,sentence_classified_vs_actual_gap,txt_classified_vs_actual_gap,sentence_main_topic_classified_proba,txt_main_topic_classified_proba
0,Children Learn About The Ant,cXUCUvcscXs,Ant,False,welcome to my tree house come and look,welcome to my tree house come and look. at thi...,0.56,4.08,0.009333,0.068,...,0.0,1,True,False,0.999999,1.0,1.0,0.0,8.696503e-09,1.0
1,Children Learn About The Ant,cXUCUvcscXs,Ant,False,at this,welcome to my tree house come and look. at thi...,2.56,6.08,0.042667,0.101333,...,0.0,1,True,True,0.462912,1.0,0.537088,0.0,0.4629116,1.0
2,Children Learn About The Ant,cXUCUvcscXs,Ant,False,this is my ant farm,welcome to my tree house come and look. at thi...,4.08,8.639,0.068,0.143983,...,0.0,1,True,True,0.999999,1.0,1e-06,0.0,0.9999988,1.0
3,Children Learn About The Ant,cXUCUvcscXs,Ant,False,ants are amazing creatures lots of,welcome to my tree house come and look. at thi...,6.08,10.559,0.101333,0.175983,...,0.0,1,True,False,0.693323,1.0,0.693324,0.0,0.3066765,1.0
4,Children Learn About The Ant,cXUCUvcscXs,Ant,False,insects live by themselves and have to,welcome to my tree house come and look. at thi...,8.639,12.88,0.143983,0.214667,...,0.0,1,True,True,0.961971,1.0,0.038029,0.0,0.9619711,1.0


In [23]:
print(df_rank.drop_duplicates().shape)
print(df_rank.shape)

(11206, 40)
(37871, 40)


In [24]:
df_rank.to_csv("model_output_dataset/video_tarnscript_main_topic_classifier_rank.csv")

## Apply Unsupervised Learning on the topics - Sub-Topics Extraction

Apply the LDA model to the new text/sentences documents from video transcript to get the topic probability distribution for a new document based on a trained model


https://radimrehurek.com/gensim/models/ldamodel.html

https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [25]:
from Python_Functions.NLP_Model_Functions import stopwords_generator, lemmatize_stemming, TF_whole_text, TF_IDF_text_doc, topic_extract, \
                          lemmatize_preprocess, text_cleaning_sub, text_cleaning, stop_words

In [26]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# np.random.seed(2018)

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/mikewu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
def lemmatize_stemming(text):
    return SnowballStemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
#     print(gensim.utils.simple_preprocess(text))
    for token in gensim.utils.simple_preprocess(text):
#         print(token)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

# print out the new extacted sub-topics from tne new doc based on the trained model
def pred_new_doc_extracted_topics(lda_mdl, bow_vector, wrd_n):
#     res_new_doc = lda_mdl[bow_vector]
#     for index, score in sorted(lda_mdl[bow_vector], key=lambda tup: -1*tup[1]):
#         print("Score: {}\t Topic: {}".format(score, lda_mdl.print_topic(index, wrd_n)))
    return lda_mdl[bow_vector]

In [28]:
refer_lda_models = pickle.load(open('models/trained_models/LDA_Subtopic_Models_ReferenceText.pckl',"rb"))

In [29]:
# count the number of sub-topics_traineds
main_topics_lst = []
cnt_extracted_subtopics_train = []
for lda_dict in refer_lda_models:
    key_val = list(lda_dict.keys())[0]
    
    main_topics_lst.append(key_val)
    
    cnt = 0
    for i in lda_dict[key_val][0].print_topics():
#         print(i)
        cnt += 1
    cnt_extracted_subtopics_train.append(cnt)

total_train_subtopic_df = pd.DataFrame({"main_topic" : main_topics_lst, 
              "total_train_subtopic_cnt" : cnt_extracted_subtopics_train })


In [30]:
total_train_subtopic_df

Unnamed: 0,main_topic,total_train_subtopic_cnt
0,insect_ant,20
1,insect_beetle,20
2,bird_flamingo,20
3,bird_peacock,20
4,bird_tailorbird,18


#### Subtopic/Concept/Fact/Relevance Measurement 

- count the # of subtopics lda modesl trained on reference text ==> n_trained_subtopics

- count the # of subtopics lda model applied on new text ==> n_test_subtopics_txt

- count the # of subtopics lda model applied on new sentences ==> n_test_subtopics_sentence

- overall subtopic coverage probability: 

- the probability of a subtopic on a new text ==> P_txt_subtopic_i

- the probability of a subtopic on a new sentences ==> P_sentences_subtopic_i



In [31]:
main_topics_lst = list(total_train_subtopic_df.main_topic)
# ['bird_tailorbird', 'insect_ant', 'bird_peacock', 'insect_beetle','bird_flamingo']

txt_subtopics_extracted_dict = {}

sentences_subtopics_extracted_dict = {}

train_lda_model_topics_dict = {}

for j in range(len(main_topics_lst)):
    
    print("#################################")
    print("main topics: {}".format(main_topics_lst[j]))
    print("------------------------------------")
    
    # read the pre-train lda model from a specific main topic
    lda_mdl = refer_lda_models[j][main_topics_lst[j]][0]
    
    # extract the trained model topics details and topics indexs
    lda_mdl_detail = lda_mdl.print_topics()
    
    # extract index and topics formula into lists
    trained_extra_subtopics_idx_lst = [i[0] for i in lda_mdl_detail]
    
    trained_extra_subtopics_lst = [i[1] for i in lda_mdl_detail]
    
    train_lda_model_topics_dict[main_topics_lst[j]] = []
    
    train_lda_model_topics_dict[main_topics_lst[j]].append(trained_extra_subtopics_idx_lst)
    
    train_lda_model_topics_dict[main_topics_lst[j]].append(trained_extra_subtopics_lst)
    
    # read the pre-train dictionary from a refernece text under specific main topic
    train_txt_dict_vec = refer_lda_models[j][main_topics_lst[j]][1]
    
    file_lst = list(df_rank[df_rank['main_topics'] == main_topics_lst[j]]['Title'].unique())

    for i in range(len(file_lst)):

        # read the whole txt transcript based on specific main topics
        new_transripts = list(df_rank[(df_rank['Title'] == file_lst[i])]['final_corrected_version_txt'].drop_duplicates())

        print(file_lst[i])

        # collect transcript 
        for new_transript in new_transripts:
            # apply the lda model on the new transcript
            bow_vector_txt = train_txt_dict_vec.doc2bow(preprocess(new_transript))

            pred_res_txt = pred_new_doc_extracted_topics(lda_mdl = lda_mdl, bow_vector = bow_vector_txt, wrd_n = 10)

            txt_subtopics_extracted_dict[file_lst[i]]= []

            tpc_idxs = []
            tpc_probas = []
            main_topics = []
            if pred_res_txt != []:
                for res in pred_res_txt:
                    tpc_idxs.append(res[0])
                    tpc_probas.append(res[1])
                    main_topics.append(main_topics_lst[j])

            txt_subtopics_extracted_dict[file_lst[i]].append(tpc_idxs)
            txt_subtopics_extracted_dict[file_lst[i]].append(tpc_probas)
            txt_subtopics_extracted_dict[file_lst[i]].append(main_topics)

        # read all the sentences based on specific main topics
        new_sentences_doc = list(df_rank[(df_rank['Title'] == file_lst[i])]['final_corrected_version_sentences_txt'])
        
        sentences_subtopics_extracted_dict[file_lst[i]] = []
        # apply the lda model on the new sentences
        for new_sentence in new_sentences_doc:

            bow_vector_sent = train_txt_dict_vec.doc2bow(preprocess(new_sentence))

            pred_res_sent = pred_new_doc_extracted_topics(lda_mdl = lda_mdl, bow_vector = bow_vector_sent, wrd_n = 10)

            one_sentence_dict = {}

            one_sentence_dict[new_sentence] = []

            tpc_idxs = []
            tpc_probas = []
            main_topics = []
            if pred_res_txt != []:
                for res in pred_res_txt:
                    tpc_idxs.append(res[0])
                    tpc_probas.append(res[1])
                    main_topics.append(main_topics_lst[j])
                    
            one_sentence_dict[new_sentence].append(tpc_idxs)
            one_sentence_dict[new_sentence].append(tpc_probas)
            one_sentence_dict[new_sentence].append(main_topics)
           
            sentences_subtopics_extracted_dict[file_lst[i]].append(one_sentence_dict)
#             print(sentences_subtopics_extracted_dict)




#################################
main topics: insect_ant
------------------------------------
Children Learn About The Ant
My Animal Friends -The Different Types of Ants | Bugs for Kids | Wizz | TV Shows for Kids
🐜 ALL ABOUT ANTS 🐜 | 5 AMAZING FACTS ABOUT ANTS 🤯 | EXPLORER MAX
Ants | Science for Kids
The Ants Go Marching | Kids Songs | Super Simple Songs
Army Ant 🐜 | Amazing Animals
Automatic Negative Thoughts - Meet the ANT Buddies!
The Life Cycle Of An Ant - Ant Life Cycle Lesson For Kids
Hey, Little Ant
Teeny Tiny Ants | Ready, Set, Zoom!
Antiks - Ants at work - Animated comedy for kids
Diana and Roma learn the alphabet and how to count
The Story of the Ant Hill - Values for Kids - Learning insects for kids
Animanimals: Ant
7 FUN FACTS ABOUT ANTS! ANT! FACTS FOR KIDS! Learning Colors! Funny! Sock Puppet!
insects_intro_ant_v
#################################
main topics: insect_beetle
------------------------------------
Insects for Kids | Have fun learning all about different kinds

In [32]:
# based on the extracted subtopics and its stats from the LDA model on the Transcript text, formluate a dataframe

txt_subtopics_extracted_df = pd.DataFrame()

for key, ele in txt_subtopics_extracted_dict.items():
    
    txt_subtopics_extracted_df_temp = pd.DataFrame()
    
    txt_subtopics_extracted_df_temp["txt_subtopics_indx"] = ele[0]
    
    txt_subtopics_extracted_df_temp["txt_subtopics_proba"] = ele[1]
    
    txt_subtopics_extracted_df_temp["main_topic"] = ele[2]
    
    txt_subtopics_extracted_df_temp['Title'] = key
    
    txt_subtopics_extracted_df = pd.concat([txt_subtopics_extracted_df, txt_subtopics_extracted_df_temp], axis = 0)
    
txt_subtopics_extracted_df = txt_subtopics_extracted_df.reset_index(drop = True)

txt_groupby_df = txt_subtopics_extracted_df.groupby(["main_topic", 'Title'\
                                                          ]).agg({'txt_subtopics_indx': "count"\
                                                                 }).reset_index().rename(columns={'txt_subtopics_indx': 'txt_subtopics_count'})

# print(sent_groupby_df.columns)

txt_subtopics_extracted_df= pd.merge(txt_subtopics_extracted_df, txt_groupby_df, how = 'left', on = ['main_topic', 'Title'])

txt_subtopics_extracted_df= pd.merge(txt_subtopics_extracted_df, total_train_subtopic_df, how = 'left', on = ['main_topic'])

txt_subtopics_extracted_df['subtopic_cv_ratio'] = txt_subtopics_extracted_df['txt_subtopics_count']/ txt_subtopics_extracted_df['total_train_subtopic_cnt']

txt_subtopics_extracted_df.head()

Unnamed: 0,txt_subtopics_indx,txt_subtopics_proba,main_topic,Title,txt_subtopics_count,total_train_subtopic_cnt,subtopic_cv_ratio
0,1,0.079715,insect_ant,Children Learn About The Ant,12,20,0.6
1,4,0.070893,insect_ant,Children Learn About The Ant,12,20,0.6
2,12,0.082934,insect_ant,Children Learn About The Ant,12,20,0.6
3,16,0.012956,insect_ant,Children Learn About The Ant,12,20,0.6
4,18,0.027877,insect_ant,Children Learn About The Ant,12,20,0.6


In [33]:
txt_subtopics_extracted_df.to_csv("model_output_dataset/video_tarnscript_txt_subtopic_extraction.csv")

In [34]:
# based on the extracted subtopics and its stats from the LDA model on the Transcript Sentences, formluate a dataframe

# based on the extracted subtopics and its stats from the LDA model on the Transcript text, formluate a dataframe

sentence_subtopics_extracted_df = pd.DataFrame()

for key, eles in sentences_subtopics_extracted_dict.items():
    
    sentence_subtopics_extracted_df_temp = pd.DataFrame()
    
    for ele in eles:
    
        sentc = list(ele.keys())[0]

        sentence_subtopics_extracted_df_temp["sentences_subtopics_indx"] = ele[sentc][0]

        sentence_subtopics_extracted_df_temp["sentences_subtopics_proba"] = ele[sentc][1]

        sentence_subtopics_extracted_df_temp["main_topic"] = ele[sentc][2]

        sentence_subtopics_extracted_df_temp["sentences"] = sentc

        sentence_subtopics_extracted_df_temp['Title'] = key

        sentence_subtopics_extracted_df = pd.concat([sentence_subtopics_extracted_df, sentence_subtopics_extracted_df_temp], axis = 0)

sentence_subtopics_extracted_df= sentence_subtopics_extracted_df.reset_index(drop = True)

# print(sentence_subtopics_extracted_df.columns)

sent_groupby_df = sentence_subtopics_extracted_df.groupby(["main_topic", 'Title', 'sentences'\
                                                          ]).agg({"sentences_subtopics_indx": "count"\
                                                                 }).reset_index().rename(columns={'sentences_subtopics_indx': 'sentences_subtopics_count'})

# print(sent_groupby_df.columns)

sentence_subtopics_extracted_df= pd.merge(sentence_subtopics_extracted_df, sent_groupby_df, how = 'left', on = ['main_topic', 'Title', 'sentences'])

sentence_subtopics_extracted_df.head()


Unnamed: 0,sentences_subtopics_indx,sentences_subtopics_proba,main_topic,sentences,Title,sentences_subtopics_count
0,1,0.079715,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12
1,4,0.070893,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12
2,12,0.082934,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12
3,16,0.012956,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12
4,18,0.027877,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12


In [35]:
sentence_subtopics_extracted_df.to_csv("model_output_dataset/video_tarnscript_sentences_subtopic_extraction.csv")

In [36]:
trained_lda_subtopics_extracted_df = pd.DataFrame()

for key, ele in train_lda_model_topics_dict.items():
    
    trained_lda_subtopics_extracted_df_temp = pd.DataFrame()
    
    trained_lda_subtopics_extracted_df_temp["trained_subtopics_indx"] = ele[0]
    
    trained_lda_subtopics_extracted_df_temp["trained_subtopics_formula"] = ele[1]
    
    trained_lda_subtopics_extracted_df_temp["main_topic"] = key
    
    trained_lda_subtopics_extracted_df = pd.concat([trained_lda_subtopics_extracted_df, trained_lda_subtopics_extracted_df_temp], axis = 0)
    
trained_lda_subtopics_extracted_df= trained_lda_subtopics_extracted_df.reset_index(drop = True)

groupby_df = trained_lda_subtopics_extracted_df.groupby("main_topic").agg({"trained_subtopics_indx": "count"\
                                                                    }).reset_index().rename(columns={"trained_subtopics_indx": 'trained_subtopics_count'})

trained_lda_subtopics_extracted_df= pd.merge(trained_lda_subtopics_extracted_df, groupby_df, how = 'left', on = 'main_topic')

trained_lda_subtopics_extracted_df.head()
    

Unnamed: 0,trained_subtopics_indx,trained_subtopics_formula,main_topic,trained_subtopics_count
0,29,"0.069*""one"" + 0.068*""queen"" + 0.037*""harvester...",insect_ant,20
1,14,"0.123*""colony"" + 0.084*""like"" + 0.054*""say"" + ...",insect_ant,20
2,28,"0.053*""pheromone"" + 0.048*""u"" + 0.038*""coopera...",insect_ant,20
3,1,"0.130*""ant"" + 0.029*""also"" + 0.024*""house"" + 0...",insect_ant,20
4,26,"0.046*""society"" + 0.043*""get"" + 0.041*""differe...",insect_ant,20


In [37]:
## join the sentences level new doc extrated probaility with the trained subtopics 

sentence_subtopics_extracted_eval_df = pd.merge(sentence_subtopics_extracted_df, trained_lda_subtopics_extracted_df,\
         left_on=['sentences_subtopics_indx','main_topic'], right_on=['trained_subtopics_indx','main_topic'], how='left')

sentence_subtopics_extracted_eval_df.head()


Unnamed: 0,sentences_subtopics_indx,sentences_subtopics_proba,main_topic,sentences,Title,sentences_subtopics_count,trained_subtopics_indx,trained_subtopics_formula,trained_subtopics_count
0,1,0.079715,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12,1.0,"0.130*""ant"" + 0.029*""also"" + 0.024*""house"" + 0...",20.0
1,4,0.070893,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12,4.0,"0.050*""like"" + 0.043*""system"" + 0.043*""make"" +...",20.0
2,12,0.082934,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12,12.0,"0.155*""live"" + 0.141*""ant"" + 0.070*""worker"" + ...",20.0
3,16,0.012956,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12,,,
4,18,0.027877,insect_ant,welcome to my tree house come and look,Children Learn About The Ant,12,18.0,"0.094*""know"" + 0.046*""hey"" + 0.039*""time"" + 0....",20.0


In [38]:
sentence_subtopics_extracted_eval_df.to_csv("model_output_dataset/sentence_subtopics_extracted_eval_df.csv")

In [39]:
## rank the txt subtopic extraction rate: new-doc-subtopic_n / main_topic_subtopic_n
sentence_subtopics_extracted_eval_df["new_sent_concept_coverage_pct"] = \
     np.round(sentence_subtopics_extracted_eval_df["sentences_subtopics_count"].astype(float) / sentence_subtopics_extracted_eval_df["trained_subtopics_count"], 4)


sentence_subtopics_extracted_eval_rank_df = sentence_subtopics_extracted_eval_df.sort_values(by = "new_sent_concept_coverage_pct")[['main_topic', 'sentences', 'Title', "new_sent_concept_coverage_pct"]].fillna(0)

## join the txt level new doc extrated probaility with the trained subtopics 

txt_subtopics_extracted_eval_df = pd.merge(txt_subtopics_extracted_df, trained_lda_subtopics_extracted_df,\
         left_on=['txt_subtopics_indx','main_topic'], right_on=['trained_subtopics_indx','main_topic'], how='left').fillna(0)

txt_subtopics_extracted_eval_df.head()


Unnamed: 0,txt_subtopics_indx,txt_subtopics_proba,main_topic,Title,txt_subtopics_count,total_train_subtopic_cnt,subtopic_cv_ratio,trained_subtopics_indx,trained_subtopics_formula,trained_subtopics_count
0,1,0.079715,insect_ant,Children Learn About The Ant,12,20,0.6,1.0,"0.130*""ant"" + 0.029*""also"" + 0.024*""house"" + 0...",20.0
1,4,0.070893,insect_ant,Children Learn About The Ant,12,20,0.6,4.0,"0.050*""like"" + 0.043*""system"" + 0.043*""make"" +...",20.0
2,12,0.082934,insect_ant,Children Learn About The Ant,12,20,0.6,12.0,"0.155*""live"" + 0.141*""ant"" + 0.070*""worker"" + ...",20.0
3,16,0.012956,insect_ant,Children Learn About The Ant,12,20,0.6,0.0,0,0.0
4,18,0.027877,insect_ant,Children Learn About The Ant,12,20,0.6,18.0,"0.094*""know"" + 0.046*""hey"" + 0.039*""time"" + 0....",20.0


In [40]:
### Apply Given Main topics to select the top video

txt_subtopics_extracted_eval_df["new_txt_concept_coverage_pct"] = \
     np.round(txt_subtopics_extracted_eval_df["txt_subtopics_count"].astype(float) / txt_subtopics_extracted_eval_df["trained_subtopics_count"], 4)


# txt_subtopics_extracted_eval_rank = txt_subtopics_extracted_eval_df[["main_topic", "Title",  "subtopic_cv_ratio"]].drop_duplicates()

# txt_subtopics_extracted_eval_rank

txt_subtopics_extracted_eval_df


Unnamed: 0,txt_subtopics_indx,txt_subtopics_proba,main_topic,Title,txt_subtopics_count,total_train_subtopic_cnt,subtopic_cv_ratio,trained_subtopics_indx,trained_subtopics_formula,trained_subtopics_count,new_txt_concept_coverage_pct
0,1,0.079715,insect_ant,Children Learn About The Ant,12,20,0.600000,1.0,"0.130*""ant"" + 0.029*""also"" + 0.024*""house"" + 0...",20.0,0.6000
1,4,0.070893,insect_ant,Children Learn About The Ant,12,20,0.600000,4.0,"0.050*""like"" + 0.043*""system"" + 0.043*""make"" +...",20.0,0.6000
2,12,0.082934,insect_ant,Children Learn About The Ant,12,20,0.600000,12.0,"0.155*""live"" + 0.141*""ant"" + 0.070*""worker"" + ...",20.0,0.6000
3,16,0.012956,insect_ant,Children Learn About The Ant,12,20,0.600000,0.0,0,0.0,inf
4,18,0.027877,insect_ant,Children Learn About The Ant,12,20,0.600000,18.0,"0.094*""know"" + 0.046*""hey"" + 0.039*""time"" + 0....",20.0,0.6000
...,...,...,...,...,...,...,...,...,...,...,...
806,4,0.391782,bird_tailorbird,birds_intro_tailor_bird_v,6,18,0.333333,4.0,"0.065*""bird"" + 0.049*""like"" + 0.028*""egg"" + 0....",18.0,0.3333
807,5,0.030343,bird_tailorbird,birds_intro_tailor_bird_v,6,18,0.333333,5.0,"0.028*""garden"" + 0.028*""paper"" + 0.027*""bird"" ...",18.0,0.3333
808,11,0.042540,bird_tailorbird,birds_intro_tailor_bird_v,6,18,0.333333,11.0,"0.151*""nest"" + 0.087*""built"" + 0.073*""little"" ...",18.0,0.3333
809,14,0.086490,bird_tailorbird,birds_intro_tailor_bird_v,6,18,0.333333,14.0,"0.037*""another"" + 0.035*""mud"" + 0.034*""bird"" +...",18.0,0.3333


In [41]:
txt_subtopics_extracted_eval_df.to_csv("model_output_dataset/txt_subtopics_extracted_eval_df.csv")