# Load dependancies

In [1]:
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

# topic modeling libraries
import pyLDAvis.gensim 

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# supporting libraries
import pandas as pd
import time
import pickle
import topic_modeling_v3 as tm

  from collections import Iterable
  from collections import Mapping
  from numpy.dual import register_func
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).e

# Get FIRST level of topics (LDA)

In [2]:
# load data for LDA
df_data = pd.read_csv("./data/train_grouped.tsv", sep="\t")
    
print(df_data.shape)
df_data.columns

(33982, 16)


Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3'],
      dtype='object')

In [3]:
df_data['list_of_nouns'] = df_data['list_of_nouns'].str.lower().str[1:-1].str.split(", ")
df_data['list_of_nouns'].head()

0    [rise, economies, march, globalisation, surge,...
1    [pfizer, commitment, responsibility, drugs, ta...
2    [weeks, interest, rates, time, years, world, b...
3    [cruise, lines, wave, months, year, holidays, ...
4    [calendar, year, mood, events, consensus, resp...
Name: list_of_nouns, dtype: object

In [4]:
df_data['list_of_lemmas'] = df_data['list_of_lemmas'].str.lower().str[1:-1].str.split(", ")
df_data['list_of_lemmas'].head()

0    [rise, big, emerging, economies, steady, march...
1    [pfizer, prided, commitment, corporate, social...
2    [weeks, raised, interest, rates, time, years, ...
3    [cruise, lines, brace, wave, months, year, nea...
4    [start, calendar, year, buoyant, mood, caught,...
Name: list_of_lemmas, dtype: object

In [5]:
start_time = time.time()
df_data_1 = tm.prepare_for_modeling(data_path="", model_type="LDA",
                                               params={"TEXT_prepared_df": df_data,
                                                       "save_LDA_dictionary_path": "./output/lda/dictionary1.pickle"
                                                       },
                                        verbose=2)
end_time = time.time()
print("Processing time in minutes:", round((end_time - start_time)/60,2))

loaded data shape: (33982, 16)

Total number of unique Lemmas:  82802

Distribution of lemmas' document counts: 
     count       mean         std  min  50%  55%  65%  75%   85%   95%    97%  \
0  82802.0  26.127642  154.452831  1.0  1.0  2.0  3.0  6.0  17.0  94.0  175.0   

     99%      max  
0  512.0  11676.0  

Deleting too frequent and too rare words...
Lemma count upper bound: 512.0
Lemma count lower bound: 3

List of words for topic modeling dictionary is reduced from 82802 to 26768
LDA dictionary file is saved to: ./output/lda/dictionary1.pickle

Number of texts processed:  33980
Number of extracted lemmas:  26768

Each text is represented by list of  26768  tuples: 
		(lemma's index in bag-of-words dictionary, lemma's term frequency)
Processing time in minutes: 0.09


In [6]:
start_time = time.time()
df_first_level = tm.train_model(model_type="LDA",
                            params={"num_topics": 10,
                                    "LDA_prepared_df": df_data_1,
                                    "LDA_dictionary_path": "./output/lda/dictionary1.pickle",
                                    "save_LDA_model_path": "./output/lda/LDA_model1"
                                    },
                               verbose=2)
end_time = time.time()
print("Processing time in minutes:", round((end_time - start_time)/60,2))

Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
loaded data shape: (33980, 18)

Creating document-term matrix for LDA...

Training LDA model with  10  topics...
LDA model file is saved to: ./output/lda/LDA_model1
Top topic indexes are selected. NOTE "-1" corresponds to top topic with probability < 20%
Processing time in minutes: 1.63


In [7]:
#value count of TOP level topics
df_first_level['first_level_topic'] = df_first_level['top_topic']
df_first_level['first_level_topic_proba'] = df_first_level['top_topic_proba']
df_first_level['first_level_topic'].value_counts().sort_index()

0    1457
1    4999
2    1962
3    3133
4    2096
5    2862
6    4345
7    6634
8    3142
9    3350
Name: first_level_topic, dtype: int64

In [8]:
df_first_level = df_first_level.drop(columns=['selected_words', 'doc2bow',
       'infered_topics', 'top_topic', 'top_topic_proba'])

***
# Get SECOND level topics (LDA)

In [9]:
first_level_topics = list(set(df_first_level['first_level_topic']))
first_level_topics

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [10]:
start = time.time()
list_dfs = []
for topic in first_level_topics:
    print("\nSelected topic index:", topic)
    df_topic = df_first_level[df_first_level['first_level_topic'] == topic].copy()
    save_dict_path = "./output/lda/dictionary1_"+str(topic+1)+".pickle"
    save_LDA_model_path = "./output/lda/LDA_model1_" + str(topic + 1)
    
    df_data_tmp = tm.prepare_for_modeling(data_path="", model_type="LDA",
                                       params={"TEXT_prepared_df": df_topic,
                                               "save_LDA_dictionary_path": save_dict_path
                                               },
                                       verbose=1)

    df_2nd_tmp = tm.train_model(model_type="LDA",
                                params={"num_topics": 10,
                                        "LDA_prepared_df": df_data_tmp,
                                        "LDA_dictionary_path": save_dict_path,
                                        "save_LDA_model_path": save_LDA_model_path
                                        },
                                verbose=1)

    #value counts of SECOND level topics
    print("\nValue counts of SECOND level topics:")
    df_2nd_tmp['second_level_topic'] = df_2nd_tmp['top_topic']
    df_2nd_tmp['second_level_topic_proba'] = df_2nd_tmp['top_topic_proba']
    print(df_2nd_tmp['second_level_topic'].value_counts().sort_index())

    print("#"*50)
    df_2nd_tmp = df_2nd_tmp.drop(columns=['selected_words', 'doc2bow',
                                           'infered_topics', 'top_topic', 'top_topic_proba'])
    list_dfs.append(df_2nd_tmp)
finish = time.time()


Selected topic index: 0
loaded data shape: (1457, 18)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_1

Value counts of SECOND level topics:
0     69
1    128
2    108
3    106
4    197
5    135
6    133
7    303
8    151
9    127
Name: second_level_topic, dtype: int64
##################################################

Selected topic index: 1
loaded data shape: (4999, 18)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_2

Value counts of SECOND level topics:
0    286
1    987
2    234
3    755
4    466
5    344
6    422
7    551
8    491
9    463
Name: second_level_topic, dtype: int64
##################################################

Selected topic index: 2
loaded data shape: (1962, 18)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_3

Value counts of SECOND level topics:
0    139
1    22

In [11]:
print("Time of gettig Second level topics in minutes:", round((finish-start)/60,2))
df_second_level = pd.concat(list_dfs)
df_second_level.columns

Time of gettig Second level topics in minutes: 7.4


Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3', 'first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba'],
      dtype='object')

***
# Get THIRD level topics

In [12]:
df_second_level[['first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba']].describe()

Unnamed: 0,first_level_topic,first_level_topic_proba,second_level_topic,second_level_topic_proba
count,33979.0,33979.0,33979.0,33979.0
mean,4.967863,0.678492,4.611966,0.679404
std,2.773743,0.195395,2.782028,0.207137
min,0.0,0.213856,0.0,0.211337
25%,3.0,0.519648,2.0,0.511915
50%,6.0,0.659102,5.0,0.648166
75%,7.0,0.849835,7.0,0.879576
max,9.0,0.990098,9.0,0.990106


In [13]:
start = time.time()
list_dfs = []

for topic_1st in first_level_topics:
    print("\nSelected FIRST level topic index:",topic_1st)
    df_1st_tmp = df_second_level[df_second_level['first_level_topic'] == topic_1st].copy()
    second_level_topics = list(set(df_1st_tmp['second_level_topic']))
    print("second_level_topics", second_level_topics)
    
    for topic_2nd in second_level_topics:
        print("\nSelected topics' indexes:", (topic_1st, topic_2nd))
        
        save_dict_path = "./output/lda/dictionary1_"+str(topic_1st+1)+"_"+str(topic_2nd+1)+".pickle"
        save_LDA_model_path = "./output/lda/LDA_model1_"+str(topic_1st+1)+"_"+str(topic_2nd+1)
        
        df_2nd_tmp = df_1st_tmp[df_1st_tmp['second_level_topic'] == topic_2nd].copy()
        
        df_data_tmp = tm.prepare_for_modeling(data_path="", model_type="LDA",
                                           params={"TEXT_prepared_df": df_2nd_tmp,
                                                   "save_LDA_dictionary_path": save_dict_path
                                                   },
                                           verbose=1)

        df_3d_tmp = tm.train_model(model_type="LDA",
                                    params={"num_topics": 10,
                                            "LDA_prepared_df": df_data_tmp,
                                            "LDA_dictionary_path": save_dict_path,
                                            "save_LDA_model_path": save_LDA_model_path,
                                            },
                                    verbose=1)

        #value counts of SECOND level topics
        print("\nValue counts of SECOND level topics:")
        df_3d_tmp['third_level_topic'] = df_3d_tmp['top_topic']
        df_3d_tmp['third_level_topic_proba'] = df_3d_tmp['top_topic_proba']
        print(df_3d_tmp['second_level_topic'].value_counts().sort_index())

        print("#"*50)
        df_3d_tmp = df_3d_tmp.drop(columns=['selected_words', 'doc2bow',
                                               'infered_topics', 'top_topic', 'top_topic_proba'])
        list_dfs.append(df_3d_tmp)
finish = time.time()


Selected FIRST level topic index: 0
second_level_topics [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

Selected topics' indexes: (0, 0)
loaded data shape: (69, 20)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_1_1

Value counts of SECOND level topics:
0    69
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (0, 1)
loaded data shape: (128, 20)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_1_2

Value counts of SECOND level topics:
1    128
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (0, 2)
loaded data shape: (108, 20)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_1_3

Value counts of SECOND level topics:
2    108
Name: second_level_topic, dtype: int64
##############

In [14]:
print("Time of gettig Third level topics in minutes:", round((finish-start)/60,2))
df_third_level = pd.concat(list_dfs)
df_third_level.columns

Time of gettig Third level topics in minutes: 7.65


Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3', 'first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba', 'third_level_topic',
       'third_level_topic_proba'],
      dtype='object')

# Evaluate 

In [15]:
df_result = df_third_level.copy()
df_result[['first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba', 'third_level_topic',
       'third_level_topic_proba']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
first_level_topic,33979.0,4.967863,2.773743,0.0,3.0,6.0,7.0,9.0
first_level_topic_proba,33979.0,0.678492,0.195395,0.213856,0.519648,0.659102,0.849835,0.990098
second_level_topic,33979.0,4.611966,2.782028,0.0,2.0,5.0,7.0,9.0
second_level_topic_proba,33979.0,0.679404,0.207137,0.211337,0.511915,0.648166,0.879576,0.990106
third_level_topic,33979.0,4.636393,2.855418,0.0,2.0,5.0,7.0,9.0
third_level_topic_proba,33979.0,0.831588,0.198112,0.244726,0.651033,0.963989,0.977494,0.990321


In [16]:
df_result['second_level_topic'] = df_result['first_level_topic'].apply(str) + "." +\
                                  df_result['second_level_topic'].apply(str)
df_result['third_level_topic'] = df_result['second_level_topic'].apply(str) + "." +\
                                  df_result['third_level_topic'].apply(str)
df_result[['second_level_topic','third_level_topic']].iloc[::1000].head()

Unnamed: 0,second_level_topic,third_level_topic
289,0.0,0.0.2
16599,0.7,0.7.0
1506,1.1,1.1.7
9187,1.3,1.3.8
21896,1.4,1.4.2


In [17]:
print("Number of first level clusters per publication section:")
pd.DataFrame(df_result.groupby('section')['first_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of first level clusters per publication section:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
first_level_topic,14.0,8.357143,3.387923,1.0,2.2,10.0,10.0,10.0,10.0,10.0


In [18]:
df_result['section'].value_counts()

health                   8237
business                 6930
culture                  3246
science                  2910
tech                     2527
gear                     2108
security                 1840
transportation           1666
finance-and-economics    1648
Space                    1641
Health                   1193
movies                     31
style                       1
music                       1
Name: section, dtype: int64

In [19]:
#test single section
section = "health"
df_result[df_result['section'] == section][['first_level_topic',
                                            'second_level_topic',
                                            'third_level_topic']].describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])

Unnamed: 0,first_level_topic
count,8237.0
mean,4.748938
std,2.110383
min,0.0
10%,2.0
25%,4.0
50%,5.0
75%,6.0
90%,7.0
max,9.0


In [20]:
print("Number of first level clusters per 30% semantic similarity group:")
pd.DataFrame(df_result.groupby('group_level_1')['first_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of first level clusters per 30% semantic similarity group:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
first_level_topic,571.0,3.565674,2.878318,1.0,1.0,1.0,2.0,5.0,9.0,10.0


In [21]:
print("Number of second level clusters per 50% semantic similarity group:")
pd.DataFrame(df_result.groupby('group_level_2')['second_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of second level clusters per 50% semantic similarity group:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
second_level_topic,6767.0,2.6545,4.360259,1.0,1.0,1.0,1.0,2.0,5.0,76.0


In [22]:
print("Number of third level clusters per 70% semantic similarity group:")
pd.DataFrame(df_result.groupby('group_level_3')['third_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of third level clusters per 70% semantic similarity group:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
third_level_topic,22065.0,1.390075,1.621997,1.0,1.0,1.0,1.0,1.0,2.0,58.0


# Name Topics (as a most frequent noun in the cluster)

In [23]:
df = df_result.copy()
df['first_level_topic_name'] = tm.get_topic_names(df_result, 'first_level_topic', 'list_of_nouns')
df['second_level_topic_name'] = tm.get_topic_names(df_result, 'second_level_topic', 'list_of_nouns')
df['third_level_topic_name'] = tm.get_topic_names(df_result, 'third_level_topic', 'list_of_nouns')
df[['publication', 
    'section',
    'first_level_topic','first_level_topic_name',
    'second_level_topic','second_level_topic_name',
    'third_level_topic', 'third_level_topic_name'
   ]].iloc[::1000].head(10).T

Unnamed: 0,289,16599,1506,9187,21896,1674,227,11434,31181,15226
publication,Economist,Wired,Economist,CNN,Wired,Economist,Economist,CNN,Wired,Gizmodo
section,business,gear,finance-and-economics,tech,security,business,business,health,culture,Space
first_level_topic,0,0,1,1,1,1,1,2,2,3
first_level_topic_name,People,People,Year,Year,Year,Year,Year,Years,Years,People
second_level_topic,0.0,0.7,1.1,1.3,1.4,1.7,1.9,2.3,2.7,3.2
second_level_topic_name,Prices,Company,Economy,Hackers,Companies,Ads,Year,Years,Carbon,Stores
third_level_topic,0.0.2,0.7.0,1.1.7,1.3.8,1.4.2,1.7.2,1.9.7,2.3.9,2.7.0,3.2.9
third_level_topic_name,Products,Protein,Year,Week,Government,Ads,Company,Particles,Emissions,Year


In [24]:
df_topics = df[[
    'first_level_topic','first_level_topic_name',
    'second_level_topic','second_level_topic_name',
    'third_level_topic', 'third_level_topic_name'
   ]].copy()
df_topics = df_topics.drop_duplicates()
print(df_topics.shape)
df_topics.head().T

(1000, 6)


Unnamed: 0,289,448,1319,2344,2772
first_level_topic,0,0,0,0,0
first_level_topic_name,People,People,People,People,People
second_level_topic,0.0,0.0,0.0,0.0,0.0
second_level_topic_name,Prices,Prices,Prices,Prices,Prices
third_level_topic,0.0.2,0.0.6,0.0.4,0.0.9,0.0.1
third_level_topic_name,Products,Bill,Prices,State,Cannabis


In [25]:
with open('./output/lda/topics.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(df_topics, f, pickle.HIGHEST_PROTOCOL)

# Process unseen text

In [26]:
ind = 10

text = df['first_10_sents'].iloc[ind]
text

' CNNA new poll finds that many Americans are not optimistic about what the GOP health care bill will do to their coverage.   Although President Donald Trump and Republicans in Congress promise that their plan will cover more people and cost less, nearly half of Americans dont believe it, according to a Kaiser Family Foundation poll released Wednesday.   And while many Republicans are confident that defunding Planned Parenthood is the right move, the greater majority oppose that idea, it says. This is the latest poll on the legislation from the nonpartisan Kaiser Family Foundation.   The organization polled Americans from March 612, before the nonpartisan Congressional Budget Office released an estimate Monday predicting that  unlike Obamacare, which brought the uninsured rate to an alltime low  the GOP bill will send the countrys uninsured rate higher than before the Affordable Care Act.   That change could be immediate for 14 million Americans who could become uninsured next year, ac

In [27]:
tm.predict_topics(text,
                  params={"topics_df_path": './output/lda/topics.pickle',
                          "first_dictionary_path": "./output/lda/dictionary1.pickle" ,
                          "first_LDA_model_path": "./output/lda/LDA_model1"
                         }
              )  

{'first_level_topic': 0,
 'first_level_topic_name': 'People',
 'first_level_topic_proba': 0.65901077,
 'second_level_topic': 0,
 'second_level_topic_name': 'Prices',
 'second_level_topic_proba': 0.98392165,
 'third_level_topic': 5,
 'third_level_topic_name': 'Prices',
 'third_level_topic_proba': 0.9590873}

In [28]:
df[[
    'first_level_topic','first_level_topic_name', 'first_level_topic_proba',
    'second_level_topic','second_level_topic_name','second_level_topic_proba',
    'third_level_topic', 'third_level_topic_name', 'third_level_topic_proba',
   ]].iloc[ind]

first_level_topic                  0
first_level_topic_name        People
first_level_topic_proba     0.669685
second_level_topic               0.0
second_level_topic_name       Prices
second_level_topic_proba    0.983922
third_level_topic              0.0.5
third_level_topic_name        Prices
third_level_topic_proba     0.959087
Name: 5299, dtype: object