# Load dependancies

In [1]:
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

# topic modeling libraries
import pyLDAvis.gensim 

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# supporting libraries
import pandas as pd
import time
import topic_modeling_v1 as tm

  from collections import Mapping, defaultdict
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
  from collections import namedtuple, defaultdict, Iterable


# Get FIRST level of topics (LDA)

In [2]:
# load data for LDA
df_data = pd.read_csv("./data/train_grouped.tsv", sep="\t")
    
print(df_data.shape)
df_data.columns

(10000, 13)


Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3'],
      dtype='object')

In [3]:
df_data['list_of_nouns'] = df_data['list_of_nouns'].str.lower().str[1:-1].str.split(", ")
df_data['list_of_nouns'].head()

0    [foodies, rsums, kitchen, porter, job, dream, ...
1    [giraffe, wildfires, animal, rights, critics, ...
2    [film, shoes, tales, film, feel, movie, legacy...
3    [treat, store, stores, video, rental, chain, s...
4    [celebrity, manes, inspiration, boards, star, ...
Name: list_of_nouns, dtype: object

In [4]:
df_data['list_of_lemmas'] = df_data['list_of_lemmas'].str.lower().str[1:-1].str.split(", ")
df_data['list_of_lemmas'].head()

0    [calling, foodies, royal, rsums, ready, lookin...
1    [giraffe, seemingly, safe, wildfires, continue...
2    [signed, direct, upcoming, film, knew, big, sh...
3    [special, treat, store, stores, popular, video...
4    [mostshared, celebrity, manes, according, insp...
Name: list_of_lemmas, dtype: object

In [5]:
df_data_1 = tm.prepare_for_modeling(data_path="", model_type="LDA",
                                               params={"TEXT_prepared_df": df_data,
                                                       "save_LDA_dictionary_path": "./output/dictionary1.pickle"
                                                       },
                                        verbose=2)

loaded data shape: (10000, 13)

Total number of unique Lemmas:  40765

Distribution of lemmas' document counts: 
     count       mean        std  min  50%  55%  65%  75%   85%   95%   97%  \
0  40765.0  14.398626  72.225598  1.0  1.0  2.0  3.0  5.0  12.0  54.0  94.0   

     99%     max  
0  252.0  4383.0  

Deleting too frequent and too rare words...
Lemma count upper bound: 252.0
Lemma count lower bound: 3

List of words for topic modeling dictionary is reduced from 40765 to 12070
LDA dictionary file is saved to: ./output/dictionary1.pickle

Number of texts processed:  10000
Number of extracted lemmas:  12070

Each text is represented by list of  12070  tuples: 
		(lemma's index in bag-of-words dictionary, lemma's term frequency)


In [6]:
df_first_level = tm.train_model(model_type="LDA",
                            params={"num_topics": 10,
                                    "LDA_prepared_df": df_data_1,
                                    "LDA_dictionary_path": "./output/dictionary1.pickle",
                                    "save_LDA_model_path": "./output/LDA_model1"
                                    },
                               verbose=2)

loaded data shape: (10000, 15)

Creating document-term matrix for LDA...

Training LDA model with  10  topics...
LDA model file is saved to: ./output/LDA_model1
Top topic indexes are selected. NOTE "-1" corresponds to top topic with probability < 20%


In [7]:
#value count of TOP level topics
df_first_level['first_level_topic'] = df_first_level['top_topic']
df_first_level['first_level_topic_proba'] = df_first_level['top_topic_proba']
df_first_level['first_level_topic'].value_counts().sort_index()

0     750
1    1026
2     460
3    1572
4    1247
5    1305
6     623
7    1159
8     720
9    1138
Name: first_level_topic, dtype: int64

In [8]:
df_first_level = df_first_level.drop(columns=['selected_words', 'doc2bow',
       'infered_topics', 'top_topic', 'top_topic_proba'])

***
# Get SECOND level topics (LDA)

In [9]:
first_level_topics = list(set(df_first_level['first_level_topic']))
first_level_topics

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [10]:
start = time.time()
list_dfs = []
for topic in first_level_topics:
    print("\nSelected topic index:", topic)
    df_topic = df_first_level[df_first_level['first_level_topic'] == topic].copy()
    save_dict_path = "./output/dictionary1_"+str(topic+1)+".pickle"
    save_LDA_model_path = "./output/LDA_model1_" + str(topic + 1)
    
    df_data_tmp = tm.prepare_for_modeling(data_path="", model_type="LDA",
                                       params={"TEXT_prepared_df": df_topic,
                                               "save_LDA_dictionary_path": save_dict_path
                                               },
                                       verbose=1)

    df_2nd_tmp = tm.train_model(model_type="LDA",
                                params={"num_topics": 10,
                                        "LDA_prepared_df": df_data_tmp,
                                        "LDA_dictionary_path": save_dict_path,
                                        "save_LDA_model_path": save_LDA_model_path
                                        },
                                verbose=1)

    #value counts of SECOND level topics
    print("\nValue counts of SECOND level topics:")
    df_2nd_tmp['second_level_topic'] = df_2nd_tmp['top_topic']
    df_2nd_tmp['second_level_topic_proba'] = df_2nd_tmp['top_topic_proba']
    print(df_2nd_tmp['second_level_topic'].value_counts().sort_index())

    print("#"*50)
    df_2nd_tmp = df_2nd_tmp.drop(columns=['selected_words', 'doc2bow',
                                           'infered_topics', 'top_topic', 'top_topic_proba'])
    list_dfs.append(df_2nd_tmp)
finish = time.time()


Selected topic index: 0
loaded data shape: (750, 15)
LDA model file is saved to: ./output/LDA_model1_1

Value counts of SECOND level topics:
0    88
1    64
2    86
3    64
4    49
5    87
6    96
7    69
8    69
9    78
Name: second_level_topic, dtype: int64
##################################################

Selected topic index: 1
loaded data shape: (1026, 15)
LDA model file is saved to: ./output/LDA_model1_2

Value counts of SECOND level topics:
0     84
1    110
2    135
3     68
4    125
5    102
6    130
7     55
8    136
9     81
Name: second_level_topic, dtype: int64
##################################################

Selected topic index: 2
loaded data shape: (460, 15)
LDA model file is saved to: ./output/LDA_model1_3

Value counts of SECOND level topics:
0    46
1    47
2    52
3    49
4    52
5    44
6    54
7    39
8    50
9    27
Name: second_level_topic, dtype: int64
##################################################

Selected topic index: 3
loaded data shape: (1572, 15

In [11]:
print("Time of gettig Second level topics in minutes:", round((finish-start)/60,2))
df_second_level = pd.concat(list_dfs)
df_second_level.columns

Time of gettig Second level topics in minutes: 2.53


Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3', 'first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba'],
      dtype='object')

***
# Get THIRD level topics

In [12]:
df_second_level[['first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba']].describe()

Unnamed: 0,first_level_topic,first_level_topic_proba,second_level_topic,second_level_topic_proba
count,10000.0,10000.0,10000.0,10000.0
mean,4.6028,0.654582,4.5207,0.777007
std,2.742587,0.196565,2.826017,0.213895
min,0.0,0.217111,0.0,0.253665
25%,3.0,0.499697,2.0,0.571962
50%,4.0,0.625022,5.0,0.862729
75%,7.0,0.812681,7.0,0.975666
max,9.0,0.990516,9.0,0.988883


In [13]:
start = time.time()
list_dfs = []

for topic_1st in first_level_topics:
    print("\nSelected FIRST level topic index:",topic_1st)
    df_1st_tmp = df_second_level[df_second_level['first_level_topic'] == topic_1st].copy()
    second_level_topics = list(set(df_1st_tmp['second_level_topic']))
    print("second_level_topics", second_level_topics)
    
    for topic_2nd in second_level_topics:
        print("\nSelected topics' indexes:", (topic_1st, topic_2nd))
        
        save_dict_path = "./output/dictionary1_"+str(topic_1st+1)+"_"+str(topic_2nd+1)+".pickle"
        save_LDA_model_path = "./output/LDA_model1_"+str(topic_1st+1)+"_"+str(topic_2nd+1)
        
        df_2nd_tmp = df_1st_tmp[df_1st_tmp['second_level_topic'] == topic_2nd].copy()
        
        df_data_tmp = tm.prepare_for_modeling(data_path="", model_type="LDA",
                                           params={"TEXT_prepared_df": df_2nd_tmp,
                                                   "save_LDA_dictionary_path": save_dict_path
                                                   },
                                           verbose=1)

        df_3d_tmp = tm.train_model(model_type="LDA",
                                    params={"num_topics": 10,
                                            "LDA_prepared_df": df_data_tmp,
                                            "LDA_dictionary_path": save_dict_path,
                                            "save_LDA_model_path": save_LDA_model_path,
                                            },
                                    verbose=1)

        #value counts of SECOND level topics
        print("\nValue counts of SECOND level topics:")
        df_3d_tmp['third_level_topic'] = df_3d_tmp['top_topic']
        df_3d_tmp['third_level_topic_proba'] = df_3d_tmp['top_topic_proba']
        print(df_3d_tmp['second_level_topic'].value_counts().sort_index())

        print("#"*50)
        df_3d_tmp = df_3d_tmp.drop(columns=['selected_words', 'doc2bow',
                                               'infered_topics', 'top_topic', 'top_topic_proba'])
        list_dfs.append(df_3d_tmp)
finish = time.time()


Selected FIRST level topic index: 0
second_level_topics [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

Selected topics' indexes: (0, 0)
loaded data shape: (88, 17)
LDA model file is saved to: ./output/LDA_model1_1_1

Value counts of SECOND level topics:
0    88
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (0, 1)
loaded data shape: (64, 17)
LDA model file is saved to: ./output/LDA_model1_1_2

Value counts of SECOND level topics:
1    64
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (0, 2)
loaded data shape: (86, 17)
LDA model file is saved to: ./output/LDA_model1_1_3

Value counts of SECOND level topics:
2    86
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (0, 3)
loaded data shape: (64, 17)
LDA model file is saved to: ./output/LDA_model1_1_4

Value counts of SECOND level topics:
3    64
Na

LDA model file is saved to: ./output/LDA_model1_4_3

Value counts of SECOND level topics:
2    142
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (3, 3)
loaded data shape: (144, 17)
LDA model file is saved to: ./output/LDA_model1_4_4

Value counts of SECOND level topics:
3    144
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (3, 4)
loaded data shape: (125, 17)
LDA model file is saved to: ./output/LDA_model1_4_5

Value counts of SECOND level topics:
4    125
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (3, 5)
loaded data shape: (174, 17)
LDA model file is saved to: ./output/LDA_model1_4_6

Value counts of SECOND level topics:
5    174
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (3, 6)
loaded data shape: (1

LDA model file is saved to: ./output/LDA_model1_7_5

Value counts of SECOND level topics:
4    61
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (6, 5)
loaded data shape: (55, 17)
LDA model file is saved to: ./output/LDA_model1_7_6

Value counts of SECOND level topics:
5    55
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (6, 6)
loaded data shape: (58, 17)
LDA model file is saved to: ./output/LDA_model1_7_7

Value counts of SECOND level topics:
6    58
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (6, 7)
loaded data shape: (70, 17)
LDA model file is saved to: ./output/LDA_model1_7_8

Value counts of SECOND level topics:
7    70
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (6, 8)
loaded data shape: (78, 17)


LDA model file is saved to: ./output/LDA_model1_10_7

Value counts of SECOND level topics:
6    123
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (9, 7)
loaded data shape: (100, 17)
LDA model file is saved to: ./output/LDA_model1_10_8

Value counts of SECOND level topics:
7    100
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (9, 8)
loaded data shape: (130, 17)
LDA model file is saved to: ./output/LDA_model1_10_9

Value counts of SECOND level topics:
8    130
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (9, 9)
loaded data shape: (125, 17)
LDA model file is saved to: ./output/LDA_model1_10_10

Value counts of SECOND level topics:
9    125
Name: second_level_topic, dtype: int64
##################################################


In [14]:
print("Time of gettig Third level topics in minutes:", round((finish-start)/60,2))
df_third_level = pd.concat(list_dfs)
df_third_level.columns

Time of gettig Third level topics in minutes: 2.05


Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3', 'first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba', 'third_level_topic',
       'third_level_topic_proba'],
      dtype='object')

# Evaluate 

In [15]:
df_result = df_third_level.copy()
df_result[['first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba', 'third_level_topic',
       'third_level_topic_proba']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
first_level_topic,10000.0,4.6028,2.742587,0.0,3.0,4.0,7.0,9.0
first_level_topic_proba,10000.0,0.654582,0.196565,0.217111,0.499697,0.625022,0.812681,0.990516
second_level_topic,10000.0,4.5207,2.826017,0.0,2.0,5.0,7.0,9.0
second_level_topic_proba,10000.0,0.777007,0.213895,0.253665,0.571962,0.862729,0.975666,0.988883
third_level_topic,10000.0,4.457,2.925817,0.0,2.0,4.0,7.0,9.0
third_level_topic_proba,10000.0,0.881597,0.158032,0.255436,0.918165,0.959082,0.969994,0.986564


In [16]:
df_result['second_level_topic'] = df_result['first_level_topic'].apply(str) + "." +\
                                  df_result['second_level_topic'].apply(str)
df_result['third_level_topic'] = df_result['second_level_topic'].apply(str) + "." +\
                                  df_result['third_level_topic'].apply(str)
df_result[['second_level_topic','third_level_topic']].iloc[::1000].head()

Unnamed: 0,second_level_topic,third_level_topic
99,0.0,0.0.1
4905,1.2,1.2.8
5474,2.4,2.4.3
9361,3.4,3.4.7
7326,4.1,4.1.5


In [17]:
print("Number of first level clusters per publication section:")
pd.DataFrame(df_result.groupby('section')['first_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of first level clusters per publication section:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
first_level_topic,29.0,8.0,3.058945,1.0,2.8,7.0,10.0,10.0,10.0,10.0


In [18]:
print("Number of first level clusters per 30% semantic similarity group:")
pd.DataFrame(df_result.groupby('group_level_1')['first_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of first level clusters per 30% semantic similarity group:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
first_level_topic,172.0,3.796512,3.141756,1.0,1.0,1.0,2.0,6.0,9.9,10.0


In [19]:
print("Number of second level clusters per 50% semantic similarity group:")
pd.DataFrame(df_result.groupby('group_level_2')['second_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of second level clusters per 50% semantic similarity group:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
second_level_topic,2540.0,2.862205,5.07136,1.0,1.0,1.0,1.0,3.0,6.0,88.0


In [20]:
print("Number of third level clusters per 70% semantic similarity group:")
pd.DataFrame(df_result.groupby('group_level_3')['third_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of third level clusters per 70% semantic similarity group:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
third_level_topic,7860.0,1.248982,1.307322,1.0,1.0,1.0,1.0,1.0,2.0,49.0


# Name Topics (as a most frequent noun in the cluster)

In [24]:
df = tm.get_topic_names(df_result, 'first_level_topic', 'list_of_nouns')
df['second_level_topic'] = tm.get_topic_names(df_result, 
                                              'second_level_topic', 'list_of_nouns')['second_level_topic']
df['third_level_topic'] = tm.get_topic_names(df_result, 
                                             'third_level_topic', 'list_of_nouns')['third_level_topic']
df[['publication', 
    'section',
    'first_level_topic',
    'second_level_topic',
    'third_level_topic'
   ]].iloc[::1000].head(10).T

Unnamed: 0,99,4905,5474,9361,7326,5279,6320,1413,1416,8170
publication,People,People,People,People,People,People,People,People,People,People
section,music,tv,royals,celebrity,tv,celebrity,crime,politics,music,tv
first_level_topic,0 TIME,1 TIME,2 TIME,3 PHOTO,4 TIME,4 TIME,5 POLICE,7 TIME,7 TIME,9 TIME
second_level_topic,0.0 LAWSUIT,1.2 MOM,2.4 FAMILY,3.4 BABY,4.1 RING,4.9 FILM,5.6 POLICE,7.0 TIME,7.9 TIME,9.1 TIME
third_level_topic,0.0.1 MEN,1.2.8 MONTHS,2.4.3 COUPLE,3.4.7 SHOPPING,4.1.5 ACTRESS,4.9.8 MOVIE,5.6.3 PEOPLE,7.0.3 STUDENTS,7.9.3 TIME,9.1.8 MOM


# NEXT:
1. delete lemmas that present in 90% of topics before selecting topic name
2. increase train data size
3. move notebooks to AWS
4. rerun notebooks on AWS