In [1]:
#% matplotlib inline

import os
import pickle
import re

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import scipy
import scipy.signal
import seaborn as sb



In [2]:
# Directory that holds the saved corpus, dictionary and pickled models.
_MODELS_DIR = "saved_models/"

# Dataset identifier; the dictionary, corpus and CSV file names are built from it.
key = "Y02E_10_20"

# Base names of the pickled model files (".obj" is appended when loading).
m1 = "DTM_model"
m2 = "DIM_model"

# File names derived from the dataset identifier.
dict_file = "{}.dict".format(key)
corpus_file = "{}.mm".format(key)

In [3]:
def topic_frames(topics,n_tops):
    """Turn a flat list of formatted topic strings into one DataFrame per topic.

    Parameters
    ----------
    topics : sequence of str
        Formatted topic strings such as ``"0.020*cover + 0.021*guide"``.
        The strings belonging to topic ``i`` are expected at positions
        ``i, i + n_tops, i + 2 * n_tops, ...`` of the list.
    n_tops : int
        Number of distinct topics interleaved in ``topics``.

    Returns
    -------
    dict
        Maps ``"topic_<k>"`` (``k`` starting at 1) to a DataFrame with one row
        per string of that topic (in list order) and one column per distinct
        word, sorted alphabetically.  A cell holds the word's coefficient in
        that string, or NaN when the word does not occur in it.
    """
    # One "<coefficient>*<word>" term.  Terms are parsed as pairs so that words
    # containing digits or underscores stay attached to their own coefficient;
    # optional quotes around the word are tolerated.
    term_pattern = re.compile(
        r"""([-+]?\d+\.?\d*(?:[eE][-+]?\d+)?)\s*\*\s*["']?([^\s"'+]+)"""
    )

    topic_dfs = {}
    for i in range(n_tops):
        # every n_tops-th string, starting at offset i, belongs to topic i
        records = []
        for slice_str in topics[i::n_tops]:
            weights = {}
            for coeff, word in term_pattern.findall(slice_str):
                # keep the first coefficient if a word is listed twice
                weights.setdefault(word, float(coeff))
            records.append(weights)

        # all distinct words seen for this topic, alphabetically sorted
        columns = np.unique([word for rec in records for word in rec]).tolist()
        # words missing from a given row are filled with NaN by pandas
        topic_dfs["topic_"+str(i+1)] = pd.DataFrame(records, columns=columns)
    return topic_dfs

def plot_topics(topics):
    """Draw one word cloud figure per formatted topic string.

    Parameters
    ----------
    topics : iterable of str
        Topic strings of the form ``"0.020*cover + 0.021*guide"``; each
        ``" + "``-separated term is split at ``"*"`` into a score and a word.

    Each topic is shown in its own matplotlib figure titled ``"Topic <j>"``,
    where ``j`` is the zero-based position of the string in ``topics``.
    Nothing is returned.
    """
    # WordCloud is not imported in the notebook's import cell, so it is brought
    # into scope here; this needs the third-party `wordcloud` package.
    from wordcloud import WordCloud

    # Iterate over the strings themselves instead of parsing repr(list), which
    # only worked with Python 2's u'...' prefix and could eat letters of words.
    for j, line in enumerate(topics):
        freqs = []
        for term in line.split(" + "):
            score, word = term.split("*", 1)
            freqs.append((word.strip("'\"), "), float(score)))

        wc = WordCloud(max_words=100)
        # NOTE(review): newer wordcloud releases may expect a dict of
        # word -> frequency here rather than a list of pairs -- confirm
        # against the installed version.
        wc.fit_words(freqs)

        plt.figure()
        plt.title("Topic {}".format(j))#classes[j])
        plt.imshow(wc.to_array())
        plt.axis("off")
        plt.show()

In [4]:
# get corpus
corpus = gensim.corpora.MmCorpus(os.path.join(_MODELS_DIR, corpus_file))

# get dictionary
dictionary = gensim.corpora.Dictionary.load(os.path.join(_MODELS_DIR, dict_file))

# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that were produced by a trusted run of this project.
# Pickles are binary data, so the files are opened in 'rb' mode, and the
# `with` blocks close the handles even if unpickling fails.

# get DTM model
with open(os.path.join(_MODELS_DIR, m1 + ".obj"), 'rb') as filehandler:
    DTM = pickle.load(filehandler)

# get DIM model
with open(os.path.join(_MODELS_DIR, m2 + ".obj"), 'rb') as filehandler:
    DIM = pickle.load(filehandler)

## Visualise DTM topics over time

In [5]:
# Fetch the formatted topic strings from the DTM model
# (keyword values: 7 topics, 46 time slices, top 10 words).
topics = DTM.show_topics(
    topics=7,
    times=46,
    topn=10
)

In [6]:
# Build one DataFrame per topic; 7 matches the topic count requested from show_topics.
topic_dfs = topic_frames(topics, 7)
topic_dfs

{'topic_1':     arrange  bearing   body  connect  cover    end  guide    low   main  \
 0       NaN      NaN    NaN      NaN  0.020    NaN  0.021  0.013  0.016   
 1       NaN      NaN    NaN      NaN  0.020    NaN  0.021  0.013  0.016   
 2       NaN      NaN    NaN      NaN  0.020    NaN  0.021  0.013  0.017   
 3       NaN      NaN    NaN      NaN  0.020    NaN  0.021  0.013  0.017   
 4       NaN      NaN    NaN      NaN  0.019    NaN  0.021  0.013  0.017   
 5       NaN      NaN    NaN      NaN  0.019    NaN  0.021  0.013  0.017   
 6       NaN      NaN    NaN      NaN  0.019    NaN  0.021  0.014  0.017   
 7       NaN      NaN    NaN      NaN  0.019    NaN  0.021  0.014  0.016   
 8       NaN      NaN    NaN      NaN  0.019    NaN  0.021  0.014  0.015   
 9       NaN      NaN    NaN      NaN  0.019    NaN  0.021  0.015  0.015   
 10      NaN      NaN    NaN      NaN  0.018    NaN  0.020  0.015  0.016   
 11      NaN      NaN    NaN      NaN  0.018    NaN  0.020  0.015  0.016   
 

In [18]:
# Fill short gaps (at most 4 consecutive rows) by linear interpolation, then
# drop every word column that still contains a NaN.
temp = topic_dfs["topic_6"].interpolate(method='linear', axis=0, limit=4).dropna(axis=1)
# smooth it before plotting (Savitzky-Golay, window 5, polynomial order 3)
# scipy.signal must be imported explicitly; `import scipy` alone may not expose it.
for col in temp.columns.values:
    temp[col] = scipy.signal.savgol_filter(temp[col],5,3)

# NOTE: `times` is produced by the get_time_seq cell further down, so that
# cell has to be run before this one.
# Index the rows by the start timestamp of each period.
temp.index = pd.PeriodIndex(times).to_timestamp()
pl = ["#779ECB","#77DD77","#FF6961","#FDFD96","#B19CD9","#FFB347","#FFD1DC","#C23B22","#AEC6CF","#03C03C"]
sb.set_palette(pl)
temp.plot(figsize=(8,6))
plt.xlabel("Time (years)")
plt.ylabel("Word Weight")
plt.title("Word Weights Through Time for Topic 6")
plt.show()

In [None]:
# Preview the first five rows of the smoothed topic frame.
temp.head(5)

In [41]:
# Every 7th string starting at the first one, i.e. the strings that
# topic_frames assigns to "topic_1".
topic = topics[::7]

In [123]:
# Show the first five rows of the time-indexed word weights.
temp.head(5)

Unnamed: 0,air,chamber,pipe,pressure,pump,tank,valve,water
1969,0.024957,0.019957,0.023957,0.033,0.015957,0.026,0.016,0.095
1970,0.025171,0.020171,0.024171,0.033,0.016171,0.026,0.016,0.095
1971,0.025743,0.020743,0.024743,0.033,0.015743,0.026,0.016,0.095
1972,0.026,0.021171,0.025086,0.033,0.015257,0.026,0.016086,0.095086
1973,0.026171,0.020743,0.024914,0.033,0.014914,0.026,0.015743,0.094743


In [14]:
def get_time_seq(data_file, min_slice_size=None):
    """Compute per-year time-slice sizes from a CSV of patent applications.

    Parameters
    ----------
    data_file : str or file-like
        Passed to ``pandas.read_csv``.  The table must contain the columns
        ``appln_filing_date`` and ``appln_id``.
    min_slice_size : int, optional
        ``None`` (default): every filing year becomes a slice whose size is
        the number of rows filed in that year.  Otherwise only years with at
        least ``min_slice_size`` rows are kept, and each kept year contributes
        exactly ``min_slice_size`` application ids.

    Returns
    -------
    time_seq : list of int
        Slice sizes in chronological order.
    approved_ids : None or list of numpy.ndarray
        ``None`` when ``min_slice_size`` is ``None``; otherwise, per kept year
        in chronological order, the first ``min_slice_size`` ``appln_id``
        values in file order.
    times : numpy.ndarray
        Sorted unique yearly periods found in the file.
        NOTE(review): this covers all years, including years dropped by
        ``min_slice_size`` -- confirm callers do not assume it lines up with
        ``time_seq`` in that mode.
    """
    df = pd.read_csv(data_file)

    # Tag each row with the calendar year of its filing date.
    filing_dates = pd.DatetimeIndex(df["appln_filing_date"])
    try:
        df["Y"] = filing_dates.to_period("Y")
    except ValueError:
        # fall back to the legacy spelling if this pandas build rejects "Y"
        df["Y"] = filing_dates.to_period("A")

    # groupby sorts its keys, so iteration and size() are chronological
    groups = df.groupby("Y")

    approved_ids = None
    if min_slice_size is None:
        # size() keeps every count paired with its own year (sorting the
        # [year, count] table column-wise would reorder the counts on their own)
        time_seq = groups.size().tolist()
    else:
        approved_ids = []
        for _, members in groups:
            if len(members) >= min_slice_size:
                approved_ids.append(members["appln_id"].values[:min_slice_size])

        time_seq = [min_slice_size]*len(approved_ids)
    return time_seq, approved_ids, np.unique(df["Y"])

In [15]:
# CSV for the current dataset key, relative to the notebook's directory.
data_file = '../Data/{}.csv'.format(key)

# No minimum slice size: keep every filing year as its own time slice.
time_seq, approved_ids, times = get_time_seq(data_file, min_slice_size=None)

In [114]:
# Number of (slice size, period) pairs left after skipping the first 14.
# zip is materialised into a list first so that it can be sliced even where
# zip returns an iterator.
len(list(zip(time_seq, times))[14:])

32