In [None]:
import functools
import operator
import string

import joblib
import textblob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline

In [None]:
# Use seaborn's "darkgrid" style for the plots in this notebook.
sns.set(style="darkgrid")

In [None]:
# Load the question dataset; later cells read its 'question_text' column.
X = pd.read_csv('quora_challenge.csv')

In [None]:
# Summary statistics of the question length, measured in characters.
X['question_text'].str.len().describe()

In [None]:
# Store the character count of every question as its own column.
X['question_len'] = X['question_text'].str.len()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
ax0, ax1 = axes
fig.suptitle('Question Length Distribution', fontsize=16)

# The same violin is drawn on both panels; only the right-hand one is log-scaled.
for panel in (ax0, ax1):
    sns.violinplot(X['question_len'], inner='quartile', orient='h', ax=panel)

ax0.set_xlabel('Character Length')

ax1.set_xscale('log')
ax1.set_xlabel('Character Length (Log Scale)');

In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Clean a text column: lowercase, drop non-printable characters, blank out punctuation, lemmatize.

    Parameters
    ----------
    target_col : str
        Name of the DataFrame column holding the raw text.
    result_col : str
        Name of the column `transform` writes the cleaned text into.
    pos : str or iterable of str
        Part-of-speech codes to keep. Codes are derived from the first letter
        of each textblob tag: J -> 'a', N -> 'n', V -> 'v', R -> 'r'. A string
        is lower-cased and split into single characters.
    stop_words : iterable of str, optional
        Words discarded both before and after lemmatization. ``None`` means
        no stop-word filtering.
    """

    # Precomputed lookup sets for the per-character membership tests.
    _PRINTABLE = frozenset(string.printable)
    _PUNCTUATION = frozenset(string.punctuation)
    # First letter of a textblob POS tag -> code handed to Word.lemmatize.
    _TAG_TO_POS = dict(J='a', N='n', V='v', R='r')

    def __init__(self, *, target_col='question_text', result_col='cleaned_text', pos='anvr', stop_words=None):
        self.target_col = str(target_col)
        self.result_col = str(result_col)
        self.pos = tuple(pos.lower()) if isinstance(pos, str) else tuple(pos)
        # Always a frozenset, so the attribute has a single type whatever was passed in.
        self.stop_words = frozenset() if stop_words is None else frozenset(stop_words)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Clean ``X[target_col]`` and return the cleaned strings as an array.

        Side effect: the cleaned text is also stored in ``X[result_col]``,
        i.e. the DataFrame that was passed in gains/overwrites that column.
        """
        # Missing values become empty strings; otherwise a float NaN would reach
        # `asciitize`, which iterates over its argument and would raise TypeError.
        X[self.result_col] = (X[self.target_col].fillna('')
                                                .str.lower()
                                                .map(self.asciitize)
                                                .map(self.depunctuate)
                                                .map(self.lemmatize))
        return X[self.result_col].values

    @staticmethod
    def asciitize(text):
        """Return `text` with every character outside ``string.printable`` removed."""
        printable = TextPreprocessor._PRINTABLE
        return ''.join(char for char in text if char in printable)

    @staticmethod
    def depunctuate(text):
        """Return `text` with every ``string.punctuation`` character replaced by a space."""
        punctuation = TextPreprocessor._PUNCTUATION
        return ''.join(' ' if char in punctuation else char for char in text)

    def lemmatize(self, text):
        """Lemmatize the words of `text` whose part of speech is in ``self.pos``.

        Stop words are removed both before lemmatization (on the surface form)
        and after it (on the lemma). Returns ``''`` when textblob yields no
        tagged tokens and ``' '`` when every token was filtered out.
        """
        blob = textblob.TextBlob(text)

        try:
            words, tags = zip(*blob.pos_tags)
        except ValueError:
            # No (word, tag) pairs: there is nothing to unpack into two names.
            return ''

        # Tags whose first letter is not in the mapping become None and are
        # rejected by the `in self.pos` filter below.
        pos_codes = (self._TAG_TO_POS.get(tag[0]) for tag in tags)
        lemmas = (word.lemmatize(code) for word, code in zip(words, pos_codes)
                  if code in self.pos
                  if word not in self.stop_words)
        result = ' '.join(lem for lem in lemmas if lem not in self.stop_words)
        return result if result else ' '

In [None]:
class TopicLabeller(BaseEstimator, TransformerMixin):
    """Collapse each row of a 2-D score array to (index of largest entry, largest entry)."""

    def fit(self, X, y=None):
        """Stateless step: nothing to fit, returns ``self``."""
        return self

    def transform(self, X):
        """Return an array with one row per input row: column 0 is the argmax, column 1 the max."""
        best_weight = np.max(X, axis=1)
        best_topic = np.argmax(X, axis=1)
        # Two 1-D arrays placed side by side as columns.
        return np.column_stack((best_topic, best_weight))

In [None]:
# Tokens to drop in addition to scikit-learn's built-in English stop-word list.
EXTRA_STOP_WORDS = frozenset("does doesn doesnt don dont im ive make quora really shouldnt youll ve weve wouldnt".split())
# Words that must NOT be treated as stop words, whichever list they come from.
NOT_STOP_WORDS = frozenset("cry system".split())
# The parentheses matter: `-` binds tighter than `|`, so the unparenthesised
# form only subtracted NOT_STOP_WORDS from EXTRA_STOP_WORDS and left those
# words in the final set whenever ENGLISH_STOP_WORDS contained them.
CUSTOM_STOP_WORDS = (ENGLISH_STOP_WORDS | EXTRA_STOP_WORDS) - NOT_STOP_WORDS

In [None]:
# Keyword arguments for the TF-IDF step. Lower-casing is switched off here
# because TextPreprocessor.transform already lower-cases the text.
vector_kwds = {
    'ngram_range': (1, 2),
    'stop_words': None,
    'lowercase': False,
    'max_df': 0.9,
    'max_features': 25_000,
}

# Keyword arguments for the NMF step (fixed seed).
decomp_kwds = {
    'n_components': 50,
    'random_state': 0,
}

# NOTE(review): CUSTOM_STOP_WORDS is not handed to any step below
# (TextPreprocessor() keeps its default stop_words=None and the vectorizer
# gets stop_words=None) — confirm that running without stop words is intended.
pipeline_steps = [
    ('textprep', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(**vector_kwds)),
    ('decomposer', NMF(**decomp_kwds)),
    ('labeller', TopicLabeller()),
]
topic_model_pipe = Pipeline(steps=pipeline_steps, verbose=True)

In [None]:
# Fit every step and get one (topic, weight) row per question.
X_new = topic_model_pipe.fit_transform(X)

# File-name prefix: lower-cased step class names, then the two key hyper-parameters.
step_class_names = [type(step).__name__ for step in topic_model_pipe.named_steps.values()]
n_topics = topic_model_pipe["decomposer"].n_components
max_feats = topic_model_pipe["vectorizer"].max_features
file_prefix = '_'.join(step_class_names).lower() + f'_topics{n_topics}' + f'_mxfeat{max_feats}'
print(file_prefix)

# joblib.dump(topic_model_pipe, f'{file_prefix}__pipeline.joblib')
# joblib.dump(X_new, f'{file_prefix}__X_new.joblib');

In [None]:
# Split the pipeline output into its two columns and attach them to the frame.
topic_col, weight_col = X_new[:, 0], X_new[:, 1]
X['topic'] = topic_col
X['weight'] = weight_col
X

In [None]:
def extract_top_words(pipeline, n_top=8):
    """Build a table of the highest-weighted vocabulary terms for every topic.

    Parameters
    ----------
    pipeline : mapping-like
        Must support ``pipeline['vectorizer']`` (an object exposing
        ``get_feature_names_out()`` or the older ``get_feature_names()``) and
        ``pipeline['decomposer']`` (an object with a 2-D ``components_`` array
        of shape (n_topics, n_features)).
    n_top : int, default 8
        Number of terms to list per topic.

    Returns
    -------
    pandas.DataFrame
        One column per topic (``'Topic 0'``, ``'Topic 1'``, ...) and one row
        per rank, strongest term first. The index is named ``'Top Words'``.
    """
    vectorizer = pipeline['vectorizer']
    # Newer scikit-learn releases expose get_feature_names_out(); older ones
    # only have get_feature_names(). Support both.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    feature_names = np.asarray(feature_names)

    components = pipeline['decomposer'].components_
    # argsort is ascending, so reverse each row and keep the first n_top
    # indices. (Slicing with -n_top: would return every column for n_top=0.)
    word_idxs = components.argsort(axis=1)[:, ::-1][:, :n_top]
    # Fancy indexing maps the whole index matrix to words in one step.
    words = feature_names[word_idxs]

    top_words_df = pd.DataFrame(words.T, columns=[f'Topic {i}' for i in range(len(words))])
    top_words_df.index.name = 'Top Words'

    return top_words_df

# Build the per-topic top-word table from the fitted pipeline and display it.
top_words_df = extract_top_words(topic_model_pipe)
top_words_df

# joblib.dump(top_words_df, f'{file_prefix}__top_words.joblib')
# top_words_df.to_csv(f'{file_prefix}__top_words.csv')

In [None]:
# Get topic count distribution
topics = X['topic']
topic_ids, topic_counts = np.unique(topics, return_counts=True)

plt.figure(figsize=(15, 5))
# x/y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the two-positional form raises TypeError there.
sns.barplot(x=topic_ids, y=topic_counts, palette='GnBu_d')
plt.title('Topic Distribution')
plt.xticks(ticks=range(len(topic_ids)))
plt.xlabel('Topic')
plt.ylabel('Frequency');

# plt.savefig(f'{file_prefix}__topic_dist.png');

In [None]:
# Upper end of the threshold grid: the largest weight, rounded up to 2 decimals.
stop = np.ceil(X['weight'].max() * 100) / 100
xs = np.linspace(0, stop, 1000)

# Fraction of questions whose weight is strictly above each threshold.
# One sort plus a binary search per threshold replaces a full scan of the
# column for every one of the 1000 thresholds; the counts are identical.
weights = X['weight'].values
sorted_weights = np.sort(weights[~np.isnan(weights)])  # NaN never compares as "> x"
n_above = len(sorted_weights) - np.searchsorted(sorted_weights, xs, side='right')
ys = n_above / len(X)

fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
fig.suptitle('Ratio of Question Weights Above Threshold', fontsize=16)

ax0.plot(xs, ys)
ax0.set_xlabel('Weight Threshold')
ax0.set_ylabel('Ratio')

ax1.plot(xs, ys)
ax1.set_xscale('log')
ax1.set_xlabel('Weight Threshold (Log Scale)')
ax1.set_ylabel('Ratio')

# plt.savefig(f'{file_prefix}__thresh_ratio.png');

In [None]:
def representative_sentences(n_sents=3, data=None):
    """Print the `n_sents` highest-weight questions of every topic.

    Parameters
    ----------
    n_sents : int, default 3
        Number of questions shown per topic; topics with fewer rows show all
        of them, and 0 shows none.
    data : pandas.DataFrame, optional
        Frame with 'topic', 'weight' and 'question_text' columns. Defaults to
        the notebook-level ``X``.

    Output is printed; within a topic the questions appear in ascending
    weight order, each preceded by its weight rounded to 4 decimals.
    """
    frame = X if data is None else data
    for group, df in frame.groupby('topic'):
        print('Topic', group)

        # A single sort keeps every sentence paired with its own weight.
        # tail(0) is empty, whereas a [-0:] slice would select every row.
        top = df.sort_values('weight').tail(n_sents)
        repr_sents = top['question_text'].values
        repr_wghts = top['weight'].values.round(4)

        for sent, wght in zip(repr_sents, repr_wghts):
            print('\t', wght, sent)

        print('-' * 100)
        
# Print the highest-weight questions of every topic (default: 3 per topic).
representative_sentences()

In [None]:
def random_sentences(n_sents=3, random_state=None, data=None):
    """Print up to `n_sents` randomly drawn questions for every topic.

    Parameters
    ----------
    n_sents : int, default 3
        Number of questions sampled per topic. Topics with fewer rows than
        this contribute all of their rows instead of raising.
    random_state : optional
        Forwarded unchanged to ``DataFrame.sample`` for every topic.
    data : pandas.DataFrame, optional
        Frame with 'topic', 'weight' and 'question_text' columns. Defaults to
        the notebook-level ``X``.

    Output is printed; each question is preceded by its weight rounded to
    4 decimals.
    """
    frame = X if data is None else data
    for group, df in frame.groupby('topic'):
        print('Topic', group)

        # sample() without replacement raises ValueError when asked for more
        # rows than the group holds, so clamp the request to the group size.
        sample = df.sample(min(n_sents, len(df)), random_state=random_state)
        rand_sents = sample['question_text'].values
        rand_wghts = sample['weight'].values.round(4)

        for sent, wght in zip(rand_sents, rand_wghts):
            print('\t', wght, sent)

        print('-' * 100)
        
# Print a random sample of questions per topic; the fixed seed makes the draw repeatable.
random_sentences(random_state=0)