In [1]:
from sklearn.datasets import fetch_20newsgroups_vectorized, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from lightning.classification import CDClassifier
from scipy.sparse import csr_matrix
import pandas as pd
import scattertext as st
import numpy as np
import spacy
from IPython.display import IFrame
from IPython.core.display import display, HTML
# Inject a CSS rule that stretches the notebook's `.container` element to 98% width.
display(HTML("<style>.container { width:98% !important; }</style>"))
# IPython magic: show matplotlib figures inline in the cell output.
%matplotlib inline  

In [2]:
# Fetch the 20 Newsgroups training split with headers, footers and quoted
# replies stripped from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

# Fit TF-IDF on the raw posts; X is the resulting document-term matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups_train.data)

# Reuse the learned vocabulary so the raw-count vectorizer shares the same
# term -> column mapping as the TF-IDF matrix.
feature_vocabulary = vectorizer.vocabulary_
count_vectorizer = CountVectorizer(vocabulary=feature_vocabulary)

In [3]:
# Build an index store from the vectorizer vocabulary and report how many
# values it holds.
vocabulary_index_store = st.IndexStoreFromDict.build(feature_vocabulary)
vocabulary_index_store.getnumvals()

101631

In [4]:
# Assemble a scattertext Corpus by hand from pre-computed pieces instead of
# letting scattertext parse the documents itself.
raw_term_counts = count_vectorizer.fit_transform(newsgroups_train.data)
term_index_store = st.IndexStoreFromDict.build(feature_vocabulary)
category_index_store = st.IndexStoreFromList.build(newsgroups_train.target_names)

corpus = st.Corpus(
    X=raw_term_counts,                    # raw term counts per document
    mX=csr_matrix((0, 0)),                # empty metadata matrix
    y=newsgroups_train.target,            # integer category id per document
    term_idx_store=term_index_store,
    metadata_idx_store=st.IndexStore(),   # no metadata terms
    category_idx_store=category_index_store,
    raw_texts=np.array(newsgroups_train.data),
)

In [5]:
# Look up the value stored at index 2 of the corpus' term index store.
# NOTE(review): this reaches into private attributes (`_term_idx_store`, `_i2val`).
corpus._term_idx_store._i2val[2]

'0000'

In [6]:
# Multiclass coordinate-descent classifier with an l1/l2 penalty and a
# squared hinge loss, fit on the TF-IDF matrix against the newsgroup labels.
# random_state is fixed so that repeated runs (Restart & Run All) produce the
# same coefficients; the estimator otherwise defaults to random_state=None
# with permute=True, which makes the fit non-deterministic.
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0,
                   tol=1e-3,
                   random_state=0)
clf.fit(X, newsgroups_train.target)

CDClassifier(C=1.0, Cd=1.0, alpha=0.0001, beta=0.5, callback=None,
       debiasing=False, loss='squared_hinge', max_iter=20,
       max_steps='auto', multiclass=True, n_calls=100, n_jobs=1,
       penalty='l1/l2', permute=True, random_state=None,
       selection='cyclic', shrinking=True, sigma=0.01,
       termination='violation_sum', tol=0.001, verbose=0,
       warm_debiasing=False, warm_start=False)

In [7]:
# Attach the first row of the classifier's coefficient matrix to the
# per-category term frequency table as a 'score' column.
# NOTE(review): the assignment is positional, so it assumes the rows of the
# frequency table are in the same order as the classifier's feature columns.
first_row_coefficients = clf.coef_[0]
tdm = corpus.get_term_freq_df()
tdm['score'] = first_row_coefficients


In [8]:
# Drop every term whose score is exactly zero, leaving a corpus restricted to
# terms with a non-zero score.
zero_score_terms = tdm.loc[tdm['score'] == 0].index
nonzero_corpus = corpus.remove_terms(zero_score_terms)


In [9]:
# Count the terms whose total frequency (summed over every column except
# 'score') is greater than 60.
frequency_columns = [column for column in tdm.columns if column != 'score']
total_term_counts = tdm[frequency_columns].sum(axis=1)
sum(total_term_counts > 60)

3679

In [10]:
# Build the "fightin' words" explorer HTML for alt.atheism from the corpus
# restricted to non-zero-score terms.
fightin_words_options = dict(
    category='alt.atheism',
    minimum_term_frequency=60,
    # scores=tdm.loc[nonzero_corpus.get_terms()].score,  # left disabled
    max_docs_per_category=100,
)
html = st.produce_fightin_words_explorer(nonzero_corpus, **fightin_words_options)

In [19]:
from scattertext.Scalers import scale_neg_1_to_1_with_zero_mean_abs_max, scale
from scattertext.termsignificance.LogOddsRatioUninformativeDirichletPrior import LogOddsRatioUninformativeDirichletPrior
produce_scattertext_explorer = st.produce_scattertext_explorer

# --- Settings -------------------------------------------------------------
# The explorer is assembled step by step here so that intermediate values
# (scaled frequencies, scores) remain available for inspection afterwards.
category = 'alt.atheism'
category_name = None
not_category_name = None
DEFAULT_MINIMUM_TERM_FREQUENCY = 3
kwargs = {'minimum_term_frequency': 60}
term_ranker = st.termranking.AbsoluteFrequencyRanker
alpha = 0.01
use_term_significance = True

# --- Display names: fall back to the category itself / "Not <category>" ----
category_name = category if category_name is None else category_name
not_category_name = ("Not " + category_name
                     if not_category_name is None
                     else not_category_name)

# --- X axis: log term frequency, offset by the minimum term frequency ------
scale_offset = kwargs.get('minimum_term_frequency', DEFAULT_MINIMUM_TERM_FREQUENCY)
term_freq_df = term_ranker(corpus).get_ranks()
total_term_counts = term_freq_df.sum(axis=1).values
frequencies_log_scaled = scale(np.log(total_term_counts) - np.log(scale_offset))

# --- Y axis: scores from the log-odds-ratio model, unless already supplied -
if 'scores' not in kwargs:
    in_category_counts = term_freq_df[category + ' freq']
    other_category_columns = [c + ' freq'
                              for c in corpus.get_categories()
                              if c != category]
    out_of_category_counts = term_freq_df[other_category_columns].sum(axis=1)
    zeta_i_j = (LogOddsRatioUninformativeDirichletPrior(alpha)
                .get_zeta_i_j_given_separate_counts(in_category_counts,
                                                    out_of_category_counts))
    kwargs['scores'] = zeta_i_j

scores_scaled_for_charting = scale_neg_1_to_1_with_zero_mean_abs_max(kwargs['scores'])
# kwargs['metadata'] = kwargs.get('metadata', None),
if use_term_significance:
    kwargs['term_significance'] = LogOddsRatioUninformativeDirichletPrior(alpha)

# --- Render ----------------------------------------------------------------
# Remaining options (minimum_term_frequency, scores, term_significance) are
# forwarded through **kwargs.
html = produce_scattertext_explorer(
    corpus,
    category=category,
    category_name=category_name,
    not_category_name=not_category_name,
    x_coords=frequencies_log_scaled,
    y_coords=scores_scaled_for_charting,
    sort_by_dist=False,
    term_ranker=term_ranker,
    p_value_colors=True,
    # x_label=kwargs.get('x_label', 'Log Frequency'),
    # y_label=kwargs.get('y_label', 'Z-Score: Log Odds Ratio w/ Prior'),
    **kwargs
)


In [23]:
# Largest value among the scaled log frequencies, found with Python's built-in max.
max(frequencies_log_scaled)

1.0

In [20]:
# Persist the explorer HTML (UTF-8 encoded) and embed it in the notebook.
# The file is written inside a `with` block so the handle is flushed and
# closed before the IFrame tries to load it.
file_name = "alt.atheism.html"
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=1000)

In [18]:
category = 'alt.atheism'
category_max = 3000
not_category_max = 3000

# Per-category term frequencies, taken fresh from the corpus so the frame
# holds only '<category> freq' columns (the row sum below relies on that).
tdf = corpus.get_term_freq_df()

# Two-column frame: counts inside the category vs. counts in all other
# categories combined (row total minus the category's own count).
catdf = pd.DataFrame({category: tdf[category + ' freq'],
                      'not ' + category: tdf.sum(axis=1) - tdf[category + ' freq']})
# NOTE(review): positional assignment -- assumes the scaled F-scores come back
# in the same term order as the rows of the term frequency frame; confirm.
catdf['score'] = corpus.get_scaled_f_scores(category)

In [None]:
# Standard scattertext explorer for alt.atheism vs. everything else.
# NOTE(review): `not_category_term_frequency` and its list value are kept as
# written; confirm the keyword against the installed scattertext version
# (newer releases may expect `minimum_not_category_term_frequency=<int>`).
html = st.produce_scattertext_explorer(corpus,
                                       category='alt.atheism',
                                       category_name='alt.atheism',
                                       not_category_name='Not alt.atheism',
                                       minimum_term_frequency=20,
                                       not_category_term_frequency=[20],
                                       use_full_doc=True,
                                       width_in_pixels=1000)
# Writes to the same file name as the earlier explorer cell, replacing it.
file_name = "alt.atheism.html"
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=1000)

In [None]:
# Sanity check: one row per training label and one column per vocabulary
# entry. Uses the training targets directly; no `y` variable exists in this
# notebook.
X.shape == (len(newsgroups_train.target), len(vectorizer.vocabulary_))

In [None]:
# Set classifier options.


In [None]:
# Refit on the TF-IDF matrix and labels used for the earlier fit; the
# previously referenced `X_train` / `data_train` are never defined in this
# notebook.
clf.fit(X, newsgroups_train.target)

In [None]:
# Dimensions of the fitted classifier's coefficient array.
clf.coef_.shape

In [None]:
# Number of non-zero entries in the first row of the coefficient array.
first_row_is_nonzero = clf.coef_[0] != 0
first_row_is_nonzero.sum()