In [4]:
import pandas as pd

In [5]:
# Load only the three columns the pipeline uses from the filtered news CSV.
usecols = ['news_id','title', 'text']
news = pd.read_csv("/content/filter_data.csv", usecols = usecols)

In [6]:
# Preview the first rows of the loaded frame.
news.head()

Unnamed: 0,title,news_id,text
0,What happened to Nex Benedict?,0,What happened to Nex Benedict?. Benedict was a...
1,Supreme Court again refuses to intervene in dr...,1,Supreme Court again refuses to intervene in dr...
2,RBG's family condemns the selection of recipie...,2,RBG's family condemns the selection of recipie...
3,EEOC Data Reveals 75% Of High-Earners Are Men,3,EEOC Data Reveals 75% Of High-Earners Are Men....
4,Gender equality? Denmark to introduce mandator...,4,Gender equality? Denmark to introduce mandator...


In [7]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
import string
from tqdm import tqdm

In [8]:
# Punctuation is kept as one string; the tokenizer tests membership against it.
punctuations = string.punctuation

# Corpus-specific words to ignore in addition to spaCy's English stop words.
custom_stop_words = ["lgbt", 'lgbtq', 'woman', 'queer', 'people', 'gay', 'article', 'com', 'read']

# Kept as a list (not a set) because the same object is later handed to
# CountVectorizer(stop_words=...).
stopwords = list(STOP_WORDS)
stopwords += [word for word in custom_stop_words if word not in stopwords]

In [9]:
# Load spaCy's small English pipeline; spacy_tokenizer calls it on raw text.
parser = en_core_web_sm.load()

In [10]:
def spacy_tokenizer(sentence):
    """Lemmatize ``sentence`` and return the kept tokens joined by spaces.

    The input is coerced with ``str()`` and run through the module-level
    ``parser``. Each token becomes its lower-cased, stripped lemma, except
    tokens whose lemma is the "-PRON-" placeholder, which keep their
    lower-cased surface form. Tokens found in the module-level ``stopwords``
    list or in ``punctuations`` are dropped.

    Note: ``punctuations`` is a string, so the ``in`` test is a substring
    check; it therefore also drops empty strings.
    """
    kept = []
    for token in parser(str(sentence)):
        if token.lemma_ == "-PRON-":
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()

        if normalized in stopwords or normalized in punctuations:
            continue
        kept.append(normalized)

    return " ".join(kept)

In [11]:
# Register tqdm's progress_apply on pandas objects, then tokenize every
# article's text into the new 'processed_text' column.
tqdm.pandas()
news["processed_text"] = news["text"].progress_apply(spacy_tokenizer)

100%|██████████| 1530/1530 [00:26<00:00, 57.77it/s]


In [12]:
# Drop rows whose processed text contains the literal substring "removed"
# (presumably placeholder articles -- TODO confirm this is not over-filtering)
# and rows whose text was missing, which str() in the tokenizer turned into 'nan'.
_keep_rows = (
    ~news['processed_text'].str.contains("removed", regex=False)
    & (news['processed_text'] != 'nan')
)
# .copy() makes the result an independent frame, so the later
# `news['cluster'] = ...` assignment cannot raise SettingWithCopyWarning.
news = news[_keep_rows].copy()

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
def vectorize(text, maxx_features):
    """Fit a TF-IDF vectorizer on ``text`` and return the transformed matrix.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize.
    maxx_features : int
        Passed to ``TfidfVectorizer(max_features=...)`` as the vocabulary cap.

    Returns
    -------
    The matrix produced by ``TfidfVectorizer.fit_transform``. The fitted
    vectorizer itself is discarded.
    """
    return TfidfVectorizer(max_features=maxx_features).fit_transform(text)

In [15]:
# Raw array of processed documents, and the set of distinct
# whitespace-separated tokens across all of them.
text = news['processed_text'].values
wordset = {token for document in text for token in document.split()}

In [16]:
# Cap the TF-IDF vocabulary at the number of distinct whitespace-separated
# tokens in the corpus, then vectorize all processed documents.
max_features = len(wordset)
X = vectorize(text, max_features)

In [17]:
# Dimensions of the TF-IDF matrix returned by vectorize().
X.shape

(1530, 7378)

In [None]:
#from sklearn.decomposition import PCA

In [None]:
#pca = PCA(n_components=0.95, random_state=42)
#X_reduced= pca.fit_transform(X.toarray())
#X_reduced.shape

In [18]:
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Cluster the TF-IDF vectors with KMeans and store each article's label.
k = 10 # number of clusters; currently fixed at 10
kmeans = KMeans(n_clusters=k, random_state=42, n_init = 'auto')
y_pred = kmeans.fit_predict(X)
# One cluster label per row of `news`, in the same row order as X.
news['cluster'] = y_pred

In [20]:
# Preview the frame with the processed text and cluster label added.
news.head()

Unnamed: 0,title,news_id,text,processed_text,cluster
0,What happened to Nex Benedict?,0,What happened to Nex Benedict?. Benedict was a...,happen nex benedict benedict attack oklahoma h...,9
1,Supreme Court again refuses to intervene in dr...,1,Supreme Court again refuses to intervene in dr...,supreme court refuse intervene drag controvers...,3
2,RBG's family condemns the selection of recipie...,2,RBG's family condemns the selection of recipie...,rbg family condemn selection recipient award h...,1
3,EEOC Data Reveals 75% Of High-Earners Are Men,3,EEOC Data Reveals 75% Of High-Earners Are Men....,eeoc data reveal 75 high earner men datum rele...,2
4,Gender equality? Denmark to introduce mandator...,4,Gender equality? Denmark to introduce mandator...,gender equality denmark introduce mandatory mi...,2


In [21]:
# One independent CountVectorizer per KMeans cluster (k is defined with the
# KMeans model), so every cluster gets its own vocabulary.
# The token pattern is a raw string: in a normal string literal '\-' is an
# invalid escape sequence. The resulting regex is unchanged -- tokens of at
# least three letters/hyphens.
vectorizers = [
    CountVectorizer(min_df=2, max_df=0.9,
                    stop_words=stopwords, lowercase=True,
                    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    for _ in range(k)
]

In [22]:
# Fit each cluster's vectorizer on that cluster's documents.
# vectorized_data[c] is the count matrix of cluster c, or None when no
# vocabulary could be built for it.
vectorized_data = []
for current_cluster, cvec in enumerate(vectorizers):
    cluster_docs = news.loc[news['cluster'] == current_cluster, 'processed_text']
    try:
        vectorized_data.append(cvec.fit_transform(cluster_docs))
    except ValueError as err:
        # CountVectorizer raises ValueError when the vocabulary ends up empty
        # or when min_df/max_df are inconsistent for a tiny cluster. Report it
        # instead of silently hiding it, and keep the list aligned with
        # cluster numbers.
        print(f"Cluster {current_cluster}: vectorization skipped ({err})")
        vectorized_data.append(None)

In [23]:
# One (unfitted) LDA model per KMeans cluster; k is defined with the KMeans
# model, so the number of models always matches the number of clusters.
num_topics_per_cluster = 2
lda_models = [
    LatentDirichletAllocation(n_components=num_topics_per_cluster, max_iter=10,
                              learning_method='online', verbose=False, random_state=42)
    for _ in range(k)
]

In [24]:
# Fit each cluster's LDA model on its count matrix and collect the
# document-topic distributions.
# NOTE: clusters without a matrix are skipped, so this list is NOT indexed by
# cluster number when any cluster failed to vectorize.
clusters_lda_data = []
for current_cluster, lda in enumerate(lda_models):
    # Identity test: `!= None` would dispatch to the sparse matrix's
    # element-wise comparison operator instead of checking for None.
    if vectorized_data[current_cluster] is not None:
        clusters_lda_data.append(lda.fit_transform(vectorized_data[current_cluster]))

In [25]:
# Selecting keywords for each topic
def selected_topics(model, vectorizer, top_n=3):
    """Return the distinct top words of all topics, strongest first.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` yields one weight array per topic; each
        array must support ``argsort()`` and integer indexing.
    vectorizer : object with ``get_feature_names_out()``
        Maps column indices of the weight arrays to words.
    top_n : int, default 3
        Number of highest-weighted words taken from each topic.

    Returns
    -------
    list
        Words ordered by descending weight. A word that appears in several
        topics is kept once, with the weight from the first topic it was
        seen in.
    """
    # The vocabulary is the same for every topic, so fetch it once instead of
    # once per selected index.
    feature_names = vectorizer.get_feature_names_out()
    seen_words = set()
    keywords = []

    for topic in model.components_:
        # Indices of the top_n largest weights, largest first.
        for i in topic.argsort()[:-top_n - 1:-1]:
            word = feature_names[i]
            if word not in seen_words:
                seen_words.add(word)
                keywords.append((word, topic[i]))

    # Stable ascending sort followed by reverse (rather than reverse=True)
    # keeps the original ordering of equally weighted words.
    keywords.sort(key=lambda pair: pair[1])
    keywords.reverse()
    return [word for word, _ in keywords]

In [26]:
# Keyword list per cluster, indexed by cluster number; clusters that could not
# be vectorized get an empty list.
all_keywords = []
for current_vectorizer, lda in enumerate(lda_models):
    # Identity test: `!= None` would dispatch to the sparse matrix's
    # element-wise comparison operator instead of checking for None.
    if vectorized_data[current_vectorizer] is not None:
        all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))
    else:
        all_keywords.append([])

In [27]:
# Empty frame with the intended columns for representative documents.
# NOTE(review): no other visible cell reads or fills this frame -- confirm it is still needed.
cluster_represent_doc = pd.DataFrame(columns = ['cluster_no', 'news_id', 'news_title'])

In [28]:
def get_represent_docs(cluster_no, num_top_documents=2):
    """Return the news ids of the most representative articles of a cluster.

    For every LDA topic of the cluster, the ``num_top_documents`` articles
    with the highest weight for that topic are selected.

    Parameters
    ----------
    cluster_no : int
        Cluster number; indexes ``lda_models`` and ``vectorized_data``.
    num_top_documents : int, default 2
        Number of top documents to retrieve for each topic.

    Returns
    -------
    str
        Comma-separated ``news_id`` values, grouped topic by topic. The same
        id can appear more than once if it ranks high for several topics.
        An empty string is returned when the cluster has no count matrix
        (its vectorization failed).
    """
    cluster_matrix = vectorized_data[cluster_no]
    if cluster_matrix is None:
        # Without this guard the call below would fail on None, even though
        # the keyword extraction tolerates such clusters.
        return ''

    # Rows of news_df are in the same order as the rows of cluster_matrix,
    # because the matrix was built from the same cluster selection.
    news_df = news[news.cluster == cluster_no]
    lda_model = lda_models[cluster_no]
    # NOTE(review): this fits the model again on every call; a plain
    # transform() on the already fitted model would presumably suffice -- confirm.
    document_topic_distributions = lda_model.fit_transform(cluster_matrix)

    representative_ids = []
    for topic_idx in range(lda_model.n_components):
        ranked_rows = sorted(range(len(document_topic_distributions)),
                             key=lambda i: document_topic_distributions[i][topic_idx],
                             reverse=True)
        representative_ids.extend(
            news_df.iloc[row]['news_id'] for row in ranked_rows[:num_top_documents]
        )

    return ','.join(str(doc_id) for doc_id in representative_ids)

In [29]:
# Example: representative news ids for cluster 0.
get_represent_docs(0)

'1335,1325,219,840'

In [30]:
# Summarise every cluster as one row: its number, its keywords joined with
# ', ', and its representative news ids.
# Rows are collected first and the frame is built once, instead of growing a
# DataFrame one row at a time.
cluster_keyword_rows = []
for cluster, keywords in enumerate(all_keywords):
    cluster_keyword_rows.append({
        'cluster': cluster,
        'keywords': ', '.join(keywords),
        'repre_news': get_represent_docs(cluster),
    })
# Explicit columns keep the expected schema even when there are no clusters.
cluster_keywords = pd.DataFrame(cluster_keyword_rows,
                                columns=['cluster', 'keywords', 'repre_news'])

cluster_keywords.to_csv("/cluster_keywords.csv")

In [31]:
# Attach each cluster's keywords and representative ids to its articles,
# make news_id an integer column, and order the rows by news_id.
news_cluster = (
    pd.merge(news, cluster_keywords, left_on='cluster', right_on='cluster')
    .astype({'news_id': 'int'})
    .sort_values(by='news_id')
)

news_cluster.to_csv("/news_cluster.csv")

In [32]:
# Dump the keyword strings, one cluster per line, to a text file.
# max_colwidth is lifted only while rendering, so long keyword strings are not
# truncated and the global display option is left untouched by this cell.
with pd.option_context('display.max_colwidth', None):
    dfAsString = cluster_keywords.keywords.to_string(header=False, index=False)

# Write mode instead of append: re-running the cell used to add a second copy
# of the dump, glued to the previous one without a separating newline.
with open('/cluster_keywords.txt', 'w') as f:
    f.write(dfAsString)

In [33]:
# Show every column and the full content of every cell in the tables below.
pd.set_option('display.max_columns', None, 'display.max_colwidth', None)

In [34]:
# Per-cluster summary: keywords and representative news ids.
cluster_keywords

Unnamed: 0,cluster,keywords,repre_news
0,0,"trump, abortion, donald, support, right, association",13351325219840
1,1,"world, black, big, diversity, manager, learning",127212471237528
2,2,"gender, business, gap, surgery, find",120813573691198
3,3,"transgender, court, athlete, sport, law, judge",8471257796267
4,4,"new, family, crime, want, market",115510704148
5,5,"easter, sunday, biden",843875709872
6,6,"bill, ban, state",386148578855
7,7,"dei, bridge, generative, collapse, company",101685110361351
8,8,"biden, president, joe, election, new",761115711441194
9,9,"year, women, right, transgender, post, anti",106811402341195


In [35]:
# Preview the merged article-level table (sorted by news_id).
news_cluster.head()

Unnamed: 0,title,news_id,text,processed_text,cluster,keywords,repre_news
0,What happened to Nex Benedict?,0,"What happened to Nex Benedict?. Benedict was attacked at their Oklahoma high school, and found dead at home the next day. The medical examiner's office says the nonbinary 16-year-old died by suicide. Their family has questions.",happen nex benedict benedict attack oklahoma high school find dead home day medical examiner office nonbinary 16 year old die suicide family question,9,"year, women, right, transgender, post, anti",106811402341195
711,Supreme Court again refuses to intervene in drag show controversy,1,Supreme Court again refuses to intervene in drag show controversy. It was the second time the court has refused to step into a drag controversy this term.,supreme court refuse intervene drag controversy second time court refuse step drag controversy term,3,"transgender, court, athlete, sport, law, judge",8471257796267
796,RBG's family condemns the selection of recipients of an award named in her honor,2,RBG's family condemns the selection of recipients of an award named in her honor. The Justice Ruth Bader Ginsburg Leadership Award was created to honor women who have created positive change in society. RBG's family says this year's change in selection criteria betrays her legacy.,rbg family condemn selection recipient award honor justice ruth bader ginsburg leadership award create honor create positive change society rbg family year change selection criterion betray legacy,1,"world, black, big, diversity, manager, learning",127212471237528
932,EEOC Data Reveals 75% Of High-Earners Are Men,3,"EEOC Data Reveals 75% Of High-Earners Are Men. Data just released by the EEOC exposes a clear gender pay gap, with men outearning women, especially in the highest-paid jobs. The gap is widest for women of color.",eeoc data reveal 75 high earner men datum release eeoc expose clear gender pay gap man outearne especially highest pay job gap widest color,2,"gender, business, gap, surgery, find",120813573691198
933,Gender equality? Denmark to introduce mandatory military service FOR WOMEN,4,"Gender equality? Denmark to introduce mandatory military service FOR WOMEN. Denmark may become the newest Scandinavian country to begin conscripting women into its military, broadening the country’s recruitment program to strengthen national security amid the ongoing conflict in Ukraine. The Danish government’s plan to recruit women …",gender equality denmark introduce mandatory military service women denmark new scandinavian country begin conscript military broaden country recruitment program strengthen national security amid ongoing conflict ukraine danish government plan recruit …,2,"gender, business, gap, surgery, find",120813573691198


In [36]:
# Number of articles per cluster, largest cluster first.
news_cluster.value_counts('cluster')

cluster
9    711
1    136
4    130
2    108
7    105
3     85
6     85
8     75
0     57
5     38
Name: count, dtype: int64