# Import Libraries 

In [142]:
import pandas as pd 
import numpy as np
import re, string
import nltk
from nltk.stem.porter import *
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer 
import matplotlib.pyplot as plt 
import collections
import seaborn as sns 

# Load Data

In [22]:
# Read the article-name CSV into a DataFrame.
df_titles = pd.read_csv('Article_Names.csv')
# Select the 'Article Title' column; note this is a pandas Series, not a plain list.
list_titles = df_titles['Article Title']

# Pre-Processing Data

In [23]:
def preprocess(titles):
    """Normalise raw titles into space-joined strings of stemmed words.

    Each title is lowercased and stripped, every character that is neither a
    word character nor whitespace is deleted, the result is split on
    whitespace, English stopwords are dropped, and the remaining words are
    Porter-stemmed.

    Parameters
    ----------
    titles : iterable of str
        Raw titles. Every element must be a string; a non-string entry
        (for example a missing value) raises ``AttributeError``.

    Returns
    -------
    list of str
        One processed string per input title, in the same order. A title made
        up only of stopwords and punctuation yields an empty string.

    Notes
    -----
    Punctuation is removed *before* the stopword filter, so a contraction such
    as "don't" is compared as "dont" and may no longer match the stopword list.
    """
    stemmer = PorterStemmer()
    # The corpus returns a list; a set makes each membership test O(1)
    # instead of a linear scan for every word of every title.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Compile once instead of re-resolving the pattern for every title.
    punctuation = re.compile(r'[^\w\s]')

    processed_titles = []
    for title in titles:
        words = punctuation.sub('', title.lower().strip()).split()
        kept = [stemmer.stem(word) for word in words if word not in stopwords]
        processed_titles.append(' '.join(kept))

    return processed_titles
                
                
        

# Feature Generation

In [51]:
# Clean the raw titles, then embed them as TF-IDF vectors.
processed_titles = preprocess(list_titles)
vectorizer = TfidfVectorizer(use_idf=True)
# fit_transform yields a sparse matrix; .toarray() converts it to a dense array.
features = (
    vectorizer
    .fit_transform(processed_titles)
    .toarray()
)

# K-Means Clustering

In [52]:
def silhouette_score(max_cluster, random_state=None):
    """Print the mean silhouette score of K-Means fits for a range of k.

    A K-Means model is fitted on the module-level ``features`` array for every
    cluster count ``k`` in ``range(2, max_cluster)`` -- i.e. from 2 up to and
    including ``max_cluster - 1`` -- and ``k`` plus its average silhouette
    score are printed on one line.

    Parameters
    ----------
    max_cluster : int
        Exclusive upper bound on the number of clusters to try.
    random_state : int or None, default None
        Forwarded to ``KMeans``. Pass an integer to make the scores
        reproducible between runs; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # This function's name shadows the scikit-learn metric imported at the top
    # of the notebook. Bind the metric under a private alias so the call below
    # reaches scikit-learn instead of re-entering this function (which would
    # raise TypeError because it is given two positional arguments).
    from sklearn.metrics import silhouette_score as _sklearn_silhouette_score

    for n_clusters in range(2, max_cluster):
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = model.fit_predict(features)
        print(n_clusters, _sklearn_silhouette_score(features, labels))


silhouette_score(21)

2 0.001936937752596225
3 0.00272541071708876
4 0.0023771092165205173
5 0.0026318895437390916
6 0.0036208231682994508
7 0.003623235254759373
8 0.0048392728485729085
9 0.004090499345093416
10 0.004313758369843795
11 0.0031837756828562826
12 0.003465474558874619
13 0.00348180490002583
14 0.003966518129211502
15 0.004458336732950941
16 0.004005657961285879
17 0.004405023925238152
18 0.005076127745440575
19 0.004148689996154069


In [53]:
# Fit the final K-Means model using the cluster count chosen above.
model = KMeans(n_clusters=18).fit(features)

# Label every title with its cluster and store the labels on the frame.
labels = model.predict(features)
df_titles['Clusters'] = labels.tolist()

In [149]:
# Number of rows assigned to each cluster label, largest cluster first.
df_titles['Clusters'].value_counts()



0     329
4     100
6      93
2      79
3      71
5      68
14     64
11     64
12     60
8      56
10     53
15     52
17     44
13     41
1      40
16     32
9      28
7      26
Name: Clusters, dtype: int64

In [109]:
# Store the preprocessed titles next to the raw ones. The frame in this
# notebook is `df_titles`; no `df` is defined in the visible cells, so the
# original assignment raised NameError on a fresh kernel.
df_titles['Processed'] = processed_titles

def cluster_common_words(df, df_columns, cluster_column='Clusters', top_n=10):
    """Return the most frequent words of each cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per title, holding the cluster label in ``cluster_column``
        and the text to count in ``df_columns``.
    df_columns : str
        Name of the column whose values are whitespace-separated words.
    cluster_column : str, default 'Clusters'
        Name of the column holding each row's cluster label.
    top_n : int, default 10
        Number of words to keep per cluster.

    Returns
    -------
    dict
        Maps every cluster label present in ``df`` to a list of
        ``(word, count)`` tuples, most frequent first. Labels that do not
        occur in ``df`` get no entry.
    """
    common_words_dict = {}
    # Iterate over the labels actually present rather than a hard-coded
    # range(0, 18), so the function follows whatever cluster count was used.
    # .tolist() converts NumPy scalars to plain Python values for the keys.
    cluster_labels = sorted(df[cluster_column].unique().tolist())
    for label in cluster_labels:
        # Select rows from the frame that was passed in (not a global), using
        # a 1-D mask built from the label column only.
        titles = df.loc[df[cluster_column] == label, df_columns]
        word_counts = collections.Counter(" ".join(titles).split())
        common_words_dict[label] = word_counts.most_common(top_n)

    return common_words_dict
    

In [117]:
# Show the most common words of the 'Processed' column for every cluster.
cluster_common_words(df_titles, 'Processed')


{0: [('model', 21),
  ('base', 17),
  ('human', 17),
  ('respons', 15),
  ('diseas', 14),
  ('character', 14),
  ('chang', 13),
  ('detect', 12),
  ('applic', 12),
  ('correct', 12)],
 1: [('review', 38),
  ('systemat', 35),
  ('metaanalysi', 20),
  ('patient', 8),
  ('associ', 6),
  ('impact', 5),
  ('scope', 5),
  ('effect', 5),
  ('children', 4),
  ('diseas', 4)],
 2: [('covid19', 34),
  ('factor', 17),
  ('patient', 16),
  ('survey', 15),
  ('global', 12),
  ('studi', 11),
  ('among', 9),
  ('pandem', 9),
  ('risk', 7),
  ('social', 7)],
 3: [('studi', 39),
  ('control', 20),
  ('associ', 20),
  ('case', 13),
  ('women', 5),
  ('activ', 5),
  ('data', 5),
  ('patient', 5),
  ('factor', 5),
  ('individu', 4)],
 4: [('studi', 42),
  ('effect', 40),
  ('cohort', 24),
  ('diseas', 18),
  ('chronic', 16),
  ('patient', 14),
  ('prospect', 13),
  ('manag', 12),
  ('kidney', 9),
  ('hepat', 8)],
 5: [('differ', 22),
  ('cancer', 22),
  ('screen', 9),
  ('suscept', 9),
  ('insight', 8),
  