### Project 2: Performing Clustering

Imports:

In [2]:
import random
import pandas as pd
from sklearn import feature_extraction
from sklearn.cluster import KMeans
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

Cluster Function:

In [3]:
def cluster(stopwords, data_df, cluster_prefix = "Main"):

    #Now make a new vectorizer with those stopwords excluded; this version will use TF-IDF feature weighting
    features = feature_extraction.text.TfidfVectorizer(input='content', 
                                                    encoding='utf-8', 
                                                    decode_error='ignore', 
                                                    lowercase=True,
                                                    stop_words = stopwords,
                                                    tokenizer = None,
                                                    ngram_range=(1, 1), 
                                                    analyzer='word', 
                                                    max_features=10000,   #Larger vocabulary for clustering
                                                    )

    #Sklearn first fits then transforms
    features.fit(data_df.loc[:,"text"].values)

    #Now extract the content features (with phrases, without stopwords)
    x = features.transform(data_df.loc[:,"text"].values)
    print(x)
    print(x.shape)

    #Cluster documents by content
    cluster = KMeans(n_clusters = 10,       #The number of topics we'll get
                        init = "k-means++", 
                        n_init = "auto", 
                        max_iter = 30000, 
                        algorithm = "lloyd",
                        )

    #Perform clustering
    cluster.fit(x)

    #Add topic labels to dataframe
    data_df.loc[:,"Topic"] = [cluster_prefix+"_"+str(x) for x in cluster.labels_]

    #Sort by topic
    data_df.sort_values("Topic", inplace = True)
    print(data_df)
    print(data_df.value_counts("Topic"))

    #Get the biggest topic for further splitting
    most_frequent = data_df.value_counts("Topic").index[0]
    most_frequent_count = data_df.value_counts("Topic").iloc[0]
    print("Most frequent: ", most_frequent)

    #Separate the main topic from other topics
    main_topic = data_df[data_df.loc[:,"Topic"] == most_frequent]
    other_topics = data_df[data_df.loc[:,"Topic"] != most_frequent]

    #Send back the two dataframes
    return main_topic, other_topics, most_frequent

Load Corpus:

In [4]:
df = pd.read_parquet('Sampled_Data.parquet.gzip')

print(df.shape)
df.head()

(125736, 4)


Unnamed: 0,id,title,text,categories
29181,23968555,Abagrotis mirabilis,Abagrotis mirabilis is a moth of the family No...,"[Moths of North America, Moths described in 1879]"
185692,21329784,All Saints Church at Monie,All Saints Church at Monie is a historic Episc...,"[Episcopal church buildings in Maryland, Churc..."
414111,204077,August Potthast,"August Potthast (13 August 1824, Höxter, Provi...","[1824 births, 1898 deaths, People from Höxter,..."
400100,14852169,Athletics at the 2000 Summer Olympics – Men's ...,The men's 800 metres event at the 2000 Summer ...,"[Athletics at the 2000 Summer Olympics, 800 me..."
241233,56485290,Anatoly Glushenkov,"Anatoly Yegorovich Glushenkov (, 20 November 1...","[1942 births, 2018 deaths, Governors of Smolen..."


Find Multi-Word Expressions:

In [5]:
phrase_model = Phrases([doc.split() for doc in df.loc[:,"text"].values], 
                        min_count = 2, 
                        threshold = 0.7, 
                        connector_words = ENGLISH_CONNECTOR_WORDS, scoring = "npmi"
                        )

print(phrase_model.export_phrases().keys())
print("ABOVE: Learned phrases")

#Replace phrases in the df
df.loc[:,"Text"] = [" ".join(phrase_model[sentence.split()]) for sentence in df.loc[:,"text"]]

ABOVE: Learned phrases


Find Stopwords By what is Most Common

In [6]:
#First find the most frequent words
features = feature_extraction.text.CountVectorizer(input='content', 
                                                encoding='utf-8', 
                                                decode_error='ignore', 
                                                lowercase=True, 
                                                tokenizer = None,
                                                ngram_range=(1, 1), 
                                                analyzer='word', 
                                                max_features=500,   #Choose number of future stopwords
                                                )

#Sklearn first fits then transforms
features.fit(df.loc[:,"text"].values)
#The most frequent words can be found in the dictionary of vocabulary items
stopwords = list(features.vocabulary_.keys())
print(stopwords)
print("ABOVE: Frequent words to exclude")

['is', 'of', 'the', 'family', 'first', 'described', 'by', 'in', 'it', 'found', 'western', 'north', 'america', 'from', 'british', 'south', 'to', 'california', 'about', 'on', 'and', 'species', 'references', 'october', '11', '2008', 'category', 'all', 'church', 'at', 'located', 'county', 'single', 'story', 'style', 'building', 'five', 'one', 'built', 'well', 'small', 'also', 'with', '20th', 'century', 'was', 'national', '1990', 'external', 'links', 'including', 'august', '13', 'february', 'german', 'born', 'he', 'known', 'through', 'his', 'european', 'history', 'work', 'form', 'an', 'their', 'between', 'new', 'births', 'deaths', 'people', 'male', 'non', 'men', 'event', '2000', 'summer', 'as', 'part', 'held', 'australia', '23', 'september', '25', '27', 'number', 'had', 'been', 'set', 'since', 'won', 'germany', 'championship', 'for', 'each', 'world', 'record', 'race', 'three', 'were', 'second', 'made', 'not', 'final', 'down', 'back', 'moved', 'just', 'under', '20', 'into', 'around', 'next',

Loop for Clustering

In [7]:
#Create a loop to continue clustering until the largest category is not too big
main_topic = df    #Initialize main topic
cluster_prefix = "Topic"     #Start with root topics
holder = []
starting_length = len(df)
counter = 0

while True:

    #Run clustering
    counter += 1
    main_topic, other_topics, most_frequent = cluster(stopwords, main_topic, cluster_prefix)
    cluster_prefix = str(most_frequent)

    #Check stopping conditions, no topic over 5% of documents
    if len(main_topic)/len(df) < 0.05:
        holder.append(other_topics)
        holder.append(main_topic)
        break

    #Keep going
    else:
        holder.append(other_topics)
        print("Continuing after round " + str(counter), "Current: ", len(main_topic), "Total: ", starting_length)

  (0, 9858)	0.26837816386729496
  (0, 6419)	0.2044983951164744
  (0, 6173)	0.43758493663175974
  (0, 6170)	0.22094299350468796
  (0, 6092)	0.21779107653840163
  (0, 5380)	0.2681522268624662
  (0, 4028)	0.23144842643784802
  (0, 3844)	0.23602544992229482
  (0, 2287)	0.19618970797194318
  (0, 1252)	0.2751694922810105
  (0, 1096)	0.25116972293925116
  (0, 513)	0.15950690833584752
  (0, 218)	0.46314672942923013
  (1, 9362)	0.06750225282773657
  (1, 8959)	0.05255196822963678
  (1, 8481)	0.34342960887918383
  (1, 8387)	0.06353988630445567
  (1, 8012)	0.15011622427708873
  (1, 7981)	0.05883296922271839
  (1, 7927)	0.05981472504596646
  (1, 7842)	0.053751742596714495
  (1, 7620)	0.182917155623065
  (1, 7286)	0.06085929429533192
  (1, 7265)	0.051553779429216885
  (1, 7168)	0.0737175982837945
  :	:
  (125735, 3259)	0.06281177175850497
  (125735, 3086)	0.07105461087420764
  (125735, 2989)	0.09980068843470488
  (125735, 2497)	0.0657100870626423
  (125735, 2495)	0.08073501543370289
  (125735, 2494)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df.sort_values("Topic", inplace = True)


              id                                       title  \
3674    66838226                                    HD 76270   
50350   21641446                             Uremic pruritus   
181969  33562947                                  Shamrock V   
456955   4385129                     Superior gluteal artery   
25876    6387142  Unfair Commercial Practices Directive 2005   
...          ...                                         ...   
186823  43677838                              Roland Paskoff   
23516   47070682                             Vasco M. Tanner   
123298  34897828                            Richard S. Morse   
164056  58138266                  Robert Ssentongo (surgeon)   
130537  31476736                               Joachim Maier   

                                                     text  \
3674    HD 76270, also known as HR 3544, is a solitary...   
50350   Uremic pruritus is caused by chronic kidney fa...   
181969  Shamrock V was the first British yacht t

Merging and Saving Data

In [15]:
# Merge
df = pd.concat(holder)
df.sort_values("Topic", inplace = True)
# Reorder columns
df = df.loc[:,["Topic", "id", "title", "text", "categories"]]

# Get unique topics
unique_topics = df['Topic'].unique()

# Save separate dataframes based on topic
for topic in unique_topics:
    topic_df = df[df['Topic'] == topic]
    topic_df.to_csv(f"{topic}_Data.csv", index=False)

# Save the overall dataframe and topic counts
df.to_csv("Wikipedia_Clustered.csv", index=False)
df.value_counts("Topic").to_csv("Topic_Counts.csv")

Calculate Cosine Similarities for the topics

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


# Calculate cosine similarity for each topic
cosine_similarities = []
vectorizer = TfidfVectorizer()
sum = 0
count = 0

for topic in unique_topics:
    topic_texts = df[df['Topic'] == topic]['text'].tolist()
    tfidf_matrix = vectorizer.fit_transform(topic_texts)
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    mean_similarity = similarity_matrix.mean()
    cosine_similarities.append((topic, mean_similarity))
    sum += mean_similarity
    count += 1

cosine_similarities.append(("Average", sum / count))

sampled_df = df.sample(frac=0.05, random_state=42)

# Extract texts from the sampled DataFrame
topic_texts = sampled_df['text'].tolist()

# Calculate cosine similarity for the sampled texts
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(topic_texts)
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
mean_similarity = similarity_matrix.mean()

# Append the result to the list
cosine_similarities.append(("Without Clustering (Sampled)", mean_similarity))

# Create a DataFrame
similarity_df = pd.DataFrame(cosine_similarities, columns=['Topic', 'Cosine_Similarity_Score'])

# Save the DataFrame to a CSV file
similarity_df.to_csv("cosine_similarity_scores.csv", index=False)