In [None]:
import pandas as pd
# Load the articles, discard the "ID" and "TITLE" columns, and keep only the
# first 500 rows so that the rest of the notebook runs quickly while testing.
dataframe = (
    pd.read_csv("Research Articles.csv")
    .drop(columns=["ID", "TITLE"])
    .head(500)
)

In [None]:
### Text Cleaning
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re 

# NOTE: this rebinds the imported `stopwords` name to a plain list of English
# stop words; the import at the top of this cell restores it on every re-run.
stopwords = stopwords.words("english")
# Set copy of the list so the per-word membership test below is O(1)
stopword_set = set(stopwords)
abstract = dataframe["ABSTRACT"]
corpus = []

# Both objects are constructed without any per-document arguments, so a single
# instance of each is shared by every document instead of rebuilding them in
# each loop iteration.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Iterate over the column's values rather than looking rows up by label, so the
# loop does not depend on the dataframe having a 0..n-1 index.
for document in abstract:
  # Remove non-letters
  document = re.sub( "[^a-zA-Z]", " ", document )
  # Lowercase and split into tokens
  tokens = document.lower().split()
  # Drop stop words, then lemmatize and stem each remaining token exactly once
  normalized = (stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens if word not in stopword_set)
  # Keep only normalized terms longer than one character
  document = " ".join(term for term in normalized if len(term) > 1)
  corpus.append(document)

print(f"Length of Corpus: {len(corpus)}")

In [None]:
### Create Count Vectorizer
# Learn the vocabulary from the cleaned corpus and count term occurrences per
# document; the counts are densified into a regular array for the later cells.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
sparse_counts = vectorizer.fit_transform(corpus)
wordset = vectorizer.get_feature_names_out()
vectorized_features = sparse_counts.toarray()

In [None]:
### Create co-occurrence matrix
# `features` holds one row per document and one column per vocabulary term.
# Multiplying its transpose by itself gives a term-by-term matrix whose (i, j)
# entry is the sum over documents of count(term i) * count(term j).
features = pd.DataFrame(data=vectorized_features, columns=wordset)
transposed_features = features.transpose()
co_occurrence_matrix = transposed_features.to_numpy() @ features.to_numpy()
co_occurrence_dataframe = pd.DataFrame(
    co_occurrence_matrix,
    index=wordset,
    columns=wordset,
)

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit k-means for 1 to 10 clusters on the co-occurrence rows and
# record the within-cluster sum of squares (inertia) of each fit.
wcss = []
for k in range(1,11):
  # A fixed random_state makes the centroid initialisation, and therefore the
  # recorded inertia values, reproducible from one run to the next.
  kmeans = KMeans( init = 'k-means++', n_clusters = k, random_state = 42 )
  print(f"Training {k}")
  kmeans.fit(co_occurrence_dataframe.values)
  wcss.append(kmeans.inertia_)

In [None]:
import matplotlib.pyplot as plt
# Draw the elbow curve: WCSS on the y-axis against the number of clusters
# (1 to 10) on the x-axis.
fig, ax = plt.subplots()
ax.plot( range(1,11), wcss )
ax.set_title("Elbow Method")
ax.set_xlabel("Number of clusters")
ax.set_ylabel("WCSS")
plt.show()

In [None]:
# Final clustering: assign each row of the co-occurrence matrix (one row per
# vocabulary term) to one of 5 clusters.
# A fixed random_state makes the resulting labels reproducible across runs.
kmeans = KMeans( init = 'k-means++', n_clusters = 5, random_state = 42 )
# NOTE: `features` is rebound here from the document-term DataFrame to the
# co-occurrence array.
features = co_occurrence_dataframe.values
class_label = kmeans.fit_predict(features)