In [1]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN # clustering algorithms
from sklearn.decomposition import PCA # dimensionality reduction




df_abstracts = pd.read_csv("../../data/RELIGION_abstracts.csv").drop(columns="Unnamed: 0")
df_abstracts.info()
df_abstracts.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701 entries, 0 to 700
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     701 non-null    object
 1   abstract  701 non-null    object
 2   link      701 non-null    object
 3   volume    701 non-null    object
dtypes: object(4)
memory usage: 22.0+ KB


Unnamed: 0,title,abstract,link,volume
count,701,701,701,701
unique,701,701,701,40
top,Norwegian Muslims denouncing terrorism: beyond...,"In contemporary European societies, Muslims ar...",https://www.tandfonline.com/doi/full/10.1080/0...,https://www.tandfonline.com/loi/rrel20?treeId=...
freq,1,1,1,41


In [2]:
import spacy
import re
nlp = spacy.load("en_core_web_sm")

def lemmatizeAbstracts(x):
        doc = nlp(x)
        new_text = []
        for token in doc:
            new_text.append(token.lemma_)
        text_string = " ".join(new_text)
        # getting rid of non-word characters
        text_string = re.sub(r"[^\w\s]+", "", text_string)
        text_string = re.sub(r"\s{2,}", " ", text_string)
        return text_string

df_abstracts["abstract_lemma"] = df_abstracts["abstract"].apply(lemmatizeAbstracts)
df_abstracts.to_csv("../../data/RELIGION_abstracts_lemmatized.csv")

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english")
df_abstracts_tfidf = tfidf.fit_transform(df_abstracts["abstract_lemma"])



In [4]:
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=250, strip_accents="unicode", min_df=10, max_df=200)
tfidf_religion_array = tfidf.fit_transform(df_abstracts["abstract_lemma"])
df_abstracts_tfidf = pd.DataFrame(tfidf_religion_array.toarray(), index=df_abstracts.index, columns=tfidf.get_feature_names())
df_abstracts_tfidf.describe()



Unnamed: 0,academic,account,activity,address,african,agency,aim,allow,american,analyse,...,use,value,various,view,way,western,woman,work,world,year
count,701.0,701.0,701.0,701.0,701.0,701.0,701.0,701.0,701.0,701.0,...,701.0,701.0,701.0,701.0,701.0,701.0,701.0,701.0,701.0,701.0
mean,0.017407,0.017756,0.010633,0.011059,0.011356,0.009061,0.010451,0.009127,0.01521,0.011713,...,0.033074,0.014268,0.013633,0.02131,0.027744,0.021257,0.018406,0.028048,0.028877,0.014563
std,0.063839,0.067667,0.051894,0.047114,0.065659,0.053946,0.04508,0.042445,0.068153,0.051143,...,0.07419,0.061468,0.052642,0.064568,0.062593,0.078245,0.095377,0.074917,0.073751,0.052971
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.493789,0.709637,0.509307,0.360085,0.737092,0.658383,0.359469,0.409185,0.617276,0.457299,...,0.609089,0.828761,0.534004,0.51541,0.364839,0.72476,0.941362,0.604694,0.492094,0.365748


In [5]:
pca = PCA(n_components=10, whiten=False, random_state=42)
abstracts_pca = pca.fit_transform(df_abstracts_tfidf)
df_abstracts_pca = pd.DataFrame(data=abstracts_pca)

In [6]:
kmeans = KMeans(n_clusters=100, random_state=42)
abstracts_labels = kmeans.fit_predict(df_abstracts_pca)
df_abstracts_labeled = df_abstracts.copy()
df_abstracts_labeled["cluster"] = abstracts_labels


In [7]:
df_abstracts_labeled[df_abstracts_labeled["cluster"] == 75][["title", "cluster"]]


Unnamed: 0,title,cluster
28,The space between us: considering online media...,75
39,Towards increasing diversity in the study of r...,75
61,"Qur’anic terminology, translation, and the Isl...",75
88,From nation-state to market: The transformatio...,75
243,"More than belief, but not more than belief and...",75
279,The role of evolutionary psychology within an ...,75
389,A case of misrepresentation: James L. Cox and ...,75
436,Epidemiology and the study of religion,75
682,The theological enemies of religious studies,75


In [8]:
df_abstracts_labeled[df_abstracts_labeled["cluster"] == 15][["title", "cluster"]]


Unnamed: 0,title,cluster
8,Engineering self and civil society: the promis...,15
139,Contemporary fantasy fiction and representatio...,15
152,Socialist religion and the emergence of occult...,15
474,Orisha Worship Communities: A Reconsideration ...,15
609,Cultural continuity and cultural hegemony: Ita...,15


In [9]:
df_abstracts_labeled[df_abstracts_labeled["cluster"] == 84][["title", "cluster"]]


Unnamed: 0,title,cluster
58,The promise of the universal: non-Buddhists’ a...,84
246,Where angels fear to tread: neurophenomenology...,84
275,On the market: consumption and material cultur...,84
624,Contemporary Theravāda and Zen Buddhist attitu...,84


In [10]:
findOptimalEps(2, df_abstracts_tfidf)


NameError: name 'findOptimalEps' is not defined

In [None]:
dbscan = DBSCAN(eps=0.2, metric="euclidean")
dbscan_labels = dbscan.fit_predict(df_abstracts_pca)
df_abstracts_dbscan = df_abstracts.copy()
df_abstracts_dbscan["cluster"] = dbscan_labels
df_abstracts_dbscan["cluster"].unique()


array([ 0,  1, -1,  2,  3], dtype=int64)

In [None]:
df_abstracts_dbscan[df_abstracts_dbscan["cluster"] == 1][["title", "cluster"]]


Unnamed: 0,title,cluster
14,Imagining Buddhist modernism: the shared relig...,1
20,Why Durkheim really thought that Buddhism was ...,1
158,Textbook Buddhism: introductory books on the B...,1
439,Recent trends in Sri Lankan Buddhism,1
471,William James and Buddhism: American Pragmatis...,1
559,Buddhist Environmental Ethics and Detraditiona...,1
620,Buddhadharma and contemporary ethics,1
631,How environmentalist is Buddhism?,1
638,Protestant Buddhism?,1
690,Burial ‘ad sanctos’ and the physical presence ...,1


In [None]:
df_abstracts_dbscan[df_abstracts_dbscan["cluster"] == 2][["title", "cluster"]]


Unnamed: 0,title,cluster
288,Wither or whither: the study of religion at th...,2
289,Contextualization of Religious Studies and of ...,2
584,The academic study of Buddhism in the United S...,2
681,Postulations for safeguarding preconceptions: ...,2
