In [9]:
import csv
import pandas as pd

def load_csv_with_error_handling(file_path):
    """Read a UTF-8 CSV file into a DataFrame, skipping rows the parser rejects.

    The first successfully parsed row supplies the column names and every
    later row becomes a data row. Values stay exactly as ``csv.reader``
    yields them, i.e. strings, so an empty field is ``''`` rather than NaN.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A pandas DataFrame built from the parsed rows, or an empty DataFrame
        when the file yields no rows at all.
    """
    rows = []
    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile)
        row_number = 0
        while True:
            # csv.Error is raised while the reader advances to the next row,
            # not while a row is appended, so the try block must wrap next().
            try:
                row = next(reader)
            except StopIteration:
                break
            except csv.Error as e:
                print(f"Error reading row {row_number}: {e}")
            else:
                rows.append(row)
            row_number += 1
    if not rows:
        # No header row to use; return an empty frame instead of failing on rows[0].
        return pd.DataFrame()
    return pd.DataFrame(rows[1:], columns=rows[0])

# Load data with error handling
# The loader builds the frame from csv.reader rows, so every column holds
# strings and empty fields are '' rather than NaN.
data = load_csv_with_error_handling("all_sources_metadata_2020-03-13.csv")

# Preview the first five rows of the loaded frame.
print(data.head())

                                        sha source_x  \
0  c630ebcdf30652f0422c3ec12a00b50241dc9bd9      CZI   
1  53eccda7977a31e3d0f565c884da036b1e85438e      CZI   
2  210a892deb1c61577f6fba58505fd65356ce6636      CZI   
3  e3b40cc8e0e137c416b4a2273a4dca94ae8178cc      CZI   
4  92c2c9839304b4f2bc1276d41b1aa885d8b364fd      CZI   

                                               title  \
0  Angiotensin-converting enzyme 2 (ACE2) as a SA...   
1  Comparative genetic analysis of the novel coro...   
2  Incubation Period and Other Epidemiological Ch...   
3  Characteristics of and Public Health Responses...   
4       Imaging changes in severe COVID-19 pneumonia   

                          doi pmcid pubmed_id   license  \
0  10.1007/s00134-020-05985-9        32125455  cc-by-nc   
1   10.1038/s41421-020-0147-1                     cc-by   
2          10.3390/jcm9020538                     cc-by   
3          10.3390/jcm9020575        32093211     cc-by   
4  10.1007/s00134-020-05976-w  

In [4]:
# Same preview as at the end of the loading cell: first five rows of `data`.
print(data.head())

                                        sha source_x  \
0  c630ebcdf30652f0422c3ec12a00b50241dc9bd9      CZI   
1  53eccda7977a31e3d0f565c884da036b1e85438e      CZI   
2  210a892deb1c61577f6fba58505fd65356ce6636      CZI   
3  e3b40cc8e0e137c416b4a2273a4dca94ae8178cc      CZI   
4  92c2c9839304b4f2bc1276d41b1aa885d8b364fd      CZI   

                                               title  \
0  Angiotensin-converting enzyme 2 (ACE2) as a SA...   
1  Comparative genetic analysis of the novel coro...   
2  Incubation Period and Other Epidemiological Ch...   
3  Characteristics of and Public Health Responses...   
4       Imaging changes in severe COVID-19 pneumonia   

                          doi pmcid pubmed_id   license  \
0  10.1007/s00134-020-05985-9        32125455  cc-by-nc   
1   10.1038/s41421-020-0147-1                     cc-by   
2          10.3390/jcm9020538                     cc-by   
3          10.3390/jcm9020575        32093211     cc-by   
4  10.1007/s00134-020-05976-w  

In [5]:
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, strip_numeric, strip_punctuation, strip_multiple_whitespaces
from gensim.utils import simple_preprocess
import nltk
from nltk.stem import PorterStemmer


In [10]:
# Data Preprocessing
# The frame is built from csv.reader rows, so a missing title arrives as the
# empty string '' rather than NaN and dropna() alone would keep it. Treat NaN
# and blank/whitespace-only titles alike as missing.
title_is_missing = data['title'].isna() | data['title'].astype(str).str.strip().eq('')
data.drop(index=data.index[title_is_missing], inplace=True)
# Keep only the first row for each distinct title.
data.drop_duplicates(subset=['title'], inplace=True)



In [11]:
from gensim.parsing.preprocessing import preprocess_string, strip_multiple_whitespaces, strip_punctuation, strip_numeric, remove_stopwords

# Text Preprocessing using Gensim
def custom_preprocess(text):
    """Turn a raw title string into a list of cleaned tokens.

    The text is handed to gensim's ``preprocess_string`` together with an
    explicit filter chain, applied in this order: lowercasing,
    ``strip_multiple_whitespaces``, ``strip_punctuation``, ``strip_numeric``
    and ``remove_stopwords``.
    """
    def to_lower(raw):
        return raw.lower()

    filter_chain = (
        to_lower,
        strip_multiple_whitespaces,
        strip_punctuation,
        strip_numeric,
        remove_stopwords,
    )
    return preprocess_string(text, list(filter_chain))

# Tokenize every title; each cell of 'processed_title' holds a list of tokens.
data['processed_title'] = data['title'].apply(custom_preprocess)


In [12]:
from nltk.stem import PorterStemmer
import nltk
# NOTE(review): 'punkt' is tokenizer data, but nothing visible in this
# notebook tokenizes with NLTK (the stemmer below is fed ready-made tokens),
# so this download looks unnecessary — confirm before removing.
nltk.download('punkt')

# Stemming
# One shared stemmer instance, used by stem_text below.
stemmer = PorterStemmer()
def stem_text(text):
    """Stem each token in ``text`` and join the stems with single spaces.

    ``text`` is an iterable of tokens (the notebook passes the token lists
    stored in ``processed_title``); the result is one space-separated string.
    """
    stems = map(stemmer.stem, text)
    return ' '.join(stems)

# Build one space-joined string of stems per title from its token list.
data['stemmed_title'] = data['processed_title'].apply(stem_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Display final preprocessed dataset: the raw title next to its token list
# and its space-joined stemmed form (first five rows).
print(data[['title', 'processed_title', 'stemmed_title']].head())


                                               title  \
0  Angiotensin-converting enzyme 2 (ACE2) as a SA...   
1  Comparative genetic analysis of the novel coro...   
2  Incubation Period and Other Epidemiological Ch...   
3  Characteristics of and Public Health Responses...   
4       Imaging changes in severe COVID-19 pneumonia   

                                     processed_title  \
0  [angiotensin, converting, enzyme, ace, sars, c...   
1  [comparative, genetic, analysis, novel, corona...   
2  [incubation, period, epidemiological, characte...   
3  [characteristics, public, health, responses, c...   
4       [imaging, changes, severe, covid, pneumonia]   

                                       stemmed_title  
0  angiotensin convert enzym ace sar cov receptor...  
1  compar genet analysi novel coronaviru ncov sar...  
2  incub period epidemiolog characterist novel co...  
3  characterist public health respons coronaviru ...  
4                   imag chang sever covid pneumoni

In [10]:
# Preview the first five rows of the full frame again.
print(data.head())

                                        sha source_x  \
0  c630ebcdf30652f0422c3ec12a00b50241dc9bd9      CZI   
1  53eccda7977a31e3d0f565c884da036b1e85438e      CZI   
2  210a892deb1c61577f6fba58505fd65356ce6636      CZI   
3  e3b40cc8e0e137c416b4a2273a4dca94ae8178cc      CZI   
4  92c2c9839304b4f2bc1276d41b1aa885d8b364fd      CZI   

                                               title  \
0  Angiotensin-converting enzyme 2 (ACE2) as a SA...   
1  Comparative genetic analysis of the novel coro...   
2  Incubation Period and Other Epidemiological Ch...   
3  Characteristics of and Public Health Responses...   
4       Imaging changes in severe COVID-19 pneumonia   

                          doi pmcid pubmed_id   license  \
0  10.1007/s00134-020-05985-9        32125455  cc-by-nc   
1   10.1038/s41421-020-0147-1                     cc-by   
2          10.3390/jcm9020538                     cc-by   
3          10.3390/jcm9020575        32093211     cc-by   
4  10.1007/s00134-020-05976-w  

In [13]:
##### feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Feature Extraction
# Both vectorizers share the same settings: word-level analysis, terms in more
# than 95% of titles or in fewer than 2 titles are dropped, English stop words removed.
vectorizer_options = dict(analyzer='word', max_df=0.95, min_df=2, stop_words='english')
vec2word_vectorizer = CountVectorizer(**vectorizer_options)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

# Raw term counts and TF-IDF weights, both computed from the stemmed titles.
stemmed_titles = data['stemmed_title']
vec2word_features = vec2word_vectorizer.fit_transform(stemmed_titles)
tfidf_features = tfidf_vectorizer.fit_transform(stemmed_titles)


In [14]:
#### LDA

from sklearn.decomposition import LatentDirichletAllocation

# Topic Modeling - LDA
# One 10-topic estimator is fitted twice, first on the count matrix and then
# on the TF-IDF matrix; after this cell `lda` holds the second (TF-IDF) fit.
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda_vec2word, lda_tfidf = [
    lda.fit_transform(term_matrix)
    for term_matrix in (vec2word_features, tfidf_features)
]


In [15]:
#### DIMENSIONALITY REDUCTION

from sklearn.decomposition import PCA

# A single 2-component PCA is fitted twice, once per topic matrix; after this
# cell `pca` holds the fit on the TF-IDF topic weights.
pca = PCA(n_components=2)
lda_vec2word_pca, lda_tfidf_pca = [
    pca.fit_transform(topic_weights)
    for topic_weights in (lda_vec2word, lda_tfidf)
]


In [14]:
# (rows, columns) of the frame at this point in the notebook.
print(data.shape)

(1852, 16)


In [16]:
def find_optimal_clusters(data, max_k):
    """Fit KMeans for k = 1..max_k and collect the inertia of each fit.

    Args:
        data: Feature matrix handed unchanged to ``KMeans.fit``.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        List of ``inertia_`` values; position ``i`` belongs to ``k = i + 1``.
    """
    def inertia_for(k):
        # Fixed random_state so repeated runs give the same curve.
        model = KMeans(n_clusters=k, random_state=42)
        model.fit(data)
        return model.inertia_

    return [inertia_for(k) for k in range(1, max_k + 1)]


In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np


def _elbow_k(inertias):
    """Return the 1-based k at the elbow of a decreasing inertia curve.

    Inertia generally keeps shrinking as k grows, so taking the minimum just
    selects the largest k tried. Instead, both axes are rescaled to [0, 1] and
    the point lying farthest below the straight line joining the first and
    last points is taken as the elbow.

    The result is never below 2 when at least two k values were tried,
    because the silhouette and Davies-Bouldin scores computed on these labels
    need at least two clusters.
    """
    curve = np.asarray(inertias, dtype=float)
    if curve.size < 3 or curve[0] == curve[-1]:
        # No interior point, or a flat curve: there is no elbow to locate.
        return min(2, int(curve.size))
    position = np.linspace(0.0, 1.0, curve.size)
    scaled = (curve - curve[-1]) / (curve[0] - curve[-1])
    # The chord runs from (0, 1) to (1, 0); this is the vertical gap below it.
    gap_below_chord = 1.0 - position - scaled
    return max(2, int(np.argmax(gap_below_chord)) + 1)


# Convert sparse matrix to dense numpy array
vec2word_features_dense = vec2word_features.toarray()
tfidf_features_dense = tfidf_features.toarray()

# Elbow Method for CountVectorizer
max_k = 10
vec2word_inertias = find_optimal_clusters(vec2word_features_dense, max_k)

# Elbow Method for TfidfVectorizer
tfidf_inertias = find_optimal_clusters(tfidf_features_dense, max_k)

# Plot one inertia-vs-k curve per vectorizer
for vectorizer_name, inertia_curve in (
    ('CountVectorizer', vec2word_inertias),
    ('TfidfVectorizer', tfidf_inertias),
):
    plt.plot(range(1, max_k+1), inertia_curve, marker='o')
    plt.title(f'Elbow Method for {vectorizer_name}')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Inertia')
    plt.show()

# Apply clustering algorithms with the k found at the elbow of each curve
optimal_k_vec2word = _elbow_k(vec2word_inertias)
optimal_k_tfidf = _elbow_k(tfidf_inertias)

kmeans_vec2word = KMeans(n_clusters=optimal_k_vec2word, random_state=42).fit_predict(vec2word_features_dense)
kmeans_tfidf = KMeans(n_clusters=optimal_k_tfidf, random_state=42).fit_predict(tfidf_features_dense)

# Agglomerative Clustering (same k as KMeans on the count features)
agglomerative = AgglomerativeClustering(n_clusters=optimal_k_vec2word).fit_predict(vec2word_features_dense)

# DBSCAN
# NOTE(review): runs with the library's default parameters — confirm they suit these features.
dbscan = DBSCAN().fit_predict(vec2word_features_dense)  # Using DBSCAN as an example, you can choose another algorithm






In [None]:
import numpy as np
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.pyplot as plt

# Silhouette score: KMeans labels on the CountVectorizer matrix
silhouette_kmeans_vec2word = silhouette_score(vec2word_features_dense, kmeans_vec2word)
print("Silhouette Score (KMeans with CountVectorizer):", silhouette_kmeans_vec2word)

# Silhouette score: KMeans labels on the TfidfVectorizer matrix
silhouette_kmeans_tfidf = silhouette_score(tfidf_features_dense, kmeans_tfidf)
print("Silhouette Score (KMeans with TfidfVectorizer):", silhouette_kmeans_tfidf)

# Silhouette score: Agglomerative Clustering labels on the CountVectorizer matrix
silhouette_agglomerative = silhouette_score(vec2word_features_dense, agglomerative)
print("Silhouette Score (Agglomerative Clustering with CountVectorizer):", silhouette_agglomerative)

# DBSCAN is scored only when it produced more than one distinct label.
# NOTE(review): the count of distinct labels includes every value in `dbscan`;
# confirm whether a noise label should count as a cluster here.
if np.unique(dbscan).size <= 1:
    silhouette_dbscan = None
    print("DBSCAN did not result in more than one cluster.")
else:
    silhouette_dbscan = silhouette_score(vec2word_features_dense, dbscan)
    print("Silhouette Score (DBSCAN with CountVectorizer):", silhouette_dbscan)

# Visualize clusters using silhouette plots
def plot_silhouette(X, cluster_labels):
    """Draw a silhouette plot with one horizontal band per distinct label.

    Args:
        X: Feature matrix with one row per sample, passed unchanged to
            ``silhouette_samples``.
        cluster_labels: Label of each sample. Any label values are accepted
            (for example a -1 label); bands are drawn in sorted label order.

    Returns:
        None. The plot is drawn on a newly created figure; call
        ``plt.show()`` afterwards to display it.
    """
    cluster_labels = np.asarray(cluster_labels)
    silhouette_vals = silhouette_samples(X, cluster_labels)
    # Iterate over the labels actually present instead of assuming 0..n-1.
    unique_labels = np.unique(cluster_labels)
    num_clusters = len(unique_labels)
    # Count samples via the labels: len() is not defined for sparse matrices.
    n_samples = cluster_labels.shape[0]
    # plt.get_cmap is used because matplotlib.cm.get_cmap no longer exists in
    # recent matplotlib releases.
    cmap = plt.get_cmap("Spectral")

    fig, ax = plt.subplots()
    ax.set_xlim([-0.1, 1])
    # Leave a 10-row gap below, between and above the bands.
    ax.set_ylim([0, n_samples + (num_clusters + 1) * 10])

    y_lower = 10
    for position, label in enumerate(unique_labels):
        band_vals = np.sort(silhouette_vals[cluster_labels == label])
        band_size = band_vals.shape[0]
        y_upper = y_lower + band_size
        color = cmap(position / num_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, band_vals, facecolor=color, edgecolor=color, alpha=0.7)
        # Write the label at the vertical middle of its band.
        ax.text(-0.05, y_lower + 0.5 * band_size, str(label))
        y_lower = y_upper + 10

    ax.set_title("Silhouette Plot")
    ax.set_xlabel("Silhouette Coefficient Values")
    ax.set_ylabel("Cluster Label")

# Plot silhouette plot for KMeans with CountVectorizer
# (plot_silhouette only draws onto a new figure; it does not display it.)
plot_silhouette(vec2word_features_dense, kmeans_vec2word)


# Show the plot
plt.show()


In [None]:
# davies_bouldin_score is not imported by any earlier cell shown in this
# notebook, so import it here; without it this cell raises NameError.
from sklearn.metrics import davies_bouldin_score

# Convert sparse matrix to dense numpy array
vec2word_features_dense = vec2word_features.toarray()

# Davies-Bouldin Index for the KMeans and Agglomerative labels on the count features
dbi_kmeans_vec2word = davies_bouldin_score(vec2word_features_dense, kmeans_vec2word)
dbi_agglomerative = davies_bouldin_score(vec2word_features_dense, agglomerative)

print("Davies-Bouldin Index (KMeans with CountVectorizer):", dbi_kmeans_vec2word)
print("Davies-Bouldin Index (Agglomerative Clustering with CountVectorizer):", dbi_agglomerative)




In [24]:
import pandas as pd

# Define a function to assign cluster labels to data points and save it to a CSV file
def assign_clusters_and_save(data, cluster_labels, output_file):
    """Write ``data`` plus a ``Cluster_Label`` column to a CSV file.

    The input frame is left untouched; the labels are attached to a copy.

    Args:
        data: DataFrame to export.
        cluster_labels: One label per row of ``data``.
        output_file: Destination path of the CSV (written without the index).
    """
    labelled = data.assign(Cluster_Label=cluster_labels)
    labelled.to_csv(output_file, index=False)
    print("Data with cluster labels saved to:", output_file)

# Usage example
# data: Your original DataFrame
# cluster_labels: Labels assigned by clustering algorithm
# output_file: Path to save the CSV file
# Here: the KMeans labels computed on the CountVectorizer features are saved
# alongside the metadata frame.
assign_clusters_and_save(data, kmeans_vec2word, "data_with_cluster_labels.csv")


Data with cluster labels saved to: data_with_cluster_labels.csv
