In [3]:
import pandas as pd
import numpy as np
from collections import Counter

HEART = "♥"
zot_df = pd.read_csv('./data/zotraw.csv')

# Removing columns
zot_df = zot_df.dropna(axis=1, how='all')
zot_df = zot_df.drop(columns=["Key","ISBN","ISSN","Link Attachments","Automatic Tags","Series Editor","Edition","Running Time","File Attachments","Short Title","Journal Abbreviation","Series Number","Call Number","Extra","Conference Name","Num Pages","Series","Type","Rights","Place","Programming Language","Editor","Number","Pages","Issue","Access Date"])

# Tags into list
zot_df["Manual Tags"] = zot_df["Manual Tags"].fillna("").str.split(";")

# Making hearts column
def count_hearts(x):
    if isinstance(x, list):
        heart_count = sum(item.count(HEART) for item in x)
        return heart_count if heart_count > 0 else np.nan
    return np.nan

zot_df["Hearts"] = zot_df["Manual Tags"].apply(count_hearts)

# Remove hearts tag
def remove_hearts(x):
    if isinstance(x, list):
        return [item for item in x if not(HEART in item)]
    return x

zot_df["Manual Tags"] = zot_df["Manual Tags"].apply(remove_hearts)
zot_df["Manual Tags"] = zot_df["Manual Tags"].apply(lambda x: ";".join(x))

# Flatten the list of tags
all_tags = [tag for sublist in zot_df["Manual Tags"].str.split(";") for tag in sublist if tag != ""]

# Count the occurrences of each tag
tag_counts = Counter(all_tags)

total_tags = sum(tag_counts.values())

filtered_tag_counts = {tag: count for tag, count in tag_counts.items() if count >= 8}

# Sort the tags by frequency
sorted_tag_counts = dict(sorted(filtered_tag_counts.items(), key=lambda item: item[1], reverse=True))

# Create a new column for the most common tags
def get_common_tags(tags):
    if isinstance(tags, str):
        tags_list = tags.split(";")
        common_tags = [tag for tag in tags_list if tag in sorted_tag_counts]
        return ";".join(common_tags)
    return ""

zot_df["Common Tags"] = zot_df["Manual Tags"].apply(get_common_tags)

# Export to new CSV
zot_df.to_csv("./data/zot_clean.csv", index=False)

In [4]:
zot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924 entries, 0 to 923
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item Type          924 non-null    object 
 1   Publication Year   882 non-null    float64
 2   Author             822 non-null    object 
 3   Title              924 non-null    object 
 4   Publication Title  343 non-null    object 
 5   DOI                304 non-null    object 
 6   Url                836 non-null    object 
 7   Abstract Note      844 non-null    object 
 8   Date               882 non-null    object 
 9   Date Added         924 non-null    object 
 10  Date Modified      924 non-null    object 
 11  Volume             246 non-null    float64
 12  Publisher          408 non-null    object 
 13  Language           723 non-null    object 
 14  Library Catalog    840 non-null    object 
 15  Manual Tags        924 non-null    object 
 16  Hearts             403 non

In [5]:
zot_df["Manual Tags"] 

0                      active learning; machine learning
1                                       machine learning
2                                       machine learning
3      monte carlo; linear systems; random linear alg...
4      monte carlo; linear systems; random linear alg...
                             ...                        
919                               reinforcement learning
920                                                  DMD
921                                    persons; DMD; ROM
922                                              caching
923                                              caching
Name: Manual Tags, Length: 924, dtype: object

In [6]:
zot_df.head()

Unnamed: 0,Item Type,Publication Year,Author,Title,Publication Title,DOI,Url,Abstract Note,Date,Date Added,Date Modified,Volume,Publisher,Language,Library Catalog,Manual Tags,Hearts,Common Tags
0,report,2009.0,"Settles, Burr",Active Learning Literature Survey,,,https://minds.wisconsin.edu/handle/1793/60660,The key idea behind active learning is that a ...,2009,2022-09-17 13:55:35,2024-07-28 20:21:19,,University of Wisconsin-Madison Department of ...,en,minds.wisconsin.edu,active learning; machine learning,1.0,machine learning
1,preprint,2020.0,"Baier, Lucas; Kellner, Vincent; Kühl, Niklas; ...",Switching Scheme: A Novel Approach for Handlin...,,,http://arxiv.org/abs/2011.02738,Machine learning models nowadays play a crucia...,2020-11-05,2022-09-17 15:26:45,2024-07-28 20:21:29,,arXiv,en,arXiv.org,machine learning,1.0,machine learning
2,preprint,2022.0,"Mayaki, Mansour Zoubeirou A.; Riveill, Michel",Autoregressive based Drift Detection Method,,,http://arxiv.org/abs/2203.04769,"In the classic machine learning framework, mod...",2022-03-09,2022-09-17 15:33:30,2024-07-28 20:21:32,,arXiv,en,arXiv.org,machine learning,1.0,machine learning
3,journalArticle,2009.0,"Sabelfeld, K.; Mozartova, N.",Sparsified Randomization Algorithms for large ...,Monte Carlo Methods and Applications,10.1515/MCMA.2009.015,https://www.degruyter.com/document/doi/10.1515...,Sparsiﬁed Randomization Monte Carlo (SRMC) alg...,2009-01,2022-09-17 17:35:35,2024-07-28 20:30:28,15.0,,en,DOI.org (Crossref),monte carlo; linear systems; random linear alg...,3.0,monte carlo; linear systems; random linear alg...
4,journalArticle,2017.0,"Benzi, Michele; Evans, Thomas M.; Hamilton, St...",Analysis of Monte Carlo accelerated iterative ...,Numerical Linear Algebra with Applications,10.1002/nla.2088,https://onlinelibrary.wiley.com/doi/10.1002/nl...,We consider hybrid deterministic-stochastic it...,2017-05,2022-09-17 17:35:37,2024-07-28 20:24:15,24.0,,en,DOI.org (Crossref),monte carlo; linear systems; random linear alg...,1.0,monte carlo; linear systems; random linear alg...
