In [1]:
from pathlib import Path

path_merged = Path('.') / "merged" / "merged_data"
path_wiley = Path('.') / "merged" / "wiley_scraped_data"
path_scraped = Path('.') / "merged" / "scraped_data"
path_infuklim = Path('.') / "merged" / "infuklim_scraped_data"
path_oxf = Path('.') / "merged" / "oxf_scraped_data"

scraped_paths = [path_infuklim, path_oxf, path_scraped, path_wiley]

path_target = Path(".") / "data_with_abstracts_v1"

### Clean Scraped Data

In [2]:
import pandas as pd

scraped_dfs = []
for path in scraped_paths:
    with open(path, "r") as f:
        scraped_dfs.append(pd.read_json(f))

In [3]:
#Combine

combined_scraped_data = pd.concat(scraped_dfs, ignore_index=True)

In [4]:
#Remove all abstracts that remained unchanged
combined_scraped_data = combined_scraped_data.loc[pd.isna(combined_scraped_data["abstract"])==False]

In [5]:
combined_scraped_data["abstract"]

0       Capsule Clutches of hole-nesting passerines su...
1       1 \nCorridors provide important structural con...
2       1. We suggest that two important hypotheses un...
3       1 \nThe capercaillie Tetrao urogallus and blac...
4       The maned sloth is a poorly known species ende...
                              ...                        
5152    Cutting and removal of oil-impacted marsh plan...
5539    Abstract  Fish sanctuaries are types of aquati...
5550    In regulated rivers, canalization and reduced ...
5555    Abstract  Floodplain borrow pits created durin...
5580    Road escape ramps are structures developed in ...
Name: abstract, Length: 4340, dtype: object

In [6]:
import re
pattern = re.compile("^\s*Abstract\s*")

In [7]:
combined_scraped_data.loc[:, "abstract"] = combined_scraped_data.loc[:, "abstract"].apply(lambda x: pattern.sub("", x))

In [12]:
# Add Tag Column (Scraped)
combined_scraped_data["abstract_origin"] = "scraped"

Get data from repos

In [14]:
with path_merged.open("r") as f:
    merged_repo_data = pd.read_json(f)

In [15]:
merged_repo_data["abstract_origin"] = "repo"

In [24]:
all_data = pd.concat([combined_scraped_data, merged_repo_data]).sort_values(by="index")

In [26]:
cols = set(all_data.columns) - set(["authors"])
all_data.drop_duplicates(subset=cols, inplace=True, ignore_index=True)

In [48]:
cleaned_data = []
cnt=0

def merge_rows(rows):
    rows = [row.to_dict() for row in rows]
    row1 = rows[0]
    row2 = rows[1]
    if not pd.isna(row1['abstract']):
        return row1
    elif not pd.isna(row2['abstract']):
        return row2
    else:
        return row1

In [46]:
grouped = all_data.groupby(by="index")

In [49]:
total = []
for key, lsInd in grouped.groups.items():
    to_merge = [all_data.loc[ind] for ind in lsInd]
    total.append(to_merge[0].to_dict() if len(to_merge)==1 else merge_rows(to_merge))

Clean Data

In [50]:
combined_dedup_data = pd.DataFrame(total) 

In [52]:
sum(pd.isna(combined_dedup_data["abstract"])==False)

17384

In [63]:
combined_dedup_data

Unnamed: 0,title,authors,doi,publication_date,abstract,repo_identifier,language,publisher,journal_name,journal_volume,journal_issue,issn,url,index,abstract_origin
0,The Management of Grass Pastures for Brent Geese,"[J. A.Vickery, W. J.Sutherland, S. J.Lane]",10.2307/2404543,1994-5,An increasing number of brent geese now feed i...,crossref,EN,JSTOR,The Journal of Applied Ecology,31,2,0021-8901,http://dx.doi.org/10.2307/2404543,0,repo
1,Control of Molinia caerulea on upland moors,"[R. H.Marrs, J. D. P.Phillips, P. A.Todd, J.Gh...",10.1111/j.0021-8901.2004.00901.x,2004-4,Summary 1 Molinia encroachment has been viewed...,crossref,EN,Wiley,Journal of Applied Ecology,41,2,0021-8901,http://dx.doi.org/10.1111/j.0021-8901.2004.009...,2,repo
2,Long-distance relocation of nestboxes reduces ...,"[AlbertoSorace, FabrizioPetrassi, CarloConsiglio]",10.1080/00063650409461343,2004-7,Capsule Clutches of hole-nesting passerines su...,crossref,EN,Informa UK Limited,,51,2,0006-3657,http://dx.doi.org/10.1080/00063650409461343,3,scraped
3,Reducing the density of breeding gulls influen...,"[S. K.Finney, M. P.Harris, L. F.Keller, D. A.E...",10.1046/j.1365-2664.2003.00810.x,2003-6,Summary 1 By acting as both competitors and pr...,crossref,EN,Wiley,,40,3,0021-8901,http://dx.doi.org/10.1046/j.1365-2664.2003.008...,4,repo
4,Elements that promote highway crossing structu...,"[WayneMcDonald, Colleen CassadySt Clair]",10.1111/j.1365-2664.2004.00877.x,2004-2-12,1 \nCorridors provide important structural con...,crossref,EN,Wiley,,41,1,0021-8901,http://dx.doi.org/10.1111/j.1365-2664.2004.008...,5,scraped
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18188,Historical peat loss explains limited short-te...,"[Jennifer Williamson, David Norris, Chris D. E...",10.1016/j.jenvman.2016.12.018,2017-3,Abstract This study assessed the short-term im...,openaire,EN,Elsevier BV,Journal of Environmental Management,188,,0301-4797,http://dx.doi.org/10.1016/j.jenvman.2016.12.018,20893,repo
18189,Wood chip soil amendments in restored wetlands...,"[Evan C.Wolf, EliškaRejmánková, David J.Cooper]",10.1111/rec.12942,2019-4-2,Adding chipped wood to soil ameliorates compac...,crossref,EN,Wiley,,27,5,1061-2971,http://dx.doi.org/10.1111/rec.12942,20894,scraped
18190,Is the cutting of oil contaminated marshes an ...,"[André L.T.O.Wolinski, Paulo C.Lana, LeonardoS...",10.1016/j.marpolbul.2011.03.024,2011-6,Cutting and removal of oil-impacted marsh plan...,crossref,EN,Elsevier BV,,62,6,0025-326X,http://dx.doi.org/10.1016/j.marpolbul.2011.03.024,20895,scraped
18191,Restoration of inland brackish vegetation by l...,"[MinekeWolters, Saskiade Vries, Wim A.Ozinga, ...",10.1111/avsc.12323,2017-8-20,Question Does large-scale transfer of coastal ...,crossref,EN,Wiley,Applied Vegetation Science,20,4,1402-2001,http://dx.doi.org/10.1111/avsc.12323,20896,repo


### Remove leading Abstract / Summary things

In [56]:
import re
pattern = re.compile("^\s*Abstract\s*|^\s*Summary\s*")

In [64]:
combined_data_w_abstracts = combined_dedup_data.loc[pd.isna(combined_dedup_data["abstract"])==False]

In [65]:
combined_data_w_abstracts.reset_index(inplace=True, drop=True)

Delete all Abstract <250 characters long and clean text

In [77]:
from ftfy import fix_text

In [81]:
cnt = 0
for ind, row in combined_data_w_abstracts.iterrows():
    abst = row.to_dict()["abstract"]
    if len(abst)<250:
        combined_data_w_abstracts.loc[ind, "abstract"] = None
        combined_data_w_abstracts.loc[ind, "abstract_origin"] = "invalid_repo"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [84]:
invalid_repo = combined_data_w_abstracts.loc[combined_data_w_abstracts['abstract_origin']=="invalid_repo"]

USED WEBSCRAPING (slow_query for result) -- Find in abstractWebscraping.py

In [102]:
# result = slow_query(invalid_repo, 1.3)

In [110]:
for ind, row in result.iterrows():
    combined_data_w_abstracts.loc[ind, :] = row

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


Combined Data with abstracts!
Clean all Abstracts!

In [118]:
from ftfy import  fix_text
for ind, row in combined_data_w_abstracts.iterrows():
    combined_data_w_abstracts.loc[ind, "title"] = fix_text(row["title"])
    combined_data_w_abstracts.loc[ind, "abstract"] = fix_text(row["abstract"]) if row['abstract'] is not None else None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [119]:
for ind, row in combined_dedup_data.loc[pd.isna(combined_dedup_data["abstract"])==True].iterrows():
    combined_dedup_data.loc[ind, "title"] = fix_text(combined_dedup_data.loc[ind, "title"])

Recombine them with the no abstracts from above!

In [120]:
total_cleaned_data = pd.concat([combined_dedup_data.loc[pd.isna(combined_dedup_data["abstract"])==True], combined_data_w_abstracts])

In [136]:
total_cleaned_data = total_cleaned_data.sort_values(by="index")

In [139]:
total_cleaned_data["topic_classification"] = None 

Combine them with their classifiers

In [126]:
cleaned_references = pd.read_csv("cleaned_references.csv")

In [145]:
cleaned_references.loc[9]

index                                                         25
item_type                                                journal
pub_year                                                  2000.0
authors                             Bokdam, J.; Gleichman, J. M.
title          Effects of grazing by free-ranging cattle on v...
pub_title                             Journal of Applied Ecology
isbn                                                         NaN
issn                                                         NaN
doi                                                          NaN
url                                                          NaN
pages                                                    415-431
issue                                                        NaN
volume                                                        37
data_source                                           CE_website
topics                                                 Shrubland
Name: 9, dtype: object

In [141]:
for ind, row in total_cleaned_data.iterrows():
    total_cleaned_data.loc[ind, "topic_classification"] = cleaned_references.loc[row["index"],"topics"]

In [152]:
total_cleaned_data.reset_index(drop=True, inplace=True)

In [153]:
total_cleaned_data.to_json("positive_samples_v1_160221")

In [154]:
total_cleaned_data.loc[pd.isna(total_cleaned_data["abstract"])==False].to_json("positive_samples_only_with_abstract_v1_160221")