In [58]:
import pandas as pd
import re
from unidecode import unidecode

In [3]:
# load the partially cleaned editor/affiliation table produced upstream
data = pd.read_csv(filepath_or_buffer="combined_partially_cleaned_data.csv")

In [4]:
# show every row whose affiliation still contains a "<...>" token
# (escaped unicode such as <U+0092>, or leftover html such as <br>)
data.loc[data["affiliation"].str.contains(pat="<.*>")]

Unnamed: 0.1,Unnamed: 0,publisher,journal,editor,affiliation,country,continent
0,0,American Psychological Association,American Journal of Orthopsychiatry,"Jill D. McLeigh, PhD","Children<U+0092>s Health, Dallas, United States",United States,Americas
91,93,American Psychological Association,American Psychologist<U+00AE>,"Janet S. Hyde, PhD","University of Wisconsin<U+0096>Madison, United...",United States,Americas
105,107,American Psychological Association,American Psychologist<U+00AE>,"Nancy L. McElwain, PhD",University of Illinois at Urbana<U+0096>Champa...,United States,Americas
180,187,American Psychological Association,Behavioral Neuroscience<U+00AE>,"Denise J. Cai, PhD<em>","<br>\r\r\r\nNeuroscience Department, Icahn Sch...",United States,Americas
186,193,American Psychological Association,Behavioral Neuroscience<U+00AE>,"<em>Roshan Cools, PhD","Roshan Cools, PhD\r\r\r\n<br>\r\r\r\nDonders I...",Netherlands,Europe
...,...,...,...,...,...,...,...
439429,477318,SAGE,NHRD Network Journal,Ashok Ramachandran,"National Secretary, NHRDN, Gurgaon, India and ...",India,Asia
439562,477459,SAGE,Nordic Studies on Alcohol and Drugs,Lars Fynbo,VIVE <U+0096> The Danish Centre of Applied Soc...,Denmark,Europe
439566,477463,SAGE,Nordic Studies on Alcohol and Drugs,Ditte Andersen,VIVE <U+0096> The Danish Centre of Applied Soc...,Denmark,Europe
439669,477567,SAGE,Nursing Science Quarterly,"Karen Carroll, RN, PhD",Ann & Robert H. Lurie Children<U+0092>s Hospit...,United States,Americas


#### Step 1: Find the problematic characters

In [5]:
# pull one "<...>" token out of each affiliation so the variants can be counted
# (may run slow; the leading greedy .* means only the LAST token of a row is captured)
unicode = data["affiliation"].str.extract(pat=r'.*(<.*>).*', expand=True)

In [41]:
# number of affiliations in which a "<...>" token was found
unicode.count()

0    5292
dtype: int64

In [47]:
# the 15 most frequent tokens (counts copied from the value_counts output)
# cover ~95% of the 5292 <> vals within the affiliation column
sum([1565, 1397, 603, 495, 301, 150, 115, 108, 61, 57, 46, 42, 41, 38, 26]) / 5292

0.9533257747543462

In [6]:
# frequency table of the extracted tokens, most common first, top 15 only
unicode.dropna().value_counts().iloc[:15]

<U+0092>    1565
<U+0096>    1397
<U+00A0>     603
<U+0094>     495
<U+206F>     301
<U+0097>     150
<U+0219>     115
<U+009A>     108
</em>         61
<U+009E>      57
<U+200B>      46
<U+021B>      42
<U+00B4>      41
<br>          38
<U+00B7>      26
dtype: int64

#### Step 2: Making a function and a dictionary of values

In [56]:
# translating each "<...>" token to a plain-ASCII replacement
# this takes care of ~95% of the <.*> messiness within affiliation
#
# NOTE(review): the <U+0080>..<U+009F> codes are read here as Windows-1252 bytes
# that were displayed as C1 control characters -- that is the only reading under
# which 0092 -> ' and 0096 -> - make sense. Confirm against the raw source pages.
unicode_dict = {
    "<U+0092>": "'", # cp1252 0x92 = right single quote -> '
    "<U+0096>": "-", # cp1252 0x96 = en dash -> -
    "<U+00A0>": " ", # non-breaking space -> " "
    "<U+0094>": '"', # cp1252 0x94 = right double quote -> "
    "<U+206F>": " ", # "nominal digit shapes" (invisible format char) -> " "
    "<U+0097>": "-", # cp1252 0x97 = em dash -> same as <U+0096> -> -
    "<U+0219>": unidecode("\u0219"), # latin letter s w/comma underneath it -> s
    "<U+009A>": unidecode("\u0161"), # cp1252 0x9A = s with caron -> s (dropping it would delete a letter)
    "</em>": "", # html closing emphasis tag (not an em dash) -> just remove it
    "<U+009E>": unidecode("\u017E"), # cp1252 0x9E = z with caron -> z (dropping it would delete a letter)
    "<U+200B>": "" , # zero width space -> removed
    "<U+021B>": unidecode("\u021B"), # latin letter t w/comma below -> t
    "<U+00B4>": unidecode("\u00B4"), # acute accent, assumed to be standing in for a '
    "<br>": "", # html line break -> just going to remove it
    "<U+00B7>": unidecode("\u00B7") # middle dot -> whatever ascii stand-in unidecode picks
}

In [77]:
# defining a cleaning function to apply to each value of a column

# one angle-bracketed token at a time, e.g. "<U+0092>", "<br>", "</em>";
# [^<>]* keeps a match from running across two separate tokens
_CARROT_TOKEN = re.compile(r"<[^<>]*>")


def carrot_removal(x, mapping=None):
    """Replace every "<...>" token in ``x`` with its plain-text equivalent.

    Each token found in ``mapping`` is swapped for the mapped value; any other
    token is removed. All tokens in the string are handled, not just one, and
    the text surrounding them is left untouched.

    Parameters
    ----------
    x : str or any
        Value to clean. Non-string values (e.g. NaN for a missing
        affiliation) are returned unchanged.
    mapping : dict, optional
        Token -> replacement lookup. Defaults to the notebook-level
        ``unicode_dict``.

    Returns
    -------
    str or any
        The cleaned string, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x # nothing to clean (missing value)

    if mapping is None:
        mapping = unicode_dict

    # look up the token itself (not the whole string) and fall back to ""
    return _CARROT_TOKEN.sub(lambda match: mapping.get(match.group(0), ""), x)



#### Step 3: Applying the function to clean the column

In [81]:
%%time
# cleaning up and removing the carrots
data["affiliation"] = data["affiliation"].apply(carrot_removal)

Wall time: 28.9 s
