# Fixing name formatting for Wikipedia search
This is a band-aid fix for the big data management problem I have to tackle next.

In [103]:
#IMPORTS
import pandas as pd

In [104]:
# Load the raw author table and preview its first rows
df = pd.read_csv(filepath_or_buffer='../data/raw/autores.csv')
print(df.head(n=5))

                autor  id_autor
0     Masad,   Ilana.         2
1    Litman,   Harry.         3
2     Hwang,   Suein.         4
3   Bouie,   Jamelle.         5
4  Proulx,   Natalie.         6


In [105]:
# Break each `autor` string apart on the ", " separator, one piece per column.
names = df['autor'].str.split(', ', expand=True)

# Column 1 may carry extra text after a ';' -- cut at the first ';' only.
# first_and_rest[0] = text before ';', first_and_rest[1] = text after ';' (or missing)
first_and_rest = names[1].str.split(';', n=1, expand=True)

# Column 1 keeps just the text before the ';', trimmed of surrounding whitespace.
names[1] = first_and_rest[0].str.strip()

# Column 2 becomes "<text after ';'>; <previous column 2>". Missing pieces count
# as empty strings, and any leftover ';' or spaces at either end are stripped.
names[2] = (
    first_and_rest[1].fillna('').str.strip() + '; ' + names[2].fillna('')
).str.strip('; ')

# The helper frame is not needed beyond this cell.
del first_and_rest


names.head()


Unnamed: 0,0,1,2
0,Masad,Ilana.,
1,Litman,Harry.,
2,Hwang,Suein.,
3,Bouie,Jamelle.,
4,Proulx,Natalie.,


In [106]:
def normalizeNames(row):
    """Re-order one row of split name parts into a single display name.

    Parameters
    ----------
    row : indexable by 0, 1 and 2 (e.g. the Series that
        ``DataFrame.apply(..., axis=1)`` passes for each row)
        ``row[0]`` is the text that ends up last in the result, ``row[1]`` the
        text that ends up first, and ``row[2]`` optional trailing text.
        ``row[1]`` and ``row[2]`` may be missing (``None`` or ``NaN``).

    Returns
    -------
    ``"<row[1]> <row[0]>"`` with surrounding '.' / ';' removed, followed by
    ``" <row[2]>"`` when that part is present and non-empty. When ``row[1]``
    is missing, ``row[0]`` is returned unchanged.
    """
    # pd.isna() catches both None and NaN. A plain `!= None` test lets NaN
    # through, and str(NaN) would inject the literal text "nan" into the name.
    if pd.isna(row[1]):
        return row[0]

    full_name = str(row[1]).strip('.;') + " " + str(row[0]).strip(';. ')

    # Only append the trailing part when something is left after cleaning,
    # so an empty/missing extra does not leave a dangling space.
    extra = '' if pd.isna(row[2]) else str(row[2]).strip(';.')
    if extra:
        return full_name + " " + extra
    return full_name
# Collapse the part columns into one display name per row, stored in column 0,
# then discard the two part columns that have been merged into it.
names[0] = names.apply(normalizeNames, axis='columns')
names = names.drop(columns=[1, 2])
names.head()

Unnamed: 0,0
0,Ilana Masad
1,Harry Litman
2,Suein Hwang
3,Jamelle Bouie
4,Natalie Proulx


In [107]:
# Re-attach the author id from the source frame (aligned on the row index)
names['author_id'] = df['id_autor']
# Give the name column (currently labelled 0) a descriptive label
names = names.rename(columns={0: 'author_name'})

# Clean up whitespace in one pass over the column:
#   1. trim leading/trailing whitespace
#   2. collapse every run of whitespace into a single space
#   3. turn any remaining tab into a space
names['author_name'] = (
    names['author_name']
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace('\t', ' ')
)

names.head()

Unnamed: 0,author_name,author_id
0,Ilana Masad,2
1,Harry Litman,3
2,Suein Hwang,4
3,Jamelle Bouie,5
4,Natalie Proulx,6


In [108]:
# Keep only the first row seen for each author_id
names = names[~names.duplicated(subset=['author_id'], keep='first')]
names.head()

Unnamed: 0,author_name,author_id
0,Ilana Masad,2
1,Harry Litman,3
2,Suein Hwang,4
3,Jamelle Bouie,5
4,Natalie Proulx,6


In [109]:
# Write the normalized author table to disk, without the row index
names.to_csv(path_or_buf='../data/raw/authors_normalized.csv', index=False)