In [48]:
import pandas as pd
import re
import psycopg2
from tqdm import tqdm

In [49]:
# Connection settings for the podcast database.
# NOTE: `host` is defined here but is not passed to psycopg2.connect() below.
dbname, user, host = 'podcast', 'lindsay', 'localhost'

# Open the connection and a cursor that the following cells reuse.
con = None
con = psycopg2.connect(user=user, database=dbname)
cursor = con.cursor()

In [50]:
# Read the name and id of every row in the podcast table.
# fetchall() hands back one (name, id) sequence per row.
query = 'SELECT name, id FROM podcast;'
cursor.execute(query)
query_results = cursor.fetchall()

In [51]:
# Split the fetched (name, id) rows into two parallel columns.
names = [record[0] for record in query_results]
ids = [record[1] for record in query_results]
results_df = pd.DataFrame({'name': names, 'id': ids})

In [52]:
# Sanity check: (row count, column count) of the frame built from the query.
results_df.shape

(6136, 2)

In [53]:
# remove annoying characters
#
# Replacement table: Unicode code point -> plain-ASCII stand-in.
# Keys are the decoded form of the UTF-8 byte pairs noted beside each entry,
# so the table works on text (unicode) rather than on raw bytes.
# Most of the U+0080..U+009F entries are presumably Windows-1252 punctuation
# that ended up in the C1 control range -- TODO confirm against the raw data.
chars = {
    u'\x82': ',',        # UTF-8 c2 82 - High code comma
    u'\x84': ',,',       # UTF-8 c2 84 - High code double comma
    u'\x85': '...',      # UTF-8 c2 85 - Triple dot
    u'\x88': '^',        # UTF-8 c2 88 - High carat
    u'\x91': '\x27',     # UTF-8 c2 91 - Forward single quote
    u'\x92': '\x27',     # UTF-8 c2 92 - Reverse single quote
    u'\x93': '\x22',     # UTF-8 c2 93 - Forward double quote
    u'\x94': '\x22',     # UTF-8 c2 94 - Reverse double quote
    u'\x95': ' ',        # UTF-8 c2 95 - (bullet position in Windows-1252)
    u'\x96': '-',        # UTF-8 c2 96 - High hyphen
    u'\x97': '--',       # UTF-8 c2 97 - Double hyphen
    u'\x99': ' ',        # UTF-8 c2 99 - (trademark position in Windows-1252)
    u'\xa0': ' ',        # UTF-8 c2 a0 - No-break space
    u'\xa6': '|',        # UTF-8 c2 a6 - Split vertical bar
    u'\xab': '<<',       # UTF-8 c2 ab - Double less than
    u'\xbb': '>>',       # UTF-8 c2 bb - Double greater than
    u'\xbc': '1/4',      # UTF-8 c2 bc - one quarter
    u'\xbd': '1/2',      # UTF-8 c2 bd - one half
    u'\xbe': '3/4',      # UTF-8 c2 be - three quarters
    u'\u02bf': '\x27',   # UTF-8 ca bf - c-single quote
    u'\u0328': '',       # UTF-8 cc a8 - modifier - under curve
    u'\u0331': ''        # UTF-8 cc b1 - modifier - under line
}

# Compiled once because the table is constant; keys are escaped so a future
# entry containing a regex metacharacter cannot corrupt the pattern.
# If `chars` is edited after this cell runs, re-run the cell to rebuild it.
_chars_pattern = re.compile(
    u'(' + u'|'.join(re.escape(key) for key in chars) + u')'
)


def replace_chars(match):
    """Return the ASCII replacement for the character captured by `match`.

    Intended as the callback for `re.sub`; `match.group(0)` must be a key
    of `chars`, otherwise a KeyError is raised.
    """
    return chars[match.group(0)]


def clean_name(text):
    """Return `text` reduced to plain ASCII.

    Characters listed in `chars` are swapped for their ASCII stand-ins; any
    other non-ASCII character is dropped.

    Parameters
    ----------
    text : bytes or unicode text
        UTF-8 encoded bytes, or already-decoded text.

    Returns
    -------
    str
        The native string type of the running interpreter (a byte string on
        Python 2, exactly as before; a text string on Python 3).

    Raises
    ------
    UnicodeDecodeError
        If `text` is a byte string that is not valid UTF-8.
    """
    # Work on code points so the same table serves Python 2 and Python 3;
    # Python 3 `str` has no .decode(), which made the old version crash there.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    text = _chars_pattern.sub(replace_chars, text)
    ascii_bytes = text.encode('ascii', 'ignore')
    # `str is bytes` is only true on Python 2, where bytes is the native str.
    return ascii_bytes if str is bytes else ascii_bytes.decode('ascii')

In [54]:
# Add an ASCII-only version of every podcast name as a new column.
results_df['clean_name'] = results_df['name'].map(clean_name)

In [55]:
# update table
# Write each cleaned name back to its podcast row, one UPDATE per row.
query = "UPDATE podcast SET clean_name = %s WHERE id = %s;"

# .tolist() yields native Python scalars, which psycopg2 can adapt directly;
# it also avoids iterrows(), which builds a whole Series for every row.
update_params = zip(results_df['clean_name'].tolist(),
                    results_df['id'].tolist())

try:
    for cleaned, podcast_id in tqdm(update_params, total=results_df.shape[0]):
        # Pass plain scalars: one-element tuples would be adapted by psycopg2
        # as SQL lists and only worked because of the extra parentheses.
        cursor.execute(query, (cleaned, podcast_id))
except Exception:
    # A failed statement leaves the transaction aborted; roll back so the
    # connection is usable again, then let the error surface.
    con.rollback()
    raise
else:
    # All rows succeeded: make the whole batch permanent in one commit.
    con.commit()



In [47]:
# Re-check the shape: the frame should now have the extra clean_name column.
results_df.shape

(6136, 3)