In [163]:
#import libraries
import os
import re
import numpy as np
import pandas as pd
import psycopg2
from unidecode import unidecode

In [164]:
!dir

 Volume in drive C is Windows
 Volume Serial Number is E807-C1C7

 Directory of C:\Users\Administrator\Documents\jupyter_projects\csv_text_cleaner

25/07/2023  02:07 pm    <DIR>          .
25/07/2023  05:35 pm    <DIR>          ..
25/07/2023  12:19 pm            22,396 csv_text_cleaner.ipynb
25/07/2023  05:33 pm    <DIR>          datasets
25/07/2023  05:36 pm    <DIR>          raw_csv
24/07/2023  02:18 pm               104 README.md
               2 File(s)         22,500 bytes
               4 Dir(s)  162,756,947,968 bytes free


In [165]:
#make sure directory is in csv_text_cleaner folder
try:
    dir = r"C:\Users\Administrator\Documents\jupyter_projects\csv_text_cleaner"
    print(dir)
    os.chdir(dir)
    os.system("cd "+dir)
except OSError as e:
    pass  

C:\Users\Administrator\Documents\jupyter_projects\csv_text_cleaner


In [166]:
#automatically detect csv's in raw_csv directory
#isolate the csv files
csv_dir = f"{dir}\\raw_csv\\"
csv_files = []
for file in os.listdir(csv_dir):
    if file.endswith('.csv'):
        csv_files.append(file)

In [167]:
#make a new directory

dataset_dir = r'C:\Users\Administrator\Documents\jupyter_projects\csv_text_cleaner\datasets'

#create the bash command to make a new directory
# mkdir dataset_dir
try:
    mkdir = 'mkdir {0}'.format(dataset_dir)
    os.system(mkdir)
except: 
    pass

In [170]:
#copy the csv files to the new directory
#copy <path+filename> <directory>
for csv in csv_files:
    copyfile = f"copy {csv_dir}\"{csv}\" \"{dataset_dir}\\\""
    os.system(copyfile)
    print(f"{csv} copied to {dataset_dir}")

world-data-2023.csv copied to C:\Users\Administrator\Documents\jupyter_projects\csv_text_cleaner\datasets
worldcities.csv copied to C:\Users\Administrator\Documents\jupyter_projects\csv_text_cleaner\datasets
world_data_2023.csv copied to C:\Users\Administrator\Documents\jupyter_projects\csv_text_cleaner\datasets


In [171]:
df = {}
for file in csv_files:
    try:
        df[file] = pd.read_csv(f"{dataset_dir}\{file}")
    except UnicodeDecodeError:
         df[file] = pd.read_csv(f"{dataset_dir}\{file}", encoding="ISO-8859-1")
    print(file)

world-data-2023.csv
worldcities.csv
world_data_2023.csv


In [172]:
def text_cleaner(cleaned_string, units_to_remove=None):
    if units_to_remove is None:
        units_to_remove = []

    replace_dict = {
        '-': '_',
        ' ':'_',
        ':':'',
        '&': '',
        '$':'',
        '%':'',
        '!':'',
        '/':'',
        '(': '',
        ')': '',
        '\\': '',
        '\n': '',
    }

    #remove units from the cleaned name
    for unit in units_to_remove:
        cleaned_string = re.sub(re.escape(unit), '', cleaned_string)

    #remove trailing symbols from the cleaned string
    cleaned_string = re.sub(r'^[_-]+|[_-]+$','',cleaned_string)

    #remove non-latin characters and transliterate with nearest equivalent
    cleaned_string = unidecode(cleaned_string)
    
    cleaned_string = cleaned_string.rstrip().lower().translate(str.maketrans(replace_dict))

    return cleaned_string


#replace pandas dataframe datatypes with equivalent SQL datatypes
data_type_replacements = {
    'object':'varchar',
    'float64':'float',
    'int64':'int',
    'datetime64':'timestamp',
    'timedelta64[ns]':'varchar'
}

#PostgreSQL credentials

host = "localhost"
database = "postgres"
user = "postgres"
password = "password"

for k in csv_files:
    dataframe = df[k]
    
    #insert csv file pathway into str
    csv_path = str(k)
    
    #split file pathway to split file name into tuple, containing name( pos [0]) and file extension ( pos [1])
    pathway = os.path.basename(k)
    file_name = os.path.splitext(pathway)


    #apply text_cleaner to title and headers
    
    #clean title
    cleaned_title = text_cleaner(file_name[0])

    #clean headers
    dataframe.columns = [text_cleaner(text, units_to_remove=['pkm2','km2','co2']) for text in dataframe.columns]

    #zip list of headers and corresponding datatypes so they are in format: '..., {header} {datatype}, ...'
    col_str = ", ".join("{} {}".format(n, d) for (n, d) in zip(dataframe.columns, dataframe.dtypes.replace(data_type_replacements)))

    #establish a connection to the PostgreSQL server

    try: 
        
        connection = psycopg2.connect(
        host=host,
        database=database,
        user=user,
        password=password
        )
    
        print("Connected to PostgreSQL!")
    except Exception as e:
        print("Error connecting to PostgreSQL", e)
        exit()
    
    #set client encoding
    connection.set_client_encoding('UTF8')
    
    
    #create a cursor to execute SQL queries
    
    cursor = connection.cursor()
    
    #drop table with same name
    
    try:
        cursor.execute(f"drop table if exists {cleaned_title} CASCADE;")
        print(f"{cleaned_title} table dropped if exists.")
    except Exception as e:
        print("Error dropping table", e)
        exit()
        
    #create the table in PostgreSQL
    
    try:
        create_table_query = f"CREATE TABLE {cleaned_title} ({col_str});"
        cursor.execute(create_table_query)
        print(f"{cleaned_title} table created!")
    except Exception as e:
        print("Error creating table", e)
        exit()
    
    #insert the data into the table

    #save df to csv

    dataframe.to_csv(k, header=dataframe.columns, index=False, encoding='utf-8')
    
    #open file in memory
    my_file = open(k, mode ='r', encoding='utf-8')
    print("file opened in memory")
    
    #upload to db
    SQL_STATEMENT = f"""
    COPY {cleaned_title} FROM STDIN WITH
        CSV
        HEADER
        DELIMITER AS ','
    """
    
    cursor.copy_expert(sql=SQL_STATEMENT, file = my_file)
    print("file copied to db")

    #change permissions to public
    cursor.execute(f"GRANT SELECT ON TABLE {cleaned_title} TO public")
    connection.commit()

    cursor.close()
    print(f"table {cleaned_title} import to database completed")

print("all tables have been successfully imported")

Connected to PostgreSQL!
world_data_2023 table dropped if exists.
world_data_2023 table created!
file opened in memory
file copied to db
table world_data_2023 import to database completed
Connected to PostgreSQL!
worldcities table dropped if exists.
worldcities table created!
file opened in memory
file copied to db
table worldcities import to database completed
Connected to PostgreSQL!
world_data_2023 table dropped if exists.
world_data_2023 table created!
file opened in memory
file copied to db
table world_data_2023 import to database completed
all tables have been successfully imported
