# Automated .csv file import into a PostgreSQL database

## Introduction :

### In PostgreSQL, it is necessary to specify the data types of columns before importing data. This can be more cumbersome, especially when dealing with tables with a large number of columns. The requirement to define the data types in advance can make the import process more time-consuming and error-prone, as it involves accurately specifying the data types for each column.

### However, it's important to note that PostgreSQL's approach of explicitly defining data types offers the advantage of data integrity enforcement and stricter control over the data being imported. It ensures that the imported data aligns with the defined table schema, reducing the risk of data inconsistencies or integrity violations.

In [1]:
import os
import numpy as np
import pandas as pd
import psycopg2

In [2]:
h = os.getcwd()

In [3]:
os.listdir(h)

['.ipynb_checkpoints',
 '.jupyter',
 '.vscode',
 'build',
 'CarPrice_Assignment.csv',
 'confini.ipynb',
 'CONVERT',
 'convert.py',
 'CSVfile_Folder',
 'Customer Contracts$.csv',
 'Customer Demo.csv',
 'Customer Engagements.csv',
 'datasets',
 'dist',
 'DLLs',
 'Doc',
 'etc',
 'FileImport.ipynb',
 'FINALKPEHLE.ipynb',
 'House_Rent_Dataset.csv',
 'h_config.ini',
 'include',
 'Lib',
 'libs',
 'LICENSE.txt',
 'LinearRegression-Copy1.ipynb',
 'LinearRegression-Copy2.ipynb',
 'LinearRegression.ipynb',
 'L_Python.ipynb',
 'main.py',
 'mlregression-house-rent-predictions.ipynb',
 'MOSTIMP.py',
 'NEWS.txt',
 'output33.csv',
 'popularityrent.ipynb',
 'PostgreSQL copy.ipynb',
 'PostgreSQL.ipynb',
 'Project1.py',
 'Project1.py.spec',
 'Project1.spec',
 'PROJECT1README.md',
 'Project2.ipynb',
 'Project2read.ipynb',
 'python.exe',
 'python3.dll',
 'python311.dll',
 'pythonw.exe',
 'Scripts',
 'share',
 'statsfinal.csv',
 'tcl',
 'Tools',
 'transformed_house_rent_data.csv',
 'vcruntime140.dll',
 'vcr

In [4]:
#find CSV files in my current working directory 
#isolate only the CSV files

csv_files = []
for file in os.listdir(os.getcwd()):
    if file.endswith('.csv'):
        csv_files.append(file)

In [5]:
#make a new directory
dataset_dir = 'CSVfile_Folder'

#create the bash command to make a new directory
# mkdir dataset_dir
try:
    mkdir = 'mkdir {0}'.format(dataset_dir)
    os.system(mkdir)
except:
    pass

In [6]:
#move the CSV files in the new directory
import shutil
#mv filename directory
for csv in csv_files:
    src_file = os.path.join(h, csv)  # Full path of the source file
    dst_file = os.path.join(dataset_dir, csv)  # Full path of the destination file
    shutil.move(src_file, dst_file)
    print(f"Moved {csv} to {dst_file}")

Moved CarPrice_Assignment.csv to CSVfile_Folder\CarPrice_Assignment.csv
Moved Customer Contracts$.csv to CSVfile_Folder\Customer Contracts$.csv
Moved Customer Demo.csv to CSVfile_Folder\Customer Demo.csv
Moved Customer Engagements.csv to CSVfile_Folder\Customer Engagements.csv
Moved House_Rent_Dataset.csv to CSVfile_Folder\House_Rent_Dataset.csv
Moved output33.csv to CSVfile_Folder\output33.csv
Moved statsfinal.csv to CSVfile_Folder\statsfinal.csv
Moved transformed_house_rent_data.csv to CSVfile_Folder\transformed_house_rent_data.csv


In [7]:
data_path = h+'/'+dataset_dir+'/'

df = {}  # Use curly braces to initialize an empty dictionary instead of a list
for file in csv_files:
    try:
        df[file] = pd.read_csv(data_path+file)
    except UnicodeDecodeError:
        df[file] = pd.read_csv(data_path+file, encoding="UTF-8")
    print(file)

CarPrice_Assignment.csv
Customer Contracts$.csv
Customer Demo.csv
Customer Engagements.csv
House_Rent_Dataset.csv
output33.csv
statsfinal.csv
transformed_house_rent_data.csv


In [8]:
for k in csv_files:
    dataframe = df[k]
    clean_tbl_name =  k.lower().replace(" ","_").replace("?","") \
                        .replace("-","_").replace(r"/","_").replace("\\","_").replace("%","") \
                        .replace(")","").replace(r"(","").replace("$","")

    # remove.csv extension from clean_tbl_name
    tbl_name = '{0}'.format(clean_tbl_name.split('.')[0])
    print(tbl_name)

    #clean table columns
    dataframe.columns = [x.lower().replace(" ","_").replace("?","") \
                        .replace("-","_").replace(r"/","_").replace("\\","_").replace("%","") \
                        .replace(")","").replace(r"(","").replace("$","") for  x in dataframe.columns]
    

    #replacement dictionary that maps pandas dtypes to sql dtypes
    replacements = {
        'object' : 'varchar', 'float64' : 'float',
        'int64' : 'int',
        'datetime64' : 'timestamp',
        'timedelta64 [ns]' : 'varchar'
    }
    #table schema
    col_str = ", ".join("{} {}".format (n, d) for (n, d) in zip(dataframe.columns, dataframe.dtypes.replace(replacements))) 
    print(col_str)

   
    #To connect with database
    
    # Read configuration file
    import configparser
    config = configparser.ConfigParser()
    config.read('h_config.ini')
    
    # Retrieve database credentials
    username = config.get('postgresql', 'username')
    password = config.get('postgresql', 'password')
    host = config.get('postgresql', 'host')
    dbname = config.get('postgresql', 'dbname')
  
    conn = psycopg2.connect(
        user=username,
        password=password,
        host=host,
        database=dbname
    )
    
    # Your code to work with the database goes here
    
    
    # conn_string = "host=%s dbname=%s user=%s password=%s" % (host, dbname, username, password)
    # conn = psycopg2.connect(conn_string)
    cursor = conn.cursor()
    print('opened database successfully')

    #drop table with same name 
    cursor.execute("drop table if exists %s;" % (tbl_name))
    
    #create table
    query = 'CREATE TABLE "%s" (%s);'
    cursor.execute(query % (tbl_name, col_str))

    #cursor.execute("create table %s (%s);" % (tbl_name, col_str))
    print('{0} was created successfully'.format(tbl_name))
    
    #insert values to table
    
    #save df to csv
    dataframe.to_csv (k, header=dataframe.columns, index=False, encoding='UTF-8')
    
    #open the csv file, save it as an object
    my_file = open(k)
    print('file opened in memory')

    #upload to db
    
    SQL_STATEMENT = """
    COPY %s FROM STDIN WITH
        CSV
        HEADER
        DELIMITER AS ','
    """
    
    cursor.copy_expert (sql=SQL_STATEMENT % tbl_name, file=my_file)
    print('file copied to db')

    cursor.execute("grant select on table %s to public" % tbl_name)
    conn.commit()
    
    cursor.close()
    print('table {0} imported to db completed'.format(tbl_name))

#for loop end message
print('all tables have been successfully imported into the db')

carprice_assignment
car_id int, symboling int, carname varchar, fueltype varchar, aspiration varchar, doornumber varchar, carbody varchar, drivewheel varchar, enginelocation varchar, wheelbase float, carlength float, carwidth float, carheight float, curbweight int, enginetype varchar, cylindernumber varchar, enginesize int, fuelsystem varchar, boreratio float, stroke float, compressionratio float, horsepower int, peakrpm int, citympg int, highwaympg int, price float
opened database successfully
carprice_assignment was created successfully
file opened in memory
file copied to db
table carprice_assignment imported to db completed
customer_contracts
customer_name varchar, start_date varchar, end_date varchar, contract_amount_m float, invoice_sent varchar, paid varchar
opened database successfully
customer_contracts was created successfully
file opened in memory
file copied to db
table customer_contracts imported to db completed
customer_demo
customer_id int, customer_name varchar, employe

In [9]:
# In PostgreSQL, column names cannot contain certain special characters, such as colons (":"). The error is caused by the colon in the column name "unnamed:_0".
#To fix this issue, you should choose a valid column name that follows the naming rules:

#1.Column names must start with a letter or an underscore (_).
#2.Subsequent characters can include letters, digits, and underscores.
#3.Avoid using special characters or spaces in column names.