In [1]:
from sqlalchemy.engine import create_engine
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus
import pandas as pd

In [2]:
# Create the sqlalchemy engine and connection.
# NOTE(review): credentials are hardcoded for this local demo database; prefer
# environment variables or getpass for anything that is shared or committed.
username = "root"
password = "root"
db_name = "hr"
# Always URL-encode the credentials so special characters (e.g. "@" or "!")
# cannot break the connection URL; plain values such as "root" are unchanged.
connection = f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}@localhost/{db_name}"
engine = create_engine(connection)
conn = engine.connect()

In [3]:
# List every table that exists in the connected database
q = "SHOW TABLES;"
pd.read_sql(sql=q, con=conn)

Unnamed: 0,Tables_in_hr
0,department
1,employee
2,job
3,job_history


In [5]:
# Load the raw employee records, then show the schema summary and the first rows
employees = pd.read_csv(filepath_or_buffer='Raw Data/Employee-data.csv')
employees.info()
employees.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   employee_id  8 non-null      object
 1   f_name       8 non-null      object
 2   l_name       8 non-null      object
 3   ssn          8 non-null      int64 
 4   birth_date   8 non-null      object
 5   sex          8 non-null      object
 6   address      8 non-null      object
 7   job_id       8 non-null      int64 
 8   salary       8 non-null      int64 
 9   manager_id   8 non-null      int64 
 10  dep_id       8 non-null      int64 
dtypes: int64(5), object(6)
memory usage: 832.0+ bytes


Unnamed: 0,employee_id,f_name,l_name,ssn,birth_date,sex,address,job_id,salary,manager_id,dep_id
0,E1001,John,Thomas,123456,01/09/1976,M,"5631 Rice, OakPark,IL",100,100000,30001,2
1,E1002,Alice,James,123457,07/31/1972,F,"980 Berry ln, Elgin,IL",200,80000,30002,5
2,E1003,Steve,Wells,123458,08/10/1980,M,"291 Springs, Gary,IL",300,50000,30002,5
3,E1004,Santosh,Kumar,123459,07/20/1985,M,"511 Aurora Av, Aurora,IL",400,60000,30004,5
4,E1005,Ahmed,Hussain,123410,01/04/1981,M,"216 Oak Tree, Geneva,IL",500,70000,30001,2


Preparing the Data with Pandas
- To upload data to the pre-existing EMPLOYEE table, we need to make sure that the column names and data types of our DataFrame are compatible with the table's names and data types.
- We can run the "DESCRIBE <table_name>" command (e.g., "DESCRIBE employee;") to learn more about a specific table.

In [6]:
# Fetch the column definitions of the employee table and display them
q = "DESCRIBE employee;"
describe = pd.read_sql(sql=q, con=conn)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,EMP_ID,varchar(9),NO,PRI,,
1,F_NAME,varchar(45),YES,,,
2,L_NAME,varchar(45),YES,,,
3,SSN,char(9),YES,,,
4,B_DATE,date,YES,,,
5,SEX,char(1),YES,,,
6,ADDRESS,varchar(45),YES,,,
7,JOB_ID,char(9),NO,MUL,,
8,SALARY,"decimal(10,2)",YES,,,
9,MANAGER_ID,char(9),YES,,,


The result of DESCRIBE provides a lot of information.

For each column, it tells us:

- the field/column name
- the datatype, including size
- Whether Null values are permitted
- If the column serves as a key. It could be:
    - PRI: Primary Key
    - UNI: A unique key, but not set as the primary key.
    - MUL: Foreign key with multiple matches
- the Default Value:
    - Note that created_at and updated_at both have "CURRENT_TIMESTAMP" as the Default.
- Extra commands/notes.
    - Note that the updated_at field indicates that it will be updated with the current timestamp when modified.

This information will ensure our DataFrame has the correct field names and datatypes before inserting the data into our pre-existing database.

In [7]:
# Field names reported by DESCRIBE, as a NumPy array
describe.loc[:, 'Field'].values

array(['EMP_ID', 'F_NAME', 'L_NAME', 'SSN', 'B_DATE', 'SEX', 'ADDRESS',
       'JOB_ID', 'SALARY', 'MANAGER_ID', 'DEP_ID', 'created_at',
       'updated_at'], dtype=object)

In [8]:
# Checking the DataFrame's current column labels so they can be compared
# against the SQL table's field names shown above
employees.columns

Index(['employee_id', 'f_name', 'l_name', 'ssn', 'birth_date', 'sex',
       'address', 'job_id', 'salary', 'manager_id', 'dep_id'],
      dtype='object')

It looks like all of the DataFrame's column names are lowercase, but the database has uppercase.
- We will want to convert our DataFrame columns to uppercase.

We can also see that the "EMP_ID" column in the database is called "employee_id" in the DataFrame, and the "B_DATE" column in the database is called "birth_date" in the DataFrame.
- We will want to rename our columns to match the database.

In [9]:
# Convert the column names to uppercase using .str.upper().
# Preview only: the result is not assigned here, so `employees` is left unchanged.
employees.columns.str.upper()

Index(['EMPLOYEE_ID', 'F_NAME', 'L_NAME', 'SSN', 'BIRTH_DATE', 'SEX',
       'ADDRESS', 'JOB_ID', 'SALARY', 'MANAGER_ID', 'DEP_ID'],
      dtype='object')

In [10]:
# Overwrite the column labels with their uppercase versions, then preview two rows
uppercase_columns = employees.columns.str.upper()
employees.columns = uppercase_columns
employees.head(n=2)

Unnamed: 0,EMPLOYEE_ID,F_NAME,L_NAME,SSN,BIRTH_DATE,SEX,ADDRESS,JOB_ID,SALARY,MANAGER_ID,DEP_ID
0,E1001,John,Thomas,123456,01/09/1976,M,"5631 Rice, OakPark,IL",100,100000,30001,2
1,E1002,Alice,James,123457,07/31/1972,F,"980 Berry ln, Elgin,IL",200,80000,30002,5


In [11]:
# Align the two mismatched column names with the SQL table's field names
rename_map = {
    "EMPLOYEE_ID": "EMP_ID",
    "BIRTH_DATE": "B_DATE",
}
employees = employees.rename(columns=rename_map)
employees.head(n=2)

Unnamed: 0,EMP_ID,F_NAME,L_NAME,SSN,B_DATE,SEX,ADDRESS,JOB_ID,SALARY,MANAGER_ID,DEP_ID
0,E1001,John,Thomas,123456,01/09/1976,M,"5631 Rice, OakPark,IL",100,100000,30001,2
1,E1002,Alice,James,123457,07/31/1972,F,"980 Berry ln, Elgin,IL",200,80000,30002,5


Checking the Field/Column Data Types

Next, we will compare the datatypes in the database vs. our DataFrame. Note that the datatypes will not be an exact match but a general match. For example, SQL's VARCHAR is equivalent to "object" or "string."

SQL's Decimal = pandas' float
SQL's date = pandas' datetime
etc.

In [12]:
# Show only the field name and SQL data type of each column in the table
describe.loc[:, ['Field', 'Type']]

Unnamed: 0,Field,Type
0,EMP_ID,varchar(9)
1,F_NAME,varchar(45)
2,L_NAME,varchar(45)
3,SSN,char(9)
4,B_DATE,date
5,SEX,char(1)
6,ADDRESS,varchar(45)
7,JOB_ID,char(9)
8,SALARY,"decimal(10,2)"
9,MANAGER_ID,char(9)


In [13]:
# Reviewing the DataFrame's data types, one entry per column, for comparison
# with the SQL table's types listed above
employees.dtypes

EMP_ID        object
F_NAME        object
L_NAME        object
SSN            int64
B_DATE        object
SEX           object
ADDRESS       object
JOB_ID         int64
SALARY         int64
MANAGER_ID     int64
DEP_ID         int64
dtype: object

We can see that the B_DATE is an object but needs to be datetime.

In [14]:
# Converting B_DATE to datetime dtype.
# The CSV stores dates as MM/DD/YYYY (e.g. 07/31/1972). Passing the format
# explicitly avoids day/month guessing on ambiguous values such as 01/09/1976
# and makes an unexpected format raise instead of being silently mis-parsed.
employees['B_DATE'] = pd.to_datetime(employees['B_DATE'], format='%m/%d/%Y')
employees.dtypes

EMP_ID                object
F_NAME                object
L_NAME                object
SSN                    int64
B_DATE        datetime64[ns]
SEX                   object
ADDRESS               object
JOB_ID                 int64
SALARY                 int64
MANAGER_ID             int64
DEP_ID                 int64
dtype: object

Now that we have modified our DataFrame to match the table in our SQL database, we are ready to insert the data into the SQL table.



df.to_sql()

Just as Pandas has both a pd.read_csv() function and a df.to_csv() method, it also has a pd.read_sql() function (which you have been using) AND a df.to_sql() method.

We will use the .to_sql method to add our data, but we must make sure to select the correct arguments for the task.

- name: it needs the name of the table to insert the data into.
- con: it needs the connection to the database from sqlalchemy.
- schema: It does not require that we specify a schema for MySQL.
- if_exists: By default, if our table already exists, it will fail, and we will get an error.
    - This is not what we want! We want to add our data TO the table if it already exists.
    - The options for if_exists are:
        - fail: Raise a ValueError.
        - replace: Drop the table before inserting new values.
        - append: Insert new values to the existing table.
    - Since we want to add our data to the database, we will use if_exists='append'.
- index: it includes the DataFrame index as a column for the database.
    - We will generally want to have a plain pandas integer index for our DataFrame and use index=False.

# Inserting Data into the EMPLOYEE Table

In [17]:
# Append the DataFrame rows to the existing employee table without writing the index
employees.to_sql(name="employee", con=conn, if_exists='append', index=False)

IntegrityError: (pymysql.err.IntegrityError) (1452, 'Cannot add or update a child row: a foreign key constraint fails (`hr`.`employee`, CONSTRAINT `fk_EMPLOYEE_JOB_HISTORY1` FOREIGN KEY (`EMP_ID`) REFERENCES `job_history` (`EMPL_ID`))')
[SQL: INSERT INTO employee (`EMP_ID`, `F_NAME`, `L_NAME`, `SSN`, `B_DATE`, `SEX`, `ADDRESS`, `JOB_ID`, `SALARY`, `MANAGER_ID`, `DEP_ID`) VALUES (%(EMP_ID)s, %(F_NAME)s, %(L_NAME)s, %(SSN)s, %(B_DATE)s, %(SEX)s, %(ADDRESS)s, %(JOB_ID)s, %(SALARY)s, %(MANAGER_ID)s, %(DEP_ID)s)]
[parameters: ({'EMP_ID': 'E1001', 'F_NAME': 'John', 'L_NAME': 'Thomas', 'SSN': 123456, 'B_DATE': datetime.datetime(1976, 1, 9, 0, 0), 'SEX': 'M', 'ADDRESS': '5631 Rice, OakPark,IL', 'JOB_ID': 100, 'SALARY': 100000, 'MANAGER_ID': 30001, 'DEP_ID': 2}, {'EMP_ID': 'E1002', 'F_NAME': 'Alice', 'L_NAME': 'James', 'SSN': 123457, 'B_DATE': datetime.datetime(1972, 7, 31, 0, 0), 'SEX': 'F', 'ADDRESS': '980 Berry ln, Elgin,IL', 'JOB_ID': 200, 'SALARY': 80000, 'MANAGER_ID': 30002, 'DEP_ID': 5}, {'EMP_ID': 'E1003', 'F_NAME': 'Steve', 'L_NAME': 'Wells', 'SSN': 123458, 'B_DATE': datetime.datetime(1980, 8, 10, 0, 0), 'SEX': 'M', 'ADDRESS': '291 Springs, Gary,IL', 'JOB_ID': 300, 'SALARY': 50000, 'MANAGER_ID': 30002, 'DEP_ID': 5}, {'EMP_ID': 'E1004', 'F_NAME': 'Santosh', 'L_NAME': 'Kumar', 'SSN': 123459, 'B_DATE': datetime.datetime(1985, 7, 20, 0, 0), 'SEX': 'M', 'ADDRESS': '511 Aurora Av, Aurora,IL', 'JOB_ID': 400, 'SALARY': 60000, 'MANAGER_ID': 30004, 'DEP_ID': 5}, {'EMP_ID': 'E1005', 'F_NAME': 'Ahmed', 'L_NAME': 'Hussain', 'SSN': 123410, 'B_DATE': datetime.datetime(1981, 1, 4, 0, 0), 'SEX': 'M', 'ADDRESS': '216 Oak Tree, Geneva,IL', 'JOB_ID': 500, 'SALARY': 70000, 'MANAGER_ID': 30001, 'DEP_ID': 2}, {'EMP_ID': 'E1006', 'F_NAME': 'Nancy', 'L_NAME': 'Allen', 'SSN': 123411, 'B_DATE': datetime.datetime(1978, 2, 6, 0, 0), 'SEX': 'F', 'ADDRESS': '111 Green Pl, Elgin,IL', 'JOB_ID': 600, 'SALARY': 90000, 'MANAGER_ID': 30001, 'DEP_ID': 2}, {'EMP_ID': 'E1007', 'F_NAME': 'Mary', 'L_NAME': 'Thomas', 'SSN': 123412, 'B_DATE': datetime.datetime(1975, 5, 5, 0, 0), 'SEX': 'F', 'ADDRESS': '100 Rose Pl, Gary,IL', 'JOB_ID': 650, 'SALARY': 65000, 'MANAGER_ID': 30003, 'DEP_ID': 7}, {'EMP_ID': 'E1008', 'F_NAME': 'Bharath', 'L_NAME': 'Gupta', 'SSN': 123413, 'B_DATE': datetime.datetime(1985, 5, 6, 0, 0), 'SEX': 'M', 'ADDRESS': '145 Berry Ln, Naperville,IL', 'JOB_ID': 660, 'SALARY': 65000, 'MANAGER_ID': 30003, 'DEP_ID': 7})]
(Background on this error at: https://sqlalche.me/e/14/gkpj)