<a href="https://colab.research.google.com/github/KudasaiCode/Arrays-with-NumPy-and-Pandas/blob/master/Preparing_Cleaning_and_Tranforming_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dimensions of Data Quality

* Validity: Does the data conform to desired definition?

* Accuracy: How well is the data recorded in relation to the real world?

* Completeness: Do we have all the data?

* Consistency: Do two or more representations of the same data, based on the same definition, differ from each other?

* Uniformity: Are we using consistent types of metric or conversion?

* Uniqueness: Is the **same** record stored more than once?

* Timeliness:  Is the data recorded relevant to our analysis?



---



# Creating tables 

In [0]:
from google.colab import files
import sqlite3

In [0]:
# Open Colab's file-upload dialog and keep the result in `uploaded`.
# NOTE(review): `uploaded` is not referenced in the cells visible here — confirm it is still needed.
uploaded = files.upload()

In [0]:
# Connect to the SQLite database file 'dataDB.db' (sqlite3 creates the file if
# it does not exist) and get a cursor for issuing SQL statements.
con = sqlite3.connect('dataDB.db')
cur = con.cursor()

**Creating the tables**

In [0]:
# Create the Employee table. Employee_ID has no declared type, so SQLite
# applies no type conversion to it and stores each value as inserted.
cur.execute("CREATE TABLE Employee(Employee_ID, First_Name TEXT, Last_Name Text, Department TEXT, City TEXT)")

<sqlite3.Cursor at 0x7fef5a93c340>

In [0]:
# Create the Supplier table. The type name is INTEGER; the previous spelling
# "INTEGERS" only worked because SQLite gives INTEGER affinity to any declared
# type containing "INT".
cur.execute('CREATE TABLE Supplier(Supplier_ID INTEGER, Name TEXT, City TEXT)')

<sqlite3.Cursor at 0x7fef5925e3b0>

In [0]:
# Create the Sales table. The type name is INTEGER; the previous spelling
# "INTEGERS" only worked because SQLite gives INTEGER affinity to any declared
# type containing "INT".
cur.execute('CREATE TABLE Sales(Employee_ID INTEGER, First_Name TEXT, Last_Name TEXT, Department TEXT, City TEXT)')

<sqlite3.Cursor at 0x7fef5a93c340>

In [0]:
# Create the Customer table. The type name is INTEGER; the previous spelling
# "INTEGERS" only worked because SQLite gives INTEGER affinity to any declared
# type containing "INT".
cur.execute('CREATE TABLE Customer(Customer_ID INTEGER, First_Name TEXT, Last_Name TEXT, City TEXT)')

<sqlite3.Cursor at 0x7fef5a93c340>

In [0]:
# Create the Product table. Only Product_Name has a declared type; the other
# columns are untyped, so SQLite stores each value as inserted (numbers and
# strings can be mixed in one column).
cur.execute('CREATE TABLE Product(Product_ID, Product_Name TEXT, Price, Supplier_ID, Supplier_Name)')

<sqlite3.Cursor at 0x7fef5a93c340>

In [0]:
# Create the Orders table: four INTEGER id columns plus the order date kept as TEXT.
cur.execute('CREATE TABLE Orders(Order_ID INTEGER, Customer_ID INTEGER, Product_ID INTEGER, Employee_ID INTEGER, Date TEXT)')

<sqlite3.Cursor at 0x7fef5925e3b0>



---



**Creating Values**

In [0]:
# Seed rows for the Employee table, one tuple per row in column order:
# (Employee_ID, First_Name, Last_Name, Department, City).
# The rows include irregular records: Employee_ID values that repeat (10012,
# 10014) or are not numeric ('3k3d8leu'), blank Last_Name / Department fields,
# several spellings of the same city, and inconsistent name casing/characters.
# NOTE(review): these look like deliberate examples for the cleaning steps —
# keep the values exactly as written.
employees = [
    (10001, 'Daniel', 'Olsong', 'Electronics', 'San Francisco'),
    (10002, 'Nick', 'Markmen', 'Clothes', 'San Franciso'), 
    (10003, 'Maria', 'Gonzalez', 'Electronics', 'San Jose'),
    (10004, 'George', 'Wong', 'Clothes', 'San Franciso'),
    (10005, 'Stephanie', 'Williams', 'Clothes', 'Oakland'),
    (10006, 'Miguel', 'Alva', 'Clothes', 'Oakland'),
    (10007, 'Jessica', 'Collins', 'Cosmetics', 'San Franciso'),
    (10008, 'Ted', 'Anderson', 'Clothes', 'Oakland'),
    (10009, 'Victoria', 'Garcia', 'Clothes', 'San Jose'),
    (10010, 'Jasmine', 'Khan', 'Parmacy Health & Beauty', 'Oakland'),
    (10011, 'Adam', 'Smith', 'Parmacy Health & Beauty', 'San Jose'),
    (10012, 'Rachel', 'Kim', 'Electronics', 'San Jose'),
    (10012, 'Adam R.', 'West', 'Clothes', 'San Jose'),
    (10012, 'Adam', 'West', 'Clothes', 'San Jose'),
    (10013, 'Kim', 'Ng', '', 'San Francisco'),
    (10014, 'Nicole', '', 'Parmacy Health & Beauty', 'Oakland'),
    ('3k3d8leu', 'James', 'TELLER', 'Clothes', 'SANJOSE,CA'),
    (10014, 'Carie', 'Olson', 'Clothes/Electronics', 'SF'),
    (10015, 'JonathaN_', 'hernandez', 'N/A', 'San Jose'),
]

In [0]:
# Seed rows for the Supplier table, one tuple per row in column order:
# (Supplier_ID, Name, City).
# NOTE(review): supplier 400006 has six digits while the products list refers
# to supplier 40006 for 'WearUps Inc' — presumably a deliberate mismatch for
# the cleaning steps; confirm before "fixing" it.
suppliers = [
    (40001, 'Music Vibrations, Inc', 'Oakland'),
    (40002, 'BioMedInc', 'San Franciso'),
    (40003, 'FashionRUs', 'San Jose'),
    (40004, 'Studio Warehouse', 'San Franciso'),
    (40005, 'CompTech, Inc', 'San Jose'),
    (400006, 'WearUps Inc', 'NY'),
    (40007, 'DataQuery Inc', 'SJ'),
    (40008, 'Ropa Inc', 'Mexico'),
]

In [0]:
# Seed rows for the Sales table, one tuple per row in column order:
# (Employee_ID, First_Name, Last_Name, Department, City).
# Covers Employee_IDs 10001-10012; several fields are spelled differently from
# the matching rows in `employees` (e.g. 'Olson' here vs 'Olsong' there).
sales = [
    (10001, 'Daniel' ,'Olson', 'Electronics', 'San Francisco'),
    (10002, 'Nick','markmen', 'Clothes', 'San Francisco'),
    (10003, 'Maria' ,'Gonzales', 'Electronics', 'San Jose'),
    (10004, 'George' ,'Wong', 'Clothes', 'San Francisco'),
    (10005, 'Stephanie' ,'Williams', 'Clothes', 'Oakland'),
    (10006, 'Miguel' ,'Alva', 'Clothes', 'Oakland'),
    (10007, 'Jessica' ,'Collins', 'Electronics', 'San Francisco'),
    (10008, 'Ted' ,'Anderson', 'Clothes', 'Oakland'),
    (10009, 'Victoria' ,'Garcia', 'Clothes', 'San Jose'),
    (10010, 'Jjasmine' ,'Khan', 'Pharmacy, Halth, and Beauty', 'Oakland'),
    (10011, 'Adam' ,'Smith', 'Pharmacy, Halth, and Beauty', 'San Jose'),
    (10012, 'Rachel' ,'Kim', 'Electronics', 'San Jose'),
]


In [0]:
# Seed rows for the Customer table, one tuple per row in column order:
# (Customer_ID, First_Name, Last_Name, City).
# The last rows are irregular: 'Margret Shang' sits in First_Name with a blank
# Last_Name, the IDs jump from 20014 to 20018, and the cities use informal
# names ('Oak Town', 'Frisco').
customers = [
    (20001, 'Jack', 'Ward', 'San Jose'),
    (20002, 'Steven', 'Martinez', 'San Francisco'),
    (20003, 'Jessica', 'Collins', 'San Jose'),
    (20004, 'Carie', 'Robinson', 'San Francisco'),
    (20005, 'Zack', 'Peterson', 'Oakland'),
    (20006, 'Bianca', 'Sanchez', 'San Francisco'),
    (20007, 'James', 'Owen', 'Oakland'),
    (20008, 'Lisa', 'Smith', 'San Francisco'),
    (20009, 'Daniel', 'Yasukawa', 'Oakland'),
    (20010, 'Lauren', 'Pham', 'San Jose'),
    (20011, 'Juan', 'Diaz', 'Oakland'),
    (20012, 'Martha', 'Diaz', 'San Francisco'),
    (20013, 'Margret Shang', '', 'Oak Town'),
    (20014, 'Jeremy', 'Fernandez', 'Berkeley'),
    (20018, 'TIffany', 'Williams', 'Frisco'),
]

In [0]:
# Seed rows for the Product table, one tuple per row in column order:
# (Product_ID, Product_Name, Price, Supplier_ID, Supplier_Name).
# The last four rows are irregular: a price written out in words, a
# non-numeric Product_ID and Supplier_ID with blank Price / Supplier_Name, and
# IDs in the 4xxxx range instead of 3xxxx.
products = [
    (30001, 'T-Shirt', 12.98, 40004, 'Studio Warehouse'),
    (30002, 'Tooth Paste', 4.5, 40002, 'BioMed Inc'),
    (30003, 'Speakers', 56.99, 40005, 'CompTech, Inc'),
    (30004, 'Pants', 34.35, 40003, 'FashionRUs'),
    (30005, 'Microphone', 46.21, 40001, 'Music Vibrations, Inc'),
    (30006, 'Tie', 11.09, 40003, 'FashionRUs'),
    (30007, 'Chap Stick', 3.67, 40002, 'BioMed Inc'),
    (30008, 'Medicine', 23.96, 40002, 'BioMed Inc'),
    (30009, 'Headphones', 31.44, 40001, 'Music Vibrations, Inc'),
    (30010, 'Lotion', 16.77, 40002, 'BioMed Inc'),
    (30011, 'DVD Player', 149.99, 40005, 'CompTech, Inc'),
    (30012, 'Coat', 56.33, 40004, 'Studio Warehouse'),
    (40013, 'Black Pack', 'Twenty Two Dollars', 40006, 'WearUps Inc'),
    (40014, 'Auditing Services', 7000, 40007, 'DataQuery Inc'),
    ('12!d3&)dkc_+@ad', 'Car', '', '48350254-5qe', ''),
    (40015, 'Shirt', 180, 40004, 'Ropa Inc'),
]

In [0]:
# Seed rows for the Orders table, one tuple per row in column order:
# (Order_ID, Customer_ID, Product_ID, Employee_ID, Date).
# Dates are plain strings and not all in the same format ('17-02-12', '09-26',
# '02-21-01'). The last three rows also contain a blank Customer_ID and IDs
# that do not appear in the customers / products / employees lists
# (e.g. 50008, 20022, 30096, 10123).
orders = [
    (50001, 20005, 30005, 10001, '17-02-12'),
    (50002, 20003, 30008, 10011, '17-09-03'),
    (50003, 20010, 30001, 10009, '17-11-23'),
    (50004, 20004, 30008, 10010, '17-01-28'),
    (50005, 20008, 30012, 10002, '17-08-11'),
    (50006, 20011, 30011, 10001, '17-04-13'),
    (50007, 20012, 30002, 10007, '17-10-07'),
    (50008, 20002, 30001, 10004, '17-03-09'),
    (50009, 20001, 30006, 10009, '17-12-26'),
    (50010, 50008, 30007, 10008, '09-26'),
    (50017, '', 30096, 10012, '12-26'),
    (50018, 20022, 30007, 10123, '02-21-01'),
]



---



**Inserting values**

In [0]:
# Bulk-insert the employee tuples; each "?" placeholder binds one tuple element in order.
cur.executemany("INSERT INTO Employee VALUES (?,?,?,?,?)", employees)

<sqlite3.Cursor at 0x7fef5a93c340>

In [0]:
# Bulk-insert the supplier tuples; each "?" placeholder binds one tuple element in order.
cur.executemany("INSERT INTO Supplier VALUES (?,?,?)", suppliers)

<sqlite3.Cursor at 0x7fef5925e3b0>

In [0]:
# Bulk-insert the sales tuples; each "?" placeholder binds one tuple element in order.
cur.executemany("INSERT INTO Sales VALUES (?,?,?,?,?)", sales)

<sqlite3.Cursor at 0x7fef5a93c340>

In [0]:
# Bulk-insert the customer tuples; each "?" placeholder binds one tuple element in order.
cur.executemany("INSERT INTO Customer VALUES (?,?,?,?)", customers)

<sqlite3.Cursor at 0x7fef5a93c340>

In [0]:
# Bulk-insert the product tuples; each "?" placeholder binds one tuple element in order.
cur.executemany("INSERT INTO Product VALUES (?,?,?,?,?)", products)

<sqlite3.Cursor at 0x7fef5a93c340>

In [0]:
# Bulk-insert the order tuples; each "?" placeholder binds one tuple element in order.
cur.executemany("INSERT INTO Orders VALUES (?,?,?,?,?)", orders)

<sqlite3.Cursor at 0x7fef5925e3b0>

In [0]:
# Commit the pending transaction so the inserted rows are written to the database file.
con.commit()



---



# Load tables into Data Frame

In [0]:
import numpy as np
import pandas as pd

In [0]:
# Read every row of the Employee table into a DataFrame through the open SQLite connection.
employee_df = pd.read_sql_query('SELECT * FROM Employee', con)

In [0]:
# Read every row of the Supplier table into a DataFrame through the open SQLite connection.
supplier_df = pd.read_sql_query('SELECT * FROM Supplier', con)

In [0]:
# Read every row of the Sales table into a DataFrame through the open SQLite connection.
sales_df = pd.read_sql_query('SELECT * FROM Sales', con)

In [0]:
# Read every row of the Customer table into a DataFrame through the open SQLite connection.
customer_df = pd.read_sql_query('SELECT * FROM Customer', con)

In [0]:
# Read every row of the Product table into a DataFrame through the open SQLite connection.
product_df = pd.read_sql_query('SELECT * FROM Product', con)

In [0]:
# Read every row of the Orders table into a DataFrame through the open SQLite connection.
orders_df = pd.read_sql_query('SELECT * FROM Orders', con)



---



### Google Worksheets

In [0]:
# Colab-only setup: install gspread, authenticate the current user, and build
# an authorized gspread client.
!pip install --upgrade -q gspread

from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

# NOTE(review): oauth2client is deprecated upstream, and the unpinned --upgrade
# above may pull a gspread release that expects google-auth credentials —
# confirm this call still works on a fresh runtime.
# `gc` is the client that get_df() reads; the name is the same as the stdlib
# `gc` module.
gc = gspread.authorize(GoogleCredentials.get_application_default())

In [0]:
def get_df(file_name, location):
    """Load one worksheet of a Google Sheets file into a DataFrame.

    The first row of the worksheet supplies the column names and the
    remaining rows become the data. Cell values are kept exactly as returned
    by ``worksheet.get_all_values()``.

    Args:
        file_name: Spreadsheet name passed to ``gc.open``.
        location: Worksheet index passed to ``get_worksheet``.

    Returns:
        A DataFrame built from the worksheet rows, or an empty DataFrame when
        the worksheet returns no rows at all.
    """
    worksheet = gc.open(file_name).get_worksheet(location)
    rows = worksheet.get_all_values()
    # Without a header row there is nothing to name the columns with.
    if not rows:
        return pd.DataFrame()
    head, *data = rows
    # The constructor is `from_records` (plural); `from_record` does not exist
    # on DataFrame and raised AttributeError.
    return pd.DataFrame.from_records(data, columns=head)



---



# Data Quality

In [0]:
import numpy as np
import pandas as pd

## Observing the Data

**Observations**

* Employee_ID repeats, incorrect type

* Missing Last_Name and Department

* City spelling errors, etc

* Last_Name in all capitals and/or not starting with a capital letter

* First_Name repeats and contains wrong characters



---



In [0]:
# Display the Employee rows loaded from SQLite for visual inspection.
employee_df

Unnamed: 0,Employee_ID,First_Name,Last_Name,Department,City
0,10001,Daniel,Olsong,Electronics,San Francisco
1,10002,Nick,Markmen,Clothes,San Franciso
2,10003,Maria,Gonzalez,Electronics,San Jose
3,10004,George,Wong,Clothes,San Franciso
4,10005,Stephanie,Williams,Clothes,Oakland
5,10006,Miguel,Alva,Clothes,Oakland
6,10007,Jessica,Collins,Cosmetics,San Franciso
7,10008,Ted,Anderson,Clothes,Oakland
8,10009,Victoria,Garcia,Clothes,San Jose
9,10010,Jasmine,Khan,Parmacy Health & Beauty,Oakland


In [0]:
# Display the Customer rows loaded from SQLite for visual inspection.
customer_df

Unnamed: 0,Customer_ID,First_Name,Last_Name,City
0,20001,Jack,Ward,San Jose
1,20002,Steven,Martinez,San Francisco
2,20003,Jessica,Collins,San Jose
3,20004,Carie,Robinson,San Francisco
4,20005,Zack,Peterson,Oakland
5,20006,Bianca,Sanchez,San Francisco
6,20007,James,Owen,Oakland
7,20008,Lisa,Smith,San Francisco
8,20009,Daniel,Yasukawa,Oakland
9,20010,Lauren,Pham,San Jose


In [0]:
# Display the Product rows loaded from SQLite for visual inspection.
product_df

Unnamed: 0,Product_ID,Product_Name,Price,Supplier_ID,Supplier_Name
0,30001,T-Shirt,12.98,40004,Studio Warehouse
1,30002,Tooth Paste,4.5,40002,BioMed Inc
2,30003,Speakers,56.99,40005,"CompTech, Inc"
3,30004,Pants,34.35,40003,FashionRUs
4,30005,Microphone,46.21,40001,"Music Vibrations, Inc"
5,30006,Tie,11.09,40003,FashionRUs
6,30007,Chap Stick,3.67,40002,BioMed Inc
7,30008,Medicine,23.96,40002,BioMed Inc
8,30009,Headphones,31.44,40001,"Music Vibrations, Inc"
9,30010,Lotion,16.77,40002,BioMed Inc


In [0]:
# Display the Supplier rows loaded from SQLite for visual inspection.
supplier_df

Unnamed: 0,Supplier_ID,Name,City
0,40001,"Music Vibrations, Inc",Oakland
1,40002,BioMedInc,San Franciso
2,40003,FashionRUs,San Jose
3,40004,Studio Warehouse,San Franciso
4,40005,"CompTech, Inc",San Jose
5,400006,WearUps Inc,NY
6,40007,DataQuery Inc,SJ
7,40008,Ropa Inc,Mexico


In [0]:
# Display the Orders rows loaded from SQLite for visual inspection.
orders_df

Unnamed: 0,Order_ID,Customer_ID,Product_ID,Employee_ID,Date
0,50001,20005.0,30005,10001,17-02-12
1,50002,20003.0,30008,10011,17-09-03
2,50003,20010.0,30001,10009,17-11-23
3,50004,20004.0,30008,10010,17-01-28
4,50005,20008.0,30012,10002,17-08-11
5,50006,20011.0,30011,10001,17-04-13
6,50007,20012.0,30002,10007,17-10-07
7,50008,20002.0,30001,10004,17-03-09
8,50009,20001.0,30006,10009,17-12-26
9,50010,50008.0,30007,10008,09-26




---



## Data Cleaning