# Jeff Pinegar
Project 2: ETL 
Due Dec. 23, 2022

### Extract and Transform MUDData Sales Data
---

In [1]:
# Import needed libraries
import pandas as pd     # if you get an error message module not found you need to add this to your environment.
import os               # this tells your project about the OS of your machine.  This will let you create machine independent notebooks
import numpy as np      # loads in the math tools
from datetime import datetime

---
### Extract

In [2]:
# Import data file
# The notebook sits in the current directory "."; the raw export is read from
# the "Resources" sub-directory beside it.
# NOTE(review): an earlier comment named this export
# "IPC shipped from Germany by serial number 2022 (until 5.12.2022).csv";
# the code opens '133156280355595299.csv' - confirm they are the same data.
sales_data = os.path.join('.', 'Resources', '133156280355595299.csv')

# Every column listed here is read as text, so pandas does not infer a
# numeric type for any of them.
text_columns = [
    'SN', 'date', 'orderkey', 'CMAT', 'CMAT_Disp',
    'CMAT_Mount', 'CMAT_CPU', 'CMAT_OS', 'CMAT_SW', 'CMAT_RAM',
    'CMAT_NVRAM', 'CMAT_Storage1', 'CMAT_Storage2', 'CMAT_RAID',
    'CMAT_DVD', 'CMAT_PCI', 'CMAT_PCIslot1', 'CMAT_PCIslot2',
    'CMAT_ExtFun', 'CMAT_miniPCI', 'CMAT_PS', 'article', 'nic',
    'superio', 'MAC1', 'MAC2', 'UUID', 'disk', 'location',
]
Import_data_type = dict.fromkeys(text_columns, 'str')

# Open the CSV file, read it into a pandas DataFrame, then show the first two rows.
dfm_raw = pd.read_csv(sales_data, dtype=Import_data_type, encoding="utf-8")
dfm_raw.head(2)

Unnamed: 0,SN,article,orderkey,nic,superio,MAC1,MAC2,UUID,disk,CMAT,...,CMAT_PCI,CMAT_PCIslot1,CMAT_PCIslot2,CMAT_SW,CMAT_NVRAM,CMAT_ExtFun,CMAT_PS,CMAT_miniPCI,location,date
0,1234567879,1016240,1016240/A22/I46/R30/M72/OS71/S00/EF07/EF00,I210,UNKNOWN,A8741D4BAA21,A8741D4BAA22,9adca857-b628-47d7-8c75-72f10cdbf7c1,WARIS TS46EMM 32G Serial# D373080042,1016240,...,,,,S00,,EF00,,,Taiwan2,12/15/2022 14:56
1,2036471071,1105780,1105780/D30/A20/I47/R26/M86/OS71/S00/EF00/EF00,I210,UNKNOWN,A8741D8F53C9,A8741D8F53C8,0335aca4-dcd4-448b-9795-e7d0c5da6906,,1105780,...,,,,S00,,EF00,,,Bad Pyrmont,12/15/2022 9:45


---
### Transform

In [3]:
# Confirm the import worked as expected: shape is (number of rows, number of columns).
dfm_raw.shape

(103099, 29)

In [4]:
# Create a working df
# .copy() gives the working frame its own data, so dfm_raw stays an untouched
# record of the import even if dfm is later modified in place.
dfm = dfm_raw.copy()

In [5]:
# Reduce the data frame to the needed columns and give them more helpful names.
# One mapping (raw column -> new name) drives both steps, so the list of kept
# columns and the rename can never drift apart.
column_names = {
    'SN': 'IPC_Serial_No',
    'date': 'Mfg_Date',
    'orderkey': 'Order_Key',
    'CMAT': 'Platform',
    'CMAT_Disp': 'Display',
    'CMAT_Mount': 'Mounting',
    'CMAT_CPU': 'CPU',
    'CMAT_OS': 'OS',
    'CMAT_SW': 'SW',
    'CMAT_RAM': 'RAM',
    'CMAT_NVRAM': 'NVRAM',
    'CMAT_Storage1': 'Mass_Store_1',
    'CMAT_Storage2': 'Mass_Store_2',
    'CMAT_RAID': 'RAID',
    'CMAT_DVD': 'DVD',
    'CMAT_PCI': 'PCI',
    'CMAT_PCIslot1': 'PCI_Slot_1',
    'CMAT_PCIslot2': 'PCI_Slot_2',
    'CMAT_ExtFun': 'Ext_Func',
    'CMAT_miniPCI': 'MiniPCI',
    'CMAT_PS': 'PS',
}

# Keep only the mapped columns (in mapping order), then apply the new names.
dfm = dfm[list(column_names)].rename(columns=column_names)

In [6]:
# Drop rows that exactly repeat an earlier row (all kept columns equal).
# drop_duplicates keeps the first occurrence by default.
dfm = dfm.drop_duplicates()

dfm.head(2)

Unnamed: 0,IPC_Serial_No,Mfg_Date,Order_Key,Platform,Display,Mounting,CPU,OS,SW,RAM,...,Mass_Store_1,Mass_Store_2,RAID,DVD,PCI,PCI_Slot_1,PCI_Slot_2,Ext_Func,MiniPCI,PS
0,1234567879,12/15/2022 14:56,1016240/A22/I46/R30/M72/OS71/S00/EF07/EF00,1016240,,A22,I46,OS71,S00,R30,...,M72,,,,,,,EF00,,
1,2036471071,12/15/2022 9:45,1105780/D30/A20/I47/R26/M86/OS71/S00/EF00/EF00,1105780,D30,A20,I47,OS71,S00,R26,...,M86,,,,,,,EF00,,


#### Convert Data types

In [7]:
# Convert mfg date to datetime

# Define a function to change the format
def convertdate(date_in_dmyyyy_format):
    """Return the calendar date of a manufacturing timestamp as 'YYYY-MM-DD' text.

    Parameters
    ----------
    date_in_dmyyyy_format
        Either text in month/day/year hour:minute form, e.g.
        "12/15/2022 14:56" (despite the parameter name, the month comes
        first), or an already-parsed datetime / pandas Timestamp.

    Returns
    -------
    str
        The date part formatted as "%Y-%m-%d"; the time of day is dropped.

    Raises
    ------
    ValueError
        If the value is text that does not match "%m/%d/%Y %H:%M".
    """
    if isinstance(date_in_dmyyyy_format, datetime):
        # Already parsed (pandas Timestamp subclasses datetime). Accepting it
        # lets the conversion cell be run again without failing.
        parsed = date_in_dmyyyy_format
    else:
        parsed = datetime.strptime(str(date_in_dmyyyy_format), "%m/%d/%Y %H:%M")
    # strftime already returns str, so no further conversion is needed.
    return parsed.strftime("%Y-%m-%d")

# Reformat every Mfg_Date value to yyyy-mm-dd text (the time of day is dropped).
mfg_date_text = dfm['Mfg_Date'].apply(convertdate)

# Parse that text so the column holds pandas datetime values.
dfm['Mfg_Date'] = pd.to_datetime(mfg_date_text)


In [8]:
# Spot-check that the conversion produced pandas Timestamp values.
# .iloc[0] takes the first remaining row by position; a label lookup such as
# [3] raises KeyError when that row was removed by the earlier de-duplication.
type(dfm['Mfg_Date'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [9]:
# Keep only items manufactured on or after 1 Jan 2021.
is_recent = dfm['Mfg_Date'] >= '2021-01-01'
dfm = dfm.loc[is_recent]

In [10]:
# Keep one row per serial number: the first one in the current row order
# (drop_duplicates' default). Duplicates are caused by repairs and rework.
dfm = dfm.drop_duplicates(subset='IPC_Serial_No')

In [11]:
# Save the clean data frame under its own name.
# .copy() makes dfm_clean independent of dfm, so later changes to dfm cannot
# alter the data that gets exported.
dfm_clean = dfm.copy()

#### Save the clean data as a CSV file

In [12]:
# Export as CSV file
# Writes Mfg_Data_Clean.csv into the "Resources" sub-directory, with a header
# row and without the DataFrame index.
file_one = os.path.join('.', 'Resources', 'Mfg_Data_Clean.csv')
dfm_clean.to_csv(
    file_one,
    header=True,
    index=False,
)

---
## To load the data, run: Load_Postgres_database.ipynb
---