# Jeff Pinegar
Project 2: ETL 
Due Dec. 23, 2022

### Extract and Transform MUDData Sales Data
---

In [None]:
# Import needed libraries
import pandas as pd     # if you get an error message module not found you need to add this to your environment.
import os               # this tells your project about the OS of your machine.  This will let you create machine independent notebooks
import numpy as np      # loads in the math tools
from datetime import datetime

---
### Extract

In [None]:
# Import data file
# The path is built relative to the notebook's working directory ("."):
# the code reads ./Resources/133156280355595299.csv.
# NOTE(review): the previous comment called this file
# "IPC shipped from Germany by serial number 2022 (until 5.12.2022).csv" --
# presumably the original name of the same export; confirm.
sales_data = os.path.join('.', 'Resources', '133156280355595299.csv')

# Read the CSV file into a pandas DataFrame, then preview its first two rows.
dfm_raw = pd.read_csv(filepath_or_buffer=sales_data, encoding="utf-8")
dfm_raw.head(n=2)

---
### Transform

In [None]:
# Confirm the import worked as expected.
# .shape displays the (number of rows, number of columns) of the raw frame.
dfm_raw.shape

In [None]:
# Create a working df that is independent of the raw import.
# A plain assignment would only give dfm_raw a second name; .copy() makes a
# separate DataFrame, so later in-place changes to dfm cannot alter dfm_raw.
dfm = dfm_raw.copy()

In [None]:
# Reduce the data frame to the needed columns and give them more helpful names.
# A single mapping (source column -> new name) drives both steps; the order of
# its keys is the column order of the resulting frame.
column_names = {
    'SN': 'IPC_Serial_No',
    'date': 'Mfg_Date',
    'orderkey': 'Order_Key',
    'CMAT': 'Platform',
    'CMAT_Disp': 'Display',
    'CMAT_Mount': 'Mounting',
    'CMAT_CPU': 'CPU',
    'CMAT_OS': 'OS',
    'CMAT_SW': 'SW',
    'CMAT_RAM': 'RAM',
    'CMAT_NVRAM': 'NVRAM',
    'CMAT_Storage1': 'Mass_Store_1',
    'CMAT_Storage2': 'Mass_Store_2',
    'CMAT_RAID': 'RAID',
    'CMAT_DVD': 'DVD',
    'CMAT_PCI': 'PCI',
    'CMAT_PCIslot1': 'PCI_Slot_1',
    'CMAT_PCIslot2': 'PCI_Slot_2',
    'CMAT_ExtFun': 'Ext_Func',
    'CMAT_miniPCI': 'MiniPCI',
    'CMAT_PS': 'PS',
}

# Select the mapped source columns first, then rename them.
dfm = dfm[list(column_names)].rename(columns=column_names)

In [None]:
# Remove rows that are exact duplicates across every column,
# keeping the first occurrence of each (the drop_duplicates default).
dfm = dfm.drop_duplicates()

# Preview the first two rows of the de-duplicated frame.
dfm.head(n=2)

#### Convert Data types

In [None]:
# Convert mfg date to datetime

# Define a function to change the format
def convertdate(date_in_dmyyyy_format):
    """Return the calendar date of a manufacturing timestamp as 'YYYY-MM-DD'.

    Parameters
    ----------
    date_in_dmyyyy_format : str or datetime
        Either text in month/day/year hour:minute form (parsed with
        "%m/%d/%Y %H:%M", e.g. "1/5/2021 14:30"), or an already-parsed
        datetime object. Note that, despite the parameter name, the month
        comes first in the text form.

    Returns
    -------
    str
        The date part formatted with "%Y-%m-%d"; the time of day is dropped.

    Raises
    ------
    ValueError
        If a text value does not match "%m/%d/%Y %H:%M".
    """
    # Values that are already datetimes (e.g. when the conversion cell is
    # re-run after the column has been converted) only need formatting;
    # re-parsing their string form with the month/day/year pattern would
    # raise ValueError.
    if isinstance(date_in_dmyyyy_format, datetime):
        return date_in_dmyyyy_format.strftime("%Y-%m-%d")

    parsed = datetime.strptime(str(date_in_dmyyyy_format), "%m/%d/%Y %H:%M")
    # strftime already returns a str, so no extra str() wrapper is needed.
    return parsed.strftime("%Y-%m-%d")

# Reduce every Mfg_Date value to a 'yyyy-mm-dd' string with convertdate ...
mfg_date_text = dfm['Mfg_Date'].apply(convertdate)

# ... then parse those strings so the column holds datetime values.
dfm['Mfg_Date'] = pd.to_datetime(mfg_date_text)


In [None]:
# Keep only items manufactured on or after 1 January 2021.
made_since_2021 = dfm['Mfg_Date'] >= '2021-01-01'
dfm = dfm.loc[made_since_2021]

In [None]:
# Keep a single row per serial number: the first one encountered in the frame.
# Duplicates are caused by repairs and rework.
dfm = dfm.drop_duplicates(subset='IPC_Serial_No')

In [None]:
# Bind the fully transformed frame to its final name. This is a plain
# assignment, so dfm_clean and dfm refer to the same DataFrame object.
dfm_clean = dfm

## Save the clean data as a CSV file

In [None]:
# Export as CSV file
# Writes Mfg_Data_Clean.csv into the "Resources" subdirectory, with a header
# row and without the DataFrame index.
file_one = os.path.join('.', 'Resources', 'Mfg_Data_Clean.csv')
dfm_clean.to_csv(path_or_buf=file_one, header=True, index=False)