## Extracts the manufacturing failure data set from the cloud

In [18]:
# Imports 
from pathlib import Path
from urllib.parse import quote

import pandas as pd
import yaml
from sqlalchemy import create_engine

In [19]:
# Task 2
# Step 3: function to parse a YAML file into a dictionary format
# Path of the credentials file; being relative, it is resolved against the
# notebook's current working directory when the file is opened.
yaml_file = 'credentials.yaml'

def yaml_to_dict(yaml_file):
    '''
    Parse a YAML file and return its contents as Python objects.

    Parameters
    ----------
    yaml_file : str or path-like
        Path of the YAML file to read.

    Returns
    -------
    dict
        The parsed document. ``yaml.safe_load`` returns ``None`` for an empty
        file, and a list or scalar when the top-level YAML node is not a
        mapping.

    Raises
    ------
    FileNotFoundError
        If ``yaml_file`` does not exist.
    yaml.YAMLError
        If the file content is not valid YAML.
    '''
    # Pin the encoding so parsing does not depend on the platform's locale default.
    with open(yaml_file, encoding='utf-8') as file:
        return yaml.safe_load(file)

# Test with the "credentials.yaml" file
credentials_dict = yaml_to_dict(yaml_file=yaml_file)
# Mask the password so the secret is not saved in the notebook output.
{key: ('<redacted>' if key == 'RDS_PASSWORD' else value)
 for key, value in credentials_dict.items()}

{'RDS_HOST': 'eda-projects.cq2e8zno855e.eu-west-1.rds.amazonaws.com',
 'RDS_PASSWORD': '<redacted>',
 'RDS_USER': 'manufacturinganalyst',
 'RDS_DATABASE': 'process_data',
 'RDS_PORT': 5432,
 'DATABASE_TYPE': 'postgresql',
 'DBAPI': 'psycopg2'}

In [20]:
# Step 4 & 5 
class RDSDatabaseConnector(dict):
    '''
    Creates a SQLAlchemy engine for the AWS RDS database described by a
    credentials dictionary.

    NOTE(review): the class derives from ``dict`` but never fills itself in;
    the credentials are kept on the ``dict`` attribute instead. The base class
    is left in place so existing ``isinstance`` checks keep working.
    '''

    def __init__(self, dict):
        '''
        Store the connection credentials.

        Parameters
        ----------
        dict : mapping
            Must provide the keys DATABASE_TYPE, DBAPI, RDS_USER,
            RDS_PASSWORD, RDS_HOST, RDS_PORT and RDS_DATABASE. The name
            shadows the builtin and is kept only for backward compatibility.
        '''
        self.dict = dict

    def initialise_SQL_engine(self):
        '''
        Build the engine and check that a connection can be opened.

        Returns
        -------
        engine
            The object returned by ``create_engine`` for the stored
            credentials.

        Raises
        ------
        KeyError
            If a required credential key is missing.

        Any exception raised while opening the test connection propagates
        to the caller.
        '''
        credentials = self.dict
        database_type = credentials['DATABASE_TYPE']
        dbapi = credentials['DBAPI']
        # Percent-encode user and password: characters such as '@', ':', '/'
        # or a space would otherwise be read as URL delimiters.
        user = quote(str(credentials['RDS_USER']), safe='')
        password = quote(str(credentials['RDS_PASSWORD']), safe='')
        host = credentials['RDS_HOST']
        port = credentials['RDS_PORT']
        database = credentials['RDS_DATABASE']

        engine = create_engine(
            f"{database_type}+{dbapi}://{user}:{password}@{host}:{port}/{database}"
        )

        # Open one AUTOCOMMIT connection as a connectivity check and close it
        # straight away so it is released instead of being left open.
        with engine.execution_options(isolation_level='AUTOCOMMIT').connect():
            pass

        return engine

In [21]:
# Step 6 - extract data from RDS and return it as a pandas dataframe 
def extract_data_as_pandas_df(table_name, engine):
    '''
    Load a database table through pandas.

    Parameters
    ----------
    table_name : str
        Name of the table to read.
    engine
        Database connectable handed to ``pd.read_sql_table``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_sql_table`` returns for ``table_name``.
    '''
    return pd.read_sql_table(table_name, engine)

# Step 7 - export the data as a csv
def export_data_as_csv(data, file_name, index=True):
    '''
    Write a DataFrame to ``<file_name>.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to export.
    file_name : str
        Target path without extension; ``.csv`` is always appended.
    index : bool, default True
        Whether to write the row index as the first column. The default keeps
        the previous output; pass ``False`` to avoid an ``Unnamed: 0`` column
        when the file is read back with ``pd.read_csv``.
    '''
    data.to_csv(f"{file_name}.csv", index=index)
#failure_data.to_csv('failure_data.csv')

In [22]:
# Testing the Class
# initialise_SQL_engine() returns the engine it creates, so `database_1`
# holds that engine rather than an RDSDatabaseConnector instance.
database_1 = RDSDatabaseConnector(credentials_dict).initialise_SQL_engine() 
database_1

ImportError: dlopen(/Users/manishkhurmi/anaconda3/envs/manufacturing_eda/lib/python3.12/site-packages/psycopg2/_psycopg.cpython-312-darwin.so, 2): Library not loaded: @rpath/libpq.5.dylib
  Referenced from: /Users/manishkhurmi/anaconda3/envs/manufacturing_eda/lib/python3.12/site-packages/psycopg2/_psycopg.cpython-312-darwin.so
  Reason: image not found

In [6]:
# Testing extract_data_as_pandas_df()
# The 'failure_data' table is read first; .head(1) then keeps only its first row.
df_1 = extract_data_as_pandas_df(table_name='failure_data', engine = database_1).head(1)

In [7]:
# Testing export_data_as_csv()
# Writes the one-row sample to 'test.csv' (the function appends the '.csv' suffix).
export_data_as_csv(df_1, 'test')

In [None]:
# Initial glance at the data
# Build the local CSV from the database when it is missing, so the notebook
# runs on a fresh checkout instead of relying on a previously saved file.
if not Path('failure_data.csv').exists():
    export_data_as_csv(
        extract_data_as_pandas_df(table_name='failure_data', engine=database_1),
        'failure_data',
    )
failure_data = pd.read_csv('failure_data.csv')
failure_data.head(3)


In [None]:
# Summary statistics per column (describe() with its default arguments).
failure_data.describe()

In [None]:
# Percentage of missing values per column; only columns with at least one
# missing value are displayed.
percentage_of_null = failure_data.isnull().sum() / len(failure_data) * 100
percentage_of_null[percentage_of_null > 0]

#### Machine failure dataset data dictionary

- **UID**: Unique identifier of the machining session
- **product_ID**: Product specific serial number
- **Type**: Quality of the product being created: L, M, or H for low-, medium- and high-quality products
- **air temperature [K]**: Average air temperature in the room during the process in Kelvin
- **process temperature [K]**:  Average air temperature the machine was operating at during production in Kelvin
- **Rotational speed [rpm]**: Average revolutions per minute the tool was operating at
- **Torque [Nm]**: Torque generated by the tool in Newton-meters
- **Tool wear [min]**: The accumulated wear on the tool, in minutes. H, M and L product manufacturing cause 5/3/2 minutes of tool wear, respectively.
- **machine failure**: Label that indicates whether the machine failed on this particular run
- **TWF (tool wear failure)**: Failure in the process due to the tool wearing out
- **heat dissipation failure (HDF)**: Lack of heat dissipation caused the process failure
- **power failure (PWF)**: Failure in the process due to lack of power from the tool to complete the process
- **overstrain failure (OSF)**: Failure due to the tool overstraining during the process
- **random failures (RNF)**: Failures in the process which couldn't be categorised

#### Initial thoughts on the data 
- The dependent variable is `machine failure`
- The lower the quality of the product, the less the wear on the tools
- TWF, HDF and OSF are likely to be correlated, which is a potential endogeneity problem.