In [1]:
from IPython.display import Image

# <a id="1.Summary"> 1.Summary</a>

I have attached a spreadsheet containing five years of production and weather data for our Middlesex, NJ and Bethlehem, PA facilities. When looking at the production data if the “BatchNumber” listed starts with PASD it was done in Bethlehem and if it starts with NJSD it was done in Middlesex. I included some info that we have on flowability and hygroscopicity but it’s not empirical data; mostly a judgement call made for “reasons”. Weather data was downloaded from openweathermap.org, temperatures are in °F. Let me know if you need any clarification on the data provided.


More specifically, given the dependence of their drying products and services on weather
conditions, the company is interested in using weather data to forecast the processing times for
different batches of SKUs. 

The team is expected to:

• Develop a database with manufacturing data that will be used to develop statistical
models

• Develop, manage and maintain statistical models, including but not limited to:

o Forecasting of processing times based on weather data

o Forecasting downtime and preventive maintenance issues

<br>

<br>

<br>

# <a id="2.Table of Contents">2.Table of Contents</a>
<a href="#1.Summary">Click this Link back to Top</a>

<ol>
    <li><a href="#1.Summary">Summary</a></li>
    <li><a href="#2.Table of Contents">Table of Contents</a></li>
    <li><a href="#3.Preprocess">Preprocess</a>
    <ul>
        <li><a href="#3.1 Hyperparameter">3.1 Hyperparameter</a></li>
        <li><a href="#3.2 Import Data">3.2 Import Data</a></li>
        <li><a href="#3.3 Clean Data">3.3 Clean Data</a></li> 
        <li><a href="#3.4 Tokenize">3.4 Tokenize</a></li> 
        <li><a href="#3.5 Label Processing">3.5 Label Processing</a></li>  
        <li><a href="#3.6 EDA">3.6 EDA</a></li>  
    </ul>
    </li>
    <li><a href="#4.Word2Vect">Word2Vect</a>
    <ul>
        <li><del><a href="#TFIDF">4.1 TFIDF</a></del></li>
        <li><del><a href="#Embedding">4.2 Embedding</a></del></li>
        <li><a href="#Glove Vector">4.3 Glove Vector</a></li>
        <li><a href="#BERT Vector">4.4 BERT Vector</a></li> 
        <li><a href="#Split and Compile">4.5 Split and Compile</a></li> 
    </ul>
    </li>
    <li><a href="#5.Classify">Classify</a></li>
    <li><a href="#6.Models">Models</a>
    <ul>
        <li><a href="#6.1 DNN">6.1 DNN</a></li>
        <li><a href="#6.2 CNN">6.2 CNN</a></li>
        <li><a href="#6.3 RNN">6.3 RNN</a></li>
        <li><a href="#6.4 BERT">6.4 BERT</a></li>
        <li><a href="#6.5 Seq2Seq">6.5 Seq2Seq</a></li>
        <li><a href="#6.9 Save">6.9 Save</a></li>
    </ul>
    </li>
    <li><a href="#7.Analysis and Plot">Analysis and Plot</a>
    <ul>
        <li><a href="#7.1 Analysis">7.1 Analysis</a></li>
        <li><a href="#7.2 Spearmanr">7.2 Spearmanr</a></li>
    </ul>
    </li>
    <li><a href="#8.Main Function">Main Function</a></li>
    <li><a href="#9.Test Code">Test Code</a></li>
</ol>

# <a id="3.Preprocess">3.Preprocess</a>
<a href="#2.Table of Contents">Click this Link back to Top</a>

1. Transform the data columns into a consistent format.
2. Split the data into two parts by site: NJ and PA.
3. 

## <a id="3.1 Hyperparameter">3.1 Hyperparameter</a>
<a href="#2.Table of Contents">Click this Link back to Top</a>

In [50]:
class HyperParamters(object):
    """
    Shared configuration holder.

    Other classes inherit from this class to obtain the common settings
    (hyperparameters) it defines, so values only need to be changed in one place.

    ########################__Rules__##################################
    class file name = class + _ + number of sequence + function name
    class name = ClassType (Camel-Case)
    function name = function_name
    variable name = attribute_type_describe (Hungarian notation) # sometimes the attribute part is omitted
    constant = UPPERCASE
    ###########################################################################


    ########################__Notation__############################################
    1. We might need to split the data by site (NJ and PA), but we can also try to merge the two
    locations into one, because they are not far apart
    2. Clean data, transform date format, join by date, coefficient analysis
    3. Check kaggle format
    4. Check journals
    5. The traditional way is to find the relationship between total running time and the weather features
    6. But we need to consider that equipment service life or depreciation will affect running time
    7. Sometimes we don't use the running time itself; we can use a statistical version of it,
    for example running time - avg, or the variance of each running time data point
    #############################################################################

    TODO:
    1. change import data path

    Attributes:
    ----------
    TEST:int
        flag set to 1 by the constructor
    ROOTPATH:str
        root directory that import_data() searches for the data files
    """

    # Default data directory; used whenever the constructor is called without root_path.
    DEFAULT_ROOTPATH = 'D:\\OneDrive\\03_Academic\\23_Github\\20_Stevens\\66-MGT-809\\03_data'

    def __init__(self, root_path=None):
        """
        Parameters:
        ----------
        root_path:str, optional
            directory that contains the data files. When omitted (None), DEFAULT_ROOTPATH is used,
            so existing callers that pass no argument keep the previous behaviour.
        """
        self.TEST = 1

        # import_data() searches for its files starting from this root directory; pass root_path
        # (or change DEFAULT_ROOTPATH) to run on a machine where the data lives somewhere else
        self.ROOTPATH = self.DEFAULT_ROOTPATH if root_path is None else root_path


<br>

<br>

<br>

## <a id="3.2 Import Data">3.2 Import Data</a>
<a href="#2.Table of Contents">Click this Link back to Top</a>

In [54]:
# from class_31_hyperparameters import HyperParamters

# read xlsx file
import pandas as pd
# measure running time
from time import time
# join directory path
import os




class ImportData(HyperParamters):
    """
    Loads the production workbook into DataFrames.

    Inherits ROOTPATH (the directory that holds the workbook) from HyperParamters.
    """

    def __init__(self, root_path=None):
        """
        Parameters:
        ----------
        root_path:str, optional
            directory containing the workbook; when omitted, the ROOTPATH set by
            HyperParamters is used unchanged.
        """
        # inherit the shared settings from HyperParamters
        HyperParamters.__init__(self)
        if root_path is not None:
            self.ROOTPATH = root_path

    def import_data(self):
        """
        Read the production sheet and the two weather sheets from the workbook under ROOTPATH.

        The workbook is opened and parsed once for all three sheets instead of once per sheet.
        Start/end banners with the elapsed time are printed to stdout.

        Returns:
        ----------
        df_product:DataFrame
            real production data including running time
        df_nj_weather:DataFrame
            Middlesex, NJ factory weather data from openweathermap.org
        df_pa_weather:DataFrame
            Bethlehem, PA factory weather data (sheet 'BethlehemWeather')
        """
        print("*" * 50, "Start import_data()", "*" * 50)
        start_time = time()

        # full path of the excel workbook
        excel_product = os.path.join(self.ROOTPATH, '017_20160101_20201231_ProductionData.xlsx')

        # sheet names inside the workbook; the production name ends in "ProductionDat"
        # (presumably cut off by Excel's 31-character sheet-name limit)
        sheet_product = '20160101_20201231_ProductionDat'
        sheet_nj_weather = 'MiddlesexWeather'
        sheet_pa_weather = 'BethlehemWeather'

        # passing a list of sheet names makes read_excel return {sheet name: DataFrame}
        sheets = pd.read_excel(
            excel_product,
            sheet_name=[sheet_product, sheet_nj_weather, sheet_pa_weather],
        )
        df_product = sheets[sheet_product]
        df_nj_weather = sheets[sheet_nj_weather]
        df_pa_weather = sheets[sheet_pa_weather]

        cost_time = round((time() - start_time), 4)
        print("*" * 40, "End import_data() with {} second".format(cost_time), "*" * 40, end='\n\n')

        return df_product, df_nj_weather, df_pa_weather



# <a id="4.EDA">4.EDA</a>
<a href="#2.Table of Contents">Click this Link back to Top</a>

In [3]:
import pandas as pd

In [6]:
# df_1: production-data sheet of the workbook (the sheet ImportData.import_data() returns as df_product),
# read here from a local copy under D:\03_data for exploration
df_1 = pd.read_excel('D:\\03_data\\017_20160101_20201231_ProductionData.xlsx', sheet_name='20160101_20201231_ProductionDat')

In [7]:
df_1.shape

(9970, 16)

In [11]:
df_1.columns

Index(['StartDate', 'BatchNumber', 'CustItem', 'Dryer', 'Flow',
       'Hygroscopicity', 'Bulk Density', 'Moisture Target', 'ScheduledDryQty',
       'ActualDryQty', 'YieldPercentage', 'Rate', 'DryingTime_Hrs',
       'CleanTime_Hrs', 'DownTime_Hrs', 'TotalRunTime_Hrs'],
      dtype='object')

In [13]:
df_1.head()

Unnamed: 0,StartDate,BatchNumber,CustItem,Dryer,Flow,Hygroscopicity,Bulk Density,Moisture Target,ScheduledDryQty,ActualDryQty,YieldPercentage,Rate,DryingTime_Hrs,CleanTime_Hrs,DownTime_Hrs,TotalRunTime_Hrs
0,2016-01-02 08:54:58.000,NJSD313825,10013-0000,Dryer 04,Good,Medium,Target = 0.30 g/mL,2 to 4% Max,333.7357,352.2,105.5,146.75,2.4,5.1,0.0,7.5
1,2016-01-02 11:06:57.863,NJSD312807,10558-0000,Dryer 03,Good,Medium,Record,10% Max.,4270.412,3935.3999,92.2,468.499988,8.4,0.2,4.2,12.8
2,2016-01-02 21:40:51.470,NJSD313826,10447-0000,Dryer 04,Good,Medium,,3% Max,1353.0969,1195.9,88.4,123.28866,9.7,5.7,0.9,16.2
3,2016-01-02 22:45:11.673,NJSD312677,24164-0000,Dryer 02,Good,Medium,,5% Max,19549.0975,18185.1,93.0,335.51845,54.2,11.1,7.6,72.8
4,2016-01-03 00:10:32.910,NJSD314508,10558-0000,Dryer 03,Good,Medium,Record,10% Max.,4282.172,3373.1999,78.8,411.365841,8.2,0.1,3.5,11.8


In [31]:
# the sheet has 10 distinct dryer labels: Dryer 01-04 and Dryer 06-11
# (numbering goes up to 11, but no "Dryer 05" appears in the data)
df_1.groupby(['Dryer']).count()

Unnamed: 0_level_0,StartDate,BatchNumber,CustItem,Flow,Hygroscopicity,Bulk Density,Moisture Target,ScheduledDryQty,ActualDryQty,YieldPercentage,Rate,DryingTime_Hrs,CleanTime_Hrs,DownTime_Hrs,TotalRunTime_Hrs
Dryer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Dryer 01,2203,2203,2203,2202,2202,426,2202,2203,2203,2203,2203,2203,2203,2203,2203
Dryer 02,1228,1228,1228,1226,1226,353,1227,1228,1228,1228,1228,1228,1228,1228,1228
Dryer 03,1318,1318,1318,1317,1317,606,1317,1318,1318,1318,1317,1318,1318,1318,1318
Dryer 04,1468,1468,1468,1468,1468,570,1468,1468,1468,1468,1468,1468,1468,1468,1468
Dryer 06,875,875,875,859,859,328,867,874,871,873,869,875,875,875,875
Dryer 07,797,797,797,797,797,604,797,797,796,796,792,797,797,797,797
Dryer 08,720,720,719,719,719,476,719,720,718,718,717,720,720,720,720
Dryer 09,546,546,546,545,545,140,543,546,542,543,536,546,546,546,546
Dryer 10,793,793,793,791,791,344,791,793,791,792,789,793,793,793,793
Dryer 11,22,22,22,22,22,12,22,22,21,22,21,22,22,22,22


In [32]:
# Flow has two levels, Good and Poor, and the data is imbalanced: Good has far more rows than Poor
df_1.groupby(['Flow']).count()

Unnamed: 0_level_0,StartDate,BatchNumber,CustItem,Dryer,Hygroscopicity,Bulk Density,Moisture Target,ScheduledDryQty,ActualDryQty,YieldPercentage,Rate,DryingTime_Hrs,CleanTime_Hrs,DownTime_Hrs,TotalRunTime_Hrs
Flow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Good,9503,9503,9502,9503,9498,3676,9499,9502,9492,9496,9480,9503,9503,9503,9503
Poor,443,443,443,443,443,179,443,443,440,441,440,443,443,443,443


In [26]:
# Hygroscopicity has three named levels (High, Low, Medium) plus a few rows whose label displays as blank;
# it is also imbalanced, with Medium covering most rows
df_1.groupby(['Hygroscopicity']).count()

Unnamed: 0_level_0,StartDate,BatchNumber,CustItem,Dryer,Flow,Bulk Density,Moisture Target,ScheduledDryQty,ActualDryQty,YieldPercentage,Rate,DryingTime_Hrs,CleanTime_Hrs,DownTime_Hrs,TotalRunTime_Hrs
Hygroscopicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
High,496,496,496,496,496,191,496,496,495,495,495,496,496,496,496
Low,121,121,121,121,121,63,121,121,121,121,121,121,121,121,121
Medium,9322,9322,9321,9322,9318,3601,9318,9321,9309,9314,9298,9322,9322,9322,9322
,7,7,7,7,6,0,7,7,7,7,7,7,7,7,7


In [15]:
# df_2: Middlesex (NJ) weather sheet of the same workbook
df_2 = pd.read_excel('D:\\03_data\\017_20160101_20201231_ProductionData.xlsx', sheet_name='MiddlesexWeather')

In [16]:
# df_3: Bethlehem (PA) weather sheet of the same workbook
df_3 = pd.read_excel('D:\\03_data\\017_20160101_20201231_ProductionData.xlsx', sheet_name='BethlehemWeather')

In [27]:
df_2.columns

Index(['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'temp',
       'feels_like', 'temp_min', 'temp_max', 'pressure', 'sea_level',
       'grnd_level', 'humidity', 'wind_speed', 'wind_deg', 'rain_1h',
       'rain_3h', 'snow_1h', 'snow_3h', 'clouds_all', 'weather_id',
       'weather_main', 'weather_description', 'weather_icon'],
      dtype='object')

In [18]:
df_2.shape

(46646, 25)

In [19]:
df_3.shape

(45592, 25)

In [29]:
df_2.head(5)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,feels_like,temp_min,temp_max,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1451606400,2016-01-01 00:00:00 +0000 UTC,-18000,Middlesex,40.572603,-74.492654,43.93,37.06,42.8,46.3,...,280,,,,,75,803,Clouds,broken clouds,04n
1,1451610000,2016-01-01 01:00:00 +0000 UTC,-18000,Middlesex,40.572603,-74.492654,43.65,38.1,42.8,46.02,...,0,,,,,90,804,Clouds,overcast clouds,04n
2,1451613600,2016-01-01 02:00:00 +0000 UTC,-18000,Middlesex,40.572603,-74.492654,43.27,36.12,42.8,45.11,...,300,,,,,90,804,Clouds,overcast clouds,04n
3,1451617200,2016-01-01 03:00:00 +0000 UTC,-18000,Middlesex,40.572603,-74.492654,42.62,36.14,42.08,44.2,...,0,,,,,90,804,Clouds,overcast clouds,04n
4,1451620800,2016-01-01 04:00:00 +0000 UTC,-18000,Middlesex,40.572603,-74.492654,42.08,35.42,41.0,43.56,...,290,,,,,90,804,Clouds,overcast clouds,04n


# <a id="8.Main Function">8.Main Function</a>
<a href="#2.Table of Contents">Click this Link back to Top</a>

In [53]:
# from class_31_hyperparameters import HyperParamters
# from class_32_import_data import ImportData


def main():
    """
    Call the pipeline steps one by one.

    Only the import step of "3.Preprocess" is wired in so far: it builds an ImportData
    instance and loads the three sheets of the production workbook.

    Returns:
    ----------
    df_product:DataFrame
        production data returned by ImportData.import_data()
    df_nj_weather:DataFrame
        Middlesex, NJ weather data returned by ImportData.import_data()
    df_pa_weather:DataFrame
        Bethlehem, PA weather data returned by ImportData.import_data()
    """

    # *******************3.Preprocess**************************************
    # ***********************import******************************
    # These calls must stay active: without them the return statement below refers to
    # names that main() never assigns, which raises NameError on a fresh kernel
    # (or silently returns stale notebook globals if they happen to exist).
    class_import = ImportData()
    df_product, df_nj_weather, df_pa_weather = class_import.import_data()

    return df_product, df_nj_weather, df_pa_weather

if __name__ == "__main__":
    # Run the pipeline and keep the three resulting frames as notebook-level variables
    # so the cells below can inspect them.
    df_product, df_nj_weather, df_pa_weather = main()
    print("OVER")

************************************************** Start import_data() **************************************************
OVER


# <a id="9.Test Code">9.Test Code</a>
<a href="#2.Table of Contents">Click this Link back to Top</a>

1. We might need to split the data by site (NJ and PA), but we can also try merging the two locations into one, because they are not far apart
2. Clean the data, transform the date format, join by date, coefficient analysis
3. Check kaggle format
4. Check journals 

In [56]:
df_product.head(5)

Unnamed: 0,StartDate,BatchNumber,CustItem,Dryer,Flow,Hygroscopicity,Bulk Density,Moisture Target,ScheduledDryQty,ActualDryQty,YieldPercentage,Rate,DryingTime_Hrs,CleanTime_Hrs,DownTime_Hrs,TotalRunTime_Hrs
0,2016-01-02 08:54:58.000,NJSD313825,10013-0000,Dryer 04,Good,Medium,Target = 0.30 g/mL,2 to 4% Max,333.7357,352.2,105.5,146.75,2.4,5.1,0.0,7.5
1,2016-01-02 11:06:57.863,NJSD312807,10558-0000,Dryer 03,Good,Medium,Record,10% Max.,4270.412,3935.3999,92.2,468.499988,8.4,0.2,4.2,12.8
2,2016-01-02 21:40:51.470,NJSD313826,10447-0000,Dryer 04,Good,Medium,,3% Max,1353.0969,1195.9,88.4,123.28866,9.7,5.7,0.9,16.2
3,2016-01-02 22:45:11.673,NJSD312677,24164-0000,Dryer 02,Good,Medium,,5% Max,19549.0975,18185.1,93.0,335.51845,54.2,11.1,7.6,72.8
4,2016-01-03 00:10:32.910,NJSD314508,10558-0000,Dryer 03,Good,Medium,Record,10% Max.,4282.172,3373.1999,78.8,411.365841,8.2,0.1,3.5,11.8


In [57]:
df_nj_weather.head(3)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,feels_like,temp_min,temp_max,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1451606400,2016-01-01 00:00:00 +0000 UTC,-18000,Middlesex,40.572603,-74.492654,43.93,37.06,42.8,46.3,...,280,,,,,75,803,Clouds,broken clouds,04n
1,1451610000,2016-01-01 01:00:00 +0000 UTC,-18000,Middlesex,40.572603,-74.492654,43.65,38.1,42.8,46.02,...,0,,,,,90,804,Clouds,overcast clouds,04n
2,1451613600,2016-01-01 02:00:00 +0000 UTC,-18000,Middlesex,40.572603,-74.492654,43.27,36.12,42.8,45.11,...,300,,,,,90,804,Clouds,overcast clouds,04n


In [58]:
df_pa_weather.head(2)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,feels_like,temp_min,temp_max,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1451606400,2016-01-01 00:00:00 +0000 UTC,-18000,Bethlehem,40.625932,-75.370458,41.43,32.81,39.92,43.08,...,290,,,,,90,804,Clouds,overcast clouds,04n
1,1451610000,2016-01-01 01:00:00 +0000 UTC,-18000,Bethlehem,40.625932,-75.370458,40.84,32.77,39.2,42.28,...,280,,,,,90,804,Clouds,overcast clouds,04n
