In [14]:
import pandas as pd
from tqdm import tqdm
import os, sys
import warnings
import seaborn as sns 
%matplotlib inline

# Suppress every warning raised from this point on.
warnings.filterwarnings('ignore')

# Directory scanned for input files. The trailing slash matters: it is
# prefixed directly to each file name to form the path.
dataFolder = 'data/'
# Path of every entry in dataFolder whose name ends with '.csv', in whatever
# order os.listdir returns them.
dataList = [
    f"{dataFolder}{entry}"
    for entry in os.listdir(dataFolder)
    if entry.endswith('.csv')
]


def read_csv(file: str) -> pd.DataFrame:
    """Load the CSV at ``file`` into a DataFrame using pandas' default options."""
    frame = pd.read_csv(file)
    return frame

def pre_processing(df:pd.DataFrame, drop_col:list=None, one_hot_col:list=None, date_col:list=None) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the frame passed in is not modified.

    Steps, in order:
      1. drop every column named in ``drop_col``;
      2. replace missing values with 0;
      3. replace each column in ``one_hot_col`` with its integer category
         codes (label encoding -- despite the parameter name, no one-hot /
         dummy columns are created);
      4. parse each column in ``date_col`` with ``pd.to_datetime``.

    Date columns are parsed from their values as they were *before* the
    0-fill, so a missing date becomes ``NaT``. Parsing the filled column
    instead would hand ``pd.to_datetime`` the integer 0 in place of each
    missing date.

    Args:
        df: input frame.
        drop_col: column labels to remove, or None to keep all columns.
        one_hot_col: column labels to convert to category codes, or None.
        date_col: column labels to convert to datetimes, or None.

    Returns:
        A new DataFrame with the steps above applied.

    Raises:
        KeyError: if a listed column is not present in the frame (for
            example because it was also listed in ``drop_col``).
    """
    # Drop first so the remaining steps only see columns that are kept.
    if drop_col is not None:
        df = df.drop(columns=list(drop_col))

    # `df` keeps the un-filled values; `filled` is the frame being built.
    filled = df.fillna(0)

    # Label-encode: missing entries were filled with 0 above, so in these
    # columns the value 0 is treated as a category of its own.
    if one_hot_col is not None:
        for col in one_hot_col:
            filled[col] = filled[col].astype('category').cat.codes

    # Parse dates from the un-filled column so missing entries become NaT.
    if date_col is not None:
        for date in date_col:
            filled[date] = pd.to_datetime(df[date])

    return filled

def get_df_name(df):
    """Return the name of a module-level variable bound to the object ``df``.

    The lookup is by identity (``is``) over ``globals()``, in the order the
    names were defined. Names that do not start with an underscore are
    preferred, so that an interactive shell's underscore-prefixed aliases for
    previous outputs (IPython's ``_`` / ``_N``) are not reported in place of
    the variable the user actually assigned. If only underscore-prefixed
    names match, the first of those is returned.

    Raises:
        IndexError: if no global variable refers to ``df``.
    """
    namespace = globals()  # looked up once instead of once per name
    matches = [name for name, value in namespace.items() if value is df]
    if not matches:
        raise IndexError("get_df_name: no global variable refers to the given object")

    public = [name for name in matches if not name.startswith('_')]
    return public[0] if public else matches[0]

### Pre-processing

* read file 
* EDA with sweetviz or piperider
* check for missing values and outliers


#### 讀資料

In [2]:
# Load the first four CSVs by their position in dataList.
# NOTE(review): which file sits at which position depends on the order
# dataList was built in -- confirm it matches these variable names.
reportDf, birthDf, submissionDf, breedDf = (read_csv(dataList[pos]) for pos in range(4))

# The data contains values that are the same except for letter case, so
# normalise typeOfSituation to upper case right after loading.
specDf = read_csv(dataList[4]).assign(
    typeOfSituation=lambda frame: frame['typeOfSituation'].str.upper()
)


#### 分析資料中的缺失值，再決定要不要刪除 / 補值 

In [22]:
for df in (reportDf, birthDf, submissionDf, breedDf, specDf):
    # One line for the variable name, one for the shape, then the percentage
    # of missing values in each column followed by a blank line.
    print(
        get_df_name(df),
        df.shape,
        f"{df.isnull().sum() / len(df) * 100}\n",
        sep='\n',
    )

reportDf
(37517, 21)
id                       0.000000
year                     0.000000
month                    0.000000
dairyFarm                0.000000
numOfCow                 0.000000
numOfCowFatherSemen      2.334941
numOfCowMotherSemen      9.553003
birthday                 0.000000
parity                   0.000000
dayOfLactation           0.002665
milkVolume              11.362849
lastDateOfBirth          0.002665
dayOfSampling            0.000000
ageOfTheMoon             0.000000
dateOfTest               0.000000
lastDateOfBreeding       4.094144
semenOfLastBreeding      4.094144
countOfBreeding          0.000000
lastDateOfChildbirth    46.432284
firstDateOfBreeding      3.814271
semenOfFirstBreeding     3.814271
dtype: float64

birthDf
(3761, 13)
numOfCow                    0.000000
dateOfChildbirth            0.000000
dateOfDryMilk              43.525658
firstNumOfCalf             77.293273
secondNumOfCalf            99.787291
numOfmonCow                81.122042
dateOfLo

- 我們從上面的分析得知，有些資料是有缺失值的，還有一些欄位是沒有意義的，所以我們要先處理這些資料


- 刪除缺失值的欄位
    * reportDf -> lastDateOfChildbirth, id, dateOfTest
    * birthDf -> 'secondNumOfCalf', 'countOfwomb', 'dateOfLogin', 'firstNumOfCalf', 'sizeOfCalf', 'genderOfCalf', 'dateOfDryMilk'
    * breedDf -> 'dateOfLogin', 'pregnancyTest', 'typeOfSemen'
    * specDf -> 'dateOfLogin', 'comment'

#### Encoding map (integer category codes for the `one_hot_col` columns; not a true one-hot encoding)

```

preReportDf

- dairyFarm {A: 0, B: 1, C: 2}

---

preBirthDf

- genderOfCalf {公：1，母：2, 未知：0}
- dairyFarm {A: 0 , B: 1, C: 2}
- sizeOfCalf {S：3, M：2, L：1, 未知： 0}

---

preSpecDf

- typeOfSituation { C:2, D:3, N:4 ,1:0, 2:1}
- dairyFarm {A: 0 , B: 1, C: 2}

---

preBreedDf

- dairyFarm {A: 0 , B: 1, C: 2}

```


In [3]:
# reportDf: drop three columns, encode the farm and semen columns, parse dates.
# NOTE(review): numOfCowMotherSemen is not in one_hot_col although
# numOfCowFatherSemen is -- confirm this is intentional.
preReportDf = pre_processing(
    df=reportDf,
    drop_col=['id', 'dateOfTest', 'lastDateOfChildbirth'],
    one_hot_col=['dairyFarm', 'numOfCowFatherSemen', 'semenOfLastBreeding', 'semenOfFirstBreeding'],
    date_col=['birthday', 'lastDateOfBirth', 'dayOfSampling', 'lastDateOfBreeding', 'firstDateOfBreeding'],
)

# birthDf: drop the listed columns, encode the farm, parse the birth date.
preBirthDf = pre_processing(
    df=birthDf,
    drop_col=['secondNumOfCalf', 'countOfwomb', 'dateOfLogin', 'firstNumOfCalf', 'sizeOfCalf', 'genderOfCalf', 'dateOfDryMilk'],
    one_hot_col=['dairyFarm'],
    date_col=['dateOfChildbirth'],
)

# specDf: no date columns are parsed for this frame.
preSpecDf = pre_processing(
    df=specDf,
    drop_col=['dateOfLogin', 'comment'],
    one_hot_col=['dairyFarm', 'typeOfSituation', 'codeOfSituation'],
)

# breedDf: drop the listed columns, encode farm and semen, parse the breeding date.
preBreedDf = pre_processing(
    df=breedDf,
    drop_col=['dateOfLogin', 'pregnancyTest', 'typeOfSemen'],
    one_hot_col=['dairyFarm', 'semenOfBreeding'],
    date_col=['dateOfBreeding'],
)

# Report the shape of each pre-processed frame.
for df in tqdm([preReportDf, preBirthDf, preSpecDf, preBreedDf]):
    print(f"The size of {get_df_name(df)} : {df.shape}")

#### print describe, corr, heatmap

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of preReportDf.
preReportDf.describe()

Unnamed: 0,year,month,dairyFarm,numOfCow,numOfCowFatherSemen,parity,dayOfLactation,milkVolume,ageOfTheMoon,semenOfLastBreeding,countOfBreeding,semenOfFirstBreeding
count,37517.0,37517.0,37517.0,37517.0,37517.0,37517.0,37517.0,37517.0,37517.0,37517.0,37517.0,37517.0
mean,2016.126529,6.202068,0.914279,23452950.0,174.850068,1.959618,204.892049,21.265523,47.462137,88.645014,1.961271,93.009702
std,1.905908,3.397154,0.635691,39298610.0,110.493598,1.222761,149.728865,11.212849,19.257433,61.036404,2.121853,63.052802
min,2013.0,1.0,0.0,52612.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2015.0,3.0,1.0,1181884.0,69.0,1.0,88.0,15.0,33.0,33.0,0.0,37.0
50%,2016.0,6.0,1.0,3126107.0,178.0,2.0,180.0,22.0,43.0,86.0,1.0,90.0
75%,2018.0,9.0,1.0,10837490.0,280.0,2.0,286.0,29.0,56.0,144.0,3.0,152.0
max,2019.0,12.0,2.0,99183210.0,343.0,11.0,1789.0,61.0,160.0,201.0,16.0,216.0
