In [1]:
# pip install tensorflow pyod umap-learn

In [67]:
# basic random seed
import os 
import random
import numpy as np 

EPOCHS = 1000
LR = 1e-1
BATCH = 32
device = 'cuda'
DEFAULT_RANDOM_SEED = 42

def seedBasic(seed=DEFAULT_RANDOM_SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
# tensorflow random seed 
import tensorflow as tf 
def seedTF(seed=DEFAULT_RANDOM_SEED):
    tf.random.set_seed(seed)
    
# torch random seed
import torch
def seedTorch(seed=DEFAULT_RANDOM_SEED):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
      
# basic + tensorflow + torch 
def seedEverything(seed=DEFAULT_RANDOM_SEED):
    seedBasic(seed)
    seedTF(seed)
    seedTorch(seed)
seedEverything(DEFAULT_RANDOM_SEED)

In [68]:
torch.cuda.is_available()
torch.cuda.empty_cache()

In [69]:
torch.cuda.is_available()

True

In [70]:
torch.cuda.device_count()

1

In [71]:
torch.cuda.get_device_name(0)

'NVIDIA A100 80GB PCIe MIG 3g.40gb'

In [72]:
import pandas as pd

In [73]:
train = pd.read_csv('./train_data.csv').drop(['out_pressure'],axis=1)
test = pd.read_csv('./test_data.csv').drop(['out_pressure'],axis=1)

In [74]:
train

Unnamed: 0,air_inflow,air_end_temp,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,1.59,41.00,20.53,1680.0,58.67,2.93,0
1,2.97,59.28,38.40,3142.0,74.91,3.75,0
2,1.91,45.29,24.73,2023.0,62.48,3.12,0
3,2.37,51.33,30.63,2506.0,67.84,3.39,0
4,1.90,45.21,24.65,2017.0,62.41,3.12,0
...,...,...,...,...,...,...,...
2458,2.28,50.20,29.53,2416.0,66.84,3.34,7
2459,2.04,46.94,26.34,2155.0,63.94,3.20,7
2460,1.19,35.74,15.39,1259.0,53.99,2.70,7
2461,1.21,36.00,15.64,1280.0,54.22,2.71,7


In [75]:
train.loc[train['type']==0].describe()

Unnamed: 0,air_inflow,air_end_temp,motor_current,motor_rpm,motor_temp,motor_vibe,type
count,432.0,432.0,432.0,432.0,432.0,432.0,432.0
mean,2.288264,50.286481,29.612616,2422.831019,66.920417,3.345926,0.0
std,0.651091,8.615564,8.424383,689.262891,7.658456,0.382908,0.0
min,1.14,35.09,14.75,1207.0,53.41,2.67,0.0
25%,1.735,42.96,22.4525,1837.0,60.415,3.0175,0.0
50%,2.275,50.135,29.46,2410.5,66.78,3.34,0.0
75%,2.86,57.895,37.055,3031.5,73.685,3.6825,0.0
max,3.37,64.54,43.55,3563.0,79.59,3.98,0.0


In [76]:
test

Unnamed: 0,air_inflow,air_end_temp,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,2.51,53.28,32.54,2662.0,69.58,3.48,0
1,2.66,55.24,34.45,2819.0,71.32,3.57,0
2,1.72,42.74,22.23,1819.0,60.21,3.01,0
3,2.20,49.15,28.50,2332.0,65.91,3.30,0
4,2.06,47.28,26.67,2182.0,64.24,3.21,0
...,...,...,...,...,...,...,...
7384,2.12,48.08,27.45,2246.0,64.96,3.25,7
7385,1.48,39.63,19.19,1570.0,57.44,2.87,7
7386,1.56,40.61,20.15,1649.0,58.32,2.92,7
7387,1.59,40.99,20.52,1679.0,58.66,2.93,7


In [77]:
test.describe()

Unnamed: 0,air_inflow,air_end_temp,motor_current,motor_rpm,motor_temp,motor_vibe,type
count,7389.0,7389.0,7389.0,7389.0,7389.0,7389.0,7389.0
mean,2.103129,49.042735,27.417935,2373.820003,65.771623,3.345523,2.957369
std,1.163998,8.686208,13.234089,687.737182,7.741473,0.644199,2.229811
min,0.34,32.08,5.37,1200.0,50.42,1.97,0.0
25%,1.28,41.51,17.2,1774.0,59.11,2.89,1.0
50%,1.94,48.76,26.17,2355.0,65.52,3.26,3.0
75%,2.77,56.58,35.86,2972.0,72.44,3.73,5.0
max,6.24,65.54,74.0,3564.0,82.39,6.26,7.0


In [78]:
train_cols = train.columns

In [79]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import umap

def preprocess(df):
    df['sum_temp'] = df['air_end_temp'] + df['motor_temp']
    df['sum'] = df['air_inflow'] + df['motor_current']
    df['inflow/air_end_temp'] = df['air_inflow'] / df['air_end_temp']
    
    df.loc[df['type'].isin([0, 4, 5, 6, 7]), 'hp'] = 30
    df.loc[df['type']==1, 'hp'] = 20
    df.loc[df['type']==2, 'hp'] = 10
    df.loc[df['type']==3, 'hp'] = 50
    
    return df

train = preprocess(train)
test = preprocess(test)

df = pd.concat([train, test])

scaler = StandardScaler()
df[df.columns] = scaler.fit_transform(df)

pca = PCA(n_components=3, random_state=DEFAULT_RANDOM_SEED)
pcaComponents = pca.fit_transform(df.loc[:, train_cols])
df['pca_1'] = pcaComponents[::,0]
df['pca_2'] = pcaComponents[::,1]
df['pca_3'] = pcaComponents[::,2]

tsne = TSNE(n_components=3, random_state=DEFAULT_RANDOM_SEED)
tsneComponents = tsne.fit_transform(df.loc[:, train_cols])
df['tsne_1'] = tsneComponents[::,0]
df['tsne_2'] = tsneComponents[::,1]
df['tsne_3'] = tsneComponents[::,2]

embedding = umap.UMAP().fit_transform(df.loc[:, train_cols])
df['umap_1'] = embedding[::,0]
df['umap_2'] = embedding[::,1]

train = df[:len(train)]
test = df[len(train):]

In [80]:
train

Unnamed: 0,air_inflow,air_end_temp,motor_current,motor_rpm,motor_temp,motor_vibe,type,sum_temp,sum,inflow/air_end_temp,hp,pca_1,pca_2,pca_3,tsne_1,tsne_2,tsne_3,umap_1,umap_2
0,-0.442693,-0.923779,-0.518231,-1.008446,-0.915632,-0.457660,-1.326376,-0.920743,-0.512425,-0.151784,0.181108,-1.845911,-0.456029,1.152300,-8.454716,13.375363,17.750530,-7.183955,-3.613902
1,0.733147,1.178559,0.822721,1.117478,1.178598,0.390653,-1.326376,1.179609,0.815962,0.435548,0.181108,2.191836,-1.306302,0.713242,25.730164,0.431102,-2.653022,2.084414,-11.267007
2,-0.170034,-0.430397,-0.203066,-0.509683,-0.424313,-0.261100,-1.326376,-0.427903,-0.200513,0.024206,0.181108,-0.900354,-0.657204,1.047400,-2.910603,10.247701,-1.249830,-5.035185,12.542491
3,0.221912,0.264249,0.239666,0.192658,0.266886,0.018223,-1.326376,0.265724,0.238372,0.231687,0.181108,0.434327,-0.937081,0.902758,16.879719,-14.562894,-0.465134,9.869688,-0.514455
4,-0.178555,-0.439598,-0.209069,-0.518407,-0.433340,-0.261100,-1.326376,-0.437029,-0.206724,0.016602,0.181108,-0.917983,-0.653918,1.050181,-3.057412,10.580450,-1.116605,-5.029103,12.546736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2458,0.145227,0.134290,0.157123,0.061787,0.137931,-0.033504,1.813115,0.136126,0.156254,0.192594,0.181108,0.363894,1.193739,-1.327516,-2.535730,-17.927219,-10.821601,11.244542,-11.743981
2459,-0.059267,-0.240634,-0.082253,-0.317739,-0.236039,-0.178337,1.813115,-0.238676,-0.080441,0.090981,0.181108,-0.352600,1.349210,-1.244651,-5.021878,-16.403145,-0.390617,-8.124802,-1.329371
2460,-0.783516,-1.528719,-0.903933,-1.620631,-1.519141,-0.695601,1.813115,-1.525537,-0.894725,-0.436324,0.181108,-2.827434,1.869618,-0.975233,-23.529856,-2.250636,11.145371,10.170506,-9.943210
2461,-0.766475,-1.498817,-0.885174,-1.590094,-1.489482,-0.685256,1.813115,-1.495724,-0.876093,-0.419977,0.181108,-2.770471,1.856986,-0.982365,-24.055138,-2.716349,10.554518,10.219328,-10.037124


In [81]:
train.loc[:, 'LABEL'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [82]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(train, test_size=0.2, random_state=DEFAULT_RANDOM_SEED)

In [83]:
inputs = len(train_df.columns)-1
inputs

19

In [84]:
ans = pd.read_csv('./answer_sample.csv')
ans

Unnamed: 0,type,label
0,0,-1
1,0,-1
2,0,-1
3,0,-1
4,0,-1
...,...,...
7384,7,-1
7385,7,-1
7386,7,-1
7387,7,-1


In [157]:
np.sum((test_pred_proba, test_pred_proba2, test_pred_proba3, test_pred_proba4, test_pred_proba5), axis=0)/5

array([-3.07672628, -2.85893911, -2.69907274, ..., -2.68284396,
       -2.70695898, -2.82144935])

In [158]:
from pyod.models import lunar
from pyod.models import gmm
from pyod.models import loda
from pyod.models import alad
from pyod.models import rod


def make_model(train, test):
    types = train['type'].unique()
    test['ans'] = 0
    for i,t in enumerate(types):
        train_data = train.loc[train['type']==t, train.columns[:-1].drop('type')].reset_index(drop=True)
        test_data = test.loc[test['type']==t, test.columns[:-1].drop('type')].reset_index(drop=True)
        
        clf = lunar.LUNAR(proportion=1e-1)
        clf.fit(train_data)
        test_pred_proba = clf.decision_function(test_data)

        clf = gmm.GMM()
        clf.fit(train_data)
        test_pred_proba2 = clf.decision_function(test_data)

        clf = loda.LODA()
        clf.fit(train_data)
        test_pred_proba3 = clf.decision_function(test_data)

        clf = alad.ALAD()
        clf.fit(train_data)
        test_pred_proba4 = clf.decision_function(test_data)

        clf = rod.ROD()
        clf.fit(train_data)
        test_pred_proba5 = clf.decision_function(test_data)

        test.loc[test['type']==t, 'ans'] = np.sum((test_pred_proba, test_pred_proba2, test_pred_proba3, test_pred_proba4, test_pred_proba5), axis=0)/5
    return test

In [159]:
test = make_model(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ans'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [163]:
test.ans

0      -9.247516
1      -8.728594
2      -9.162771
3      -9.070850
4      -8.504998
          ...   
7384   -9.398176
7385   -9.239890
7386   -8.963138
7387   -9.075601
7388   -8.633162
Name: ans, Length: 7389, dtype: float64

In [164]:
ensem = np.where(test.ans>1, 1, 0)
ensem.sum()

337

In [165]:
ans['label'] = ensem
ans.to_csv('./kkr_ensem.csv', index=False)
ans['label'].sum()

337

In [178]:
ensem = np.array(test.ans)
threshold = np.percentile(ensem, 95)
preds = [0 if i < threshold else 1 for i in ensem]
sum(preds)

370

In [179]:
ans['label'] = preds
ans

Unnamed: 0,type,label
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
7384,7,0
7385,7,0
7386,7,0
7387,7,0


In [180]:
ans.to_csv('./kkr_ensem_q.csv', index=False)
ans['label'].sum()

370

In [2]:
%rm 957-Copy1.ipynb