In [None]:
!pip install pycaret
!pip install konlpy

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
np.random.seed(0)

from pycaret.regression import *
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss

from tqdm.notebook import tqdm
from konlpy.tag import Kkma

import torch
import os, re

In [None]:
# Read the raw training and test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Replace the original headers with short names. Both tables share the same
# ten leading columns; train additionally carries the two columns that are
# later used as regression targets (중식계, 석식계).
feature_columns = ['일자', '요일', '정원', '휴가자', '출장자', '야근자',
                   '재택근무자', '조식', '중식', '석식']
train.columns = feature_columns + ['중식계', '석식계']
test.columns = list(feature_columns)

In [None]:
# Cast the count columns to plain integers.
int_columns = ['재택근무자', '중식계', '석식계']
train[int_columns] = train[int_columns].astype('int')
test['재택근무자'] = test['재택근무자'].astype('int')

# Parse the date column so that the `.dt` accessor can be used on it later.
for frame in (train, test):
    frame['일자'] = pd.to_datetime(frame['일자'])

In [None]:
# Display the largest value in the '석식계' column of the training table.
train['석식계'].max()

In [None]:
# Derive year / month / day / week-of-year columns from the parsed date column
# of both tables.
# `Series.dt.week` is deprecated and was removed in pandas 2.0, so the ISO week
# number is read from `dt.isocalendar()` instead and cast back to int64 (the
# dtype the old accessor produced when no dates are missing).
for frame in (train, test):
    dates = frame['일자'].dt
    frame['년'] = dates.year
    frame['월'] = dates.month
    frame['일'] = dates.day
    frame['주'] = dates.isocalendar().week.astype('int64')

### 자연어처리

In [None]:
!pip install scikit-learn

In [None]:
# Stop-word list for menu text.
# NOTE(review): this name is reassigned by a later cell and is only referenced
# from commented-out code in the visible notebook.
stopwords = (
    '쌀밥 찰현미밥 현미밥 흑미밥 수수밥 검정콩밥 차조밥 '
    '기장밥 귀리밥 강낭콩밥 찰보리밥 배추김치 겉절이김치 깍두기 '
    '석박지 봄동겉절이 양상추샐러드 잡곡밥 포기김치 무침 쌀'
).split(' ')

In [None]:
def get_food_embedding(x):
    """Reduce a raw menu string to a comma-joined list of its distinct dishes.

    The menu is split on single spaces. A token is discarded when it contains
    one of the staple rice/kimchi names listed below, or when it contains all
    of '(', ':' and ')'. Remaining tokens are split on '/' so that combined
    entries become separate dishes.

    Compared with the previous implementation:
      * empty fragments are filtered out instead of calling `list.remove('')`,
        which raised ValueError whenever the menu had no empty token;
      * duplicates are removed while keeping first-appearance order, so the
        result no longer depends on set iteration order;
      * a non-string value (e.g. NaN from a missing cell) yields ''.

    Args:
        x: Raw menu text with dishes separated by spaces.

    Returns:
        The kept dish names joined by ','; '' when nothing is kept.
    """
    if not isinstance(x, str):
        return ''

    staples = ('쌀밥', '찰현미밥', '현미밥', '흑미밥', '수수밥', '검정콩밥',
               '차조밥', '기장밥', '귀리밥', '강낭콩밥', '찰보리밥', '배추김치',
               '겉절이김치', '깍두기', '잡곡밥', '포기김치')

    dishes = []
    for token in x.split(' '):
        if any(staple in token for staple in staples):
            continue
        if '(' in token and ':' in token and ')' in token:
            continue
        # str.split('/') returns [token] when there is no slash, so this one
        # call covers both the combined and the plain case.
        dishes.extend(token.split('/'))

    # dict.fromkeys de-duplicates while preserving insertion order.
    return ','.join(dish for dish in dict.fromkeys(dishes) if dish)

# Store the cleaned lunch and dinner menu strings on the training table.
split_columns = {'중식': '중식메뉴_split', '석식': '석식메뉴_split'}
for raw_column, split_column in split_columns.items():
    train[split_column] = train[raw_column].apply(get_food_embedding)

In [None]:
# Store the cleaned lunch and dinner menu strings on the test table.
split_columns = {'중식': '중식메뉴_split', '석식': '석식메뉴_split'}
for raw_column, split_column in split_columns.items():
    test[split_column] = test[raw_column].apply(get_food_embedding)

In [None]:
# Display the cleaned lunch menu string stored under index label 0.
train['중식메뉴_split'][0]

In [None]:
# Build the lunch text corpus and its binary "preferred" label for every
# training row.
# The previous loop iterated over a hard-coded range(1205) and swallowed every
# exception, so a differently sized or re-indexed table silently produced
# lists that no longer lined up with `train`. The vectorised form below always
# yields exactly one entry per row, in row order.
LUNCH_PREF_THRESHOLD = 880  # label is 1 when 중식계 >= threshold, else 0

lunch_count = (train['중식계'] >= LUNCH_PREF_THRESHOLD).astype(int).tolist()
# Commas become spaces so each menu reads as one whitespace-separated sentence.
lunch_lst = train['중식메뉴_split'].str.replace(',', ' ', regex=False).tolist()

assert len(lunch_lst) == len(lunch_count) == len(train)
print('done')

In [None]:
# Build the dinner text corpus and its binary "preferred" label for every
# training row.
# The previous loop iterated over a hard-coded range(1205) and swallowed every
# exception, so a differently sized or re-indexed table silently produced
# lists that no longer lined up with `train`. The vectorised form below always
# yields exactly one entry per row, in row order.
DINNER_PREF_THRESHOLD = 476  # label is 1 when 석식계 > threshold, else 0

dinner_count = (train['석식계'] > DINNER_PREF_THRESHOLD).astype(int).tolist()
# Commas become spaces so each menu reads as one whitespace-separated sentence.
dinner_lst = train['석식메뉴_split'].str.replace(',', ' ', regex=False).tolist()

assert len(dinner_lst) == len(dinner_count) == len(train)
print('done')

In [None]:

# if '밥' in lunch_lst[0]:
#   lunch_lst[0].pop('밥')

In [None]:
# Build the lunch text corpus for the test rows, with placeholder labels of 0.
# Replaces a loop over a hard-coded range(50) wrapped in a bare `except`, which
# could silently skip rows; this version covers every row of `test`.
lunch_lst_test = test['중식메뉴_split'].str.replace(',', ' ', regex=False).tolist()
lunch_count_test = [0] * len(lunch_lst_test)
print('done')

In [None]:
# Build the dinner text corpus for the test rows, with placeholder labels of 0.
# Replaces a loop over a hard-coded range(50) wrapped in a bare `except`, which
# could silently skip rows; this version covers every row of `test`.
dinner_lst_test = test['석식메뉴_split'].str.replace(',', ' ', regex=False).tolist()
dinner_count_test = [0] * len(dinner_lst_test)
print('done')

In [None]:
# Pair each training menu text with its preference label, one frame per meal.
lunch_columns = {'중식': lunch_lst, '선호': lunch_count}
dinner_columns = {'석식': dinner_lst, '선호': dinner_count}

lunch_df = pd.DataFrame(lunch_columns)
dinner_df = pd.DataFrame(dinner_columns)

In [None]:
# Same layout for the test menus; '선호' starts out as the placeholder labels.
lunch_test_columns = {'중식': lunch_lst_test, '선호': lunch_count_test}
dinner_test_columns = {'석식': dinner_lst_test, '선호': dinner_count_test}

lunch_df_t = pd.DataFrame(lunch_test_columns)
dinner_df_t = pd.DataFrame(dinner_test_columns)

토크나이저 + 텐서플로우

In [None]:
# Extended stop-word list: whole menu-item names first, followed by short
# fragments and punctuation.
# NOTE(review): only referenced from commented-out code in the visible cells.
menu_item_stopwords = [
    '쌀밥', '찰현미밥', '현미밥', '흑미밥', '수수밥', '검정콩밥', '차조밥',
    '기장밥', '귀리밥', '강낭콩밥', '찰보리밥', '배추김치', '겉절이김치', '깍두기',
    '석박지', '봄동겉절이', '양상추샐러드', '잡곡밥', '포기김치', '무침',
]
fragment_stopwords = [
    'ㄴ', 'ㄹ', 'd', '차', '이', '장', '아', '되',
    '맵', '타', '리', '채', '소', '가', ')', '어', '(', '오', '사', '쯔',
    '순', '실', '커', '프리', '란', '깻', '쫄',
]
stopwords = menu_item_stopwords + fragment_stopwords

In [None]:
# from konlpy.tag import Kkma

# kkma = Kkma()
# X_train = []
# for sentence in tqdm(lunch_df['중식']):
#   X_train.append([word for word in kkma.morphs(sentence) if not word in stopwords])

In [None]:
# X_test = []
# for sentence in lunch_df_t['중식']:
#   X_test.append([word for word in kkma.morphs(sentence) if not word in stopwords])

In [None]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(X_train)
# print(tokenizer.word_index)

In [None]:
# tokenizer = Tokenizer(951, oov_token='OOV')
# tokenizer.fit_on_texts(X_train)
# X_train = tokenizer.texts_to_sequences(X_train)
# X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# y_train = np.array(lunch_df['선호'])
# y_test = np.array(lunch_df_t['선호'])

In [None]:
# drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

# X_train = np.delete(X_train, drop_train, axis = 0)
# y_train = np.delete(y_train, drop_train, axis = 0)

# print(len(X_train))
# print(len(y_train))

In [None]:
# print('메뉴 최대길이', max(len(l) for l in X_train))
# print('메뉴 평균길이', sum(map(len, X_train))/len(X_train))

In [None]:
# import matplotlib.pyplot as plt

# plt.hist([len(s)for s in X_train], bins=50)

In [None]:
# max_len = 20
# X_train = pad_sequences(X_train, maxlen=max_len)
# X_test = pad_sequences(X_test, maxlen=max_len)

In [None]:
# from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.callbacks import ModelCheckpoint

# checkpoint_path = 'my_checkpoint.ckpt'
# checkpoint = ModelCheckpoint(checkpoint_path, 
#                              save_weights_only=True, 
#                              save_best_only=True, 
#                              monitor='val_loss',
#                              verbose=1)

# model = Sequential()
# model.add(Embedding(951, 100))
# model.add(LSTM(128, return_sequences=True))
# model.add(LSTM(64))
# model.add(Dropout(0.5))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(1, activation = 'sigmoid'))

# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['acc'])
# model.summary()

In [None]:
# history = model.fit(X_train, y_train, epochs = 15, batch_size = 32, callbacks=[checkpoint], validation_split = 0.2)
# model.load_weights(checkpoint_path)

In [None]:
# model.evaluate(X_test, y_test)

In [None]:
# hist_dict = history.history
# loss = hist_dict['loss']
# val_loss = hist_dict['val_loss']
# acc = hist_dict['acc']
# val_acc = hist_dict['val_acc']
# plt.plot(loss, 'b--', label='training loss')
# plt.plot(val_loss, 'r:', label='validation loss')
# plt.legend()
# plt.grid()

# plt.figure()
# plt.plot(acc, 'b--', label = 'training acc')
# plt.plot(val_acc, 'r:', label='validation acc')
# plt.legend()
# plt.grid()

# plt.show()

In [None]:
# def sentiment_predict(new_sentence):
#   new_token = [word for word in kkma.morphs(new_sentence) if not word in stopwords]
#   new_sequences = tokenizer.texts_to_sequences([new_token])
#   new_pad = pad_sequences(new_sequences, maxlen=max_len)
#   score = float(model.predict(new_pad))

#   if score > 0.5:
#     print("{} 선호({:.2f}%)".format(new_sentence, score*100))
#   else:
#     print("{} 부정({:.2f}%)".format(new_sentence, (1-score)*100))

In [None]:
# sentiment_predict('요구르트 계란찜 쇠불고기 오징어찌개 잡곡밥 청포묵무침 포기김치 쌀밥')

그리드서치 회귀

In [None]:
from konlpy.tag import Kkma

In [None]:
# Instantiate the KoNLPy Kkma analyzer once; `tok` below reuses this object.
kkma = Kkma()

In [None]:
def tok(text):
  """Tokenize `text` by returning whatever `kkma.nouns` yields for it."""
  return kkma.nouns(text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

# TF-IDF over unigrams and bigrams of the noun tokens produced by `tok`;
# min_df=3 and max_df=0.9 bound the document frequency of kept terms.
lunch_tfidf_options = dict(tokenizer=tok, ngram_range=(1, 2), min_df=3, max_df=0.9)
vect = TfidfVectorizer(**lunch_tfidf_options)

# Fit on the training lunch menus, then encode that same corpus.
lunch_corpus = lunch_df['중식']
vect.fit(lunch_corpus)
vect_matrix_train = vect.transform(lunch_corpus)

In [None]:
# Separate TF-IDF vectorizer for dinner menus, configured like the lunch one.
dinner_tfidf_options = dict(tokenizer=tok, ngram_range=(1, 2), min_df=3, max_df=0.9)
vect_d = TfidfVectorizer(**dinner_tfidf_options)

# Fit on the training dinner menus, then encode that same corpus.
dinner_corpus = dinner_df['석식']
vect_d.fit(dinner_corpus)
vect_d_matrix_train = vect_d.transform(dinner_corpus)

In [None]:
# Hold out 20% of the training rows of each meal as a validation set, using
# the same fixed seed for both splits.
split_options = dict(test_size=0.2, random_state=42)

X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    vect_matrix_train, lunch_df['선호'], **split_options
)
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    vect_d_matrix_train, dinner_df['선호'], **split_options
)

In [None]:
# LightGBM classifiers for the lunch and dinner preference labels.
# Fix: in Python `^` is bitwise XOR and binds more loosely than `-`, so the old
# `2^8-1` evaluated to 2 ^ 7 == 5 and `2^7-1` to 2 ^ 6 == 4. Exponentiation is
# `**`, which gives the 255 / 127 leaves the expressions were written to mean.
# NOTE(review): these exceed 2**max_depth, so max_depth may be the effective
# limit on tree size; confirm against the LightGBM parameter documentation.
lg_clf = LGBMClassifier(n_estimators=10000, n_jobs=4, max_depth=6,
                        learning_rate=0.0005, num_leaves=2**8 - 1)
lg_clf_d = LGBMClassifier(n_estimators=10000, n_jobs=4, max_depth=8,
                          learning_rate=0.0003, num_leaves=2**7 - 1)

In [None]:
# Validation sets in the list-of-(X, y) form passed to `fit(eval_set=...)`.
lunch_validation = (X_test_l, y_test_l)
dinner_validation = (X_test_d, y_test_d)

evals_l = [lunch_validation]
evals_d = [dinner_validation]

In [None]:
# Train the lunch classifier, monitoring log-loss on the validation set and
# stopping after 1000 rounds without improvement.
# NOTE(review): `early_stopping_rounds` and `verbose` as fit() keyword arguments
# are not accepted by LightGBM 4.x; confirm the installed version.
lunch_fit_options = dict(
    eval_set=evals_l,
    eval_metric='logloss',
    early_stopping_rounds=1000,
    verbose=True,
)
lg_clf.fit(X_train_l, y_train_l, **lunch_fit_options)

In [None]:
# Train the dinner classifier, monitoring log-loss on the validation set and
# stopping after 500 rounds without improvement.
# NOTE(review): `early_stopping_rounds` and `verbose` as fit() keyword arguments
# are not accepted by LightGBM 4.x; confirm the installed version.
dinner_fit_options = dict(
    eval_set=evals_d,
    eval_metric='logloss',
    early_stopping_rounds=500,
    verbose=True,
)
lg_clf_d.fit(X_train_d, y_train_d, **dinner_fit_options)

In [None]:
# Encode the test lunch menus with the vectorizer fitted on the training
# menus, then predict their preference labels.
lunch_test_corpus = lunch_df_t['중식']
vect_matrix_test = vect.transform(lunch_test_corpus)
preds = lg_clf.predict(vect_matrix_test)

In [None]:
# Encode the test dinner menus with the vectorizer fitted on the training
# menus, then predict their preference labels.
dinner_test_corpus = dinner_df_t['석식']
vect_d_matrix_test = vect_d.transform(dinner_test_corpus)
preds_d = lg_clf_d.predict(vect_d_matrix_test)

In [None]:
# Overwrite the placeholder labels in the test text frames with the
# classifier predictions.
for text_frame, predicted in ((lunch_df_t, preds), (dinner_df_t, preds_d)):
    text_frame['선호'] = predicted

In [None]:
# Display the test lunch menus together with their predicted preference label.
lunch_df_t

In [None]:
# Attach the menu-preference feature to both tables.
# Training rows use the threshold-based labels. Test rows have no ground truth,
# so they use the LightGBM predictions computed above. Previously the test
# columns were filled with the all-zero placeholder lists (`lunch_count_test`,
# `dinner_count_test`), which left the classifier output unused.
# NOTE(review): the training labels are derived directly from the regression
# targets (중식계 / 석식계), so this feature leaks the target during training —
# consider out-of-fold classifier predictions for the training rows as well.
train['중식선호도'] = lunch_count
test['중식선호도'] = preds
train['석식선호도'] = dinner_count
test['석식선호도'] = preds_d

In [None]:
def _add_attendance_features(frame):
    """Add weekday, flag, attendance and ratio columns to `frame` in place.

    Reads 일자, 정원, 휴가자, 출장자, 재택근무자 and 야근자; overwrites 요일 and
    writes 야근_가능, 출근인원, 휴가비율, 출장비율, 야근비율 and 재택비율.
    """
    capacity = frame['정원']

    # Integer weekday taken from the parsed date (Monday is 0).
    frame['요일'] = frame['일자'].dt.weekday
    # Flag is 1 when the weekday is 2 or 4, otherwise 0.
    frame['야근_가능'] = frame['요일'].isin([2, 4]).astype('int64')
    # Capacity minus everyone on leave, on a business trip or working remotely.
    absent = frame['휴가자'] + frame['출장자'] + frame['재택근무자']
    frame['출근인원'] = capacity - absent
    frame['휴가비율'] = frame['휴가자'] / capacity
    frame['출장비율'] = frame['출장자'] / capacity
    # Unlike the other ratios, this one is relative to the attending head count.
    frame['야근비율'] = frame['야근자'] / frame['출근인원']
    frame['재택비율'] = frame['재택근무자'] / capacity


# Apply the identical feature engineering to both tables.
_add_attendance_features(train)
_add_attendance_features(test)

**정규화 BUT 휴가 전 후 고려하면 X**

In [None]:
from scipy import stats

# Z-score four columns over the full training table; each score is stored in a
# helper column named by the dict key.
zscore_sources = {
    'z': '휴가자',
    'zscale': '석식계',
    'zscale_c': '출장비율',
    'zscale_y': '야근비율',
}
for score_column, source_column in zscore_sources.items():
    train[score_column] = stats.zscore(train[source_column])

# Keep only rows whose score lies within [-1.96, 1.96] for every helper
# column. The scores are not recomputed between the successive filters.
for score_column in ('zscale', 'zscale_c', 'zscale_y', 'z'):
    train = train[train[score_column].between(-1.96, 1.96)]

In [None]:
# Mean of '석식계' for each distinct '출장비율' value, drawn as a line plot.
a = train.groupby(train['출장비율'])['석식계'].mean()
a.plot()

In [None]:
# Remove the z-score helper columns and the cleaned menu strings from the
# training table.
helper_columns = ['z', 'zscale', 'zscale_c', 'zscale_y', '중식메뉴_split', '석식메뉴_split']
train = train.drop(columns=helper_columns)

In [None]:
# Load the precipitation CSV from the working directory.
# NOTE(review): `rain_2016` is only inspected below and never joined into the
# model features in the visible cells.
rain_2016 = pd.read_csv('충무공동_강수_201602_201612.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the precipitation table.
rain_2016.info()

In [None]:
# Display the rows where `hour` equals 1100 and the value column exceeds 1.
at_1100 = rain_2016['hour'] == 1100
value_above_one = rain_2016['value location:81_75 Start : 20160201 '] > 1
rain_2016[at_1100 & value_above_one]

In [None]:
# Column subsets fed to the two regression experiments: `train_1` for the
# lunch target (중식계) and `train_2` for the dinner target (석식계).
lunch_model_columns = ['일자', '요일', '월', '년', '휴가자', '휴가비율', '출장비율',
                       '중식선호도', '중식계', '출근인원']
dinner_model_columns = ['일자', '요일', '월', '년', '휴가자', '출근인원', '야근_가능',
                        '석식선호도', '휴가비율', '출장비율', '야근비율', '석식계']

train_1 = train[lunch_model_columns]
train_2 = train[dinner_model_columns]

In [None]:
# Configure the PyCaret regression experiment for the lunch target:
# mean-impute numeric gaps, normalize features, skip the interactive prompt.
lunch_setup_options = dict(
    data=train_1,
    target='중식계',
    numeric_imputation='mean',
    normalize=True,
    silent=True,
)
reg = setup(**lunch_setup_options)

In [None]:
# Run PyCaret's model comparison with its default settings for the lunch setup.
compare_models()

In [None]:
# Repeat the comparison sorted by MAE and keep the five selected models.
best_5 = compare_models(sort='MAE', n_select=5)

In [None]:
# Blend the five selected lunch models with 5 folds, optimizing MAE.
lunch_blend_options = dict(estimator_list=best_5, fold=5, optimize='MAE')
blended = blend_models(**lunch_blend_options)

# Called without data; presumably scores PyCaret's hold-out split — confirm.
pred_holdout = predict_model(blended)

# Finalize the blended model, then predict on the test table.
final_model = finalize_model(blended)
pred1 = predict_model(final_model, test)


In [None]:
# Start from the sample submission and fill in the lunch predictions, which
# PyCaret returns in the 'Label' column.
submission = pd.read_csv('sample_submission.csv').assign(
    **{'중식계': pred1['Label']}
)

In [None]:
# Configure the PyCaret regression experiment for the dinner target, with the
# same options as the lunch one. This replaces the active lunch setup.
dinner_setup_options = dict(
    data=train_2,
    target='석식계',
    numeric_imputation='mean',
    normalize=True,
    silent=True,
)
reg = setup(**dinner_setup_options)

In [None]:
# Run PyCaret's model comparison with its default settings for the dinner setup.
compare_models()

In [None]:
# Repeat the comparison sorted by MAE and keep the five selected models
# (this reuses, and so overwrites, the `best_5` name from the lunch run).
best_5 = compare_models(sort = 'MAE', n_select = 5)

In [None]:
# Blend the five selected dinner models with 5 folds, optimizing MAE.
dinner_blend_options = dict(estimator_list=best_5, fold=5, optimize='MAE')
blended = blend_models(**dinner_blend_options)

# Called without data; presumably scores PyCaret's hold-out split — confirm.
pred_holdout = predict_model(blended)

# Finalize the blended model, then predict on the test table.
final_model = finalize_model(blended)
pred2 = predict_model(final_model, test)

In [None]:
# Display the predicted dinner values ('Label' column of the prediction frame).
pred2['Label']

In [None]:
# Write the dinner predictions into the submission frame; the assignment
# aligns the two on their index.
submission['석식계'] = pred2['Label']

In [None]:
# Display the completed submission frame.
submission

In [None]:
# Write the submission to CSV in the working directory, without the index column.
submission.to_csv('sub_4968_3099_lightgbm선호도처리2.csv', index=False)