In [None]:
# Attach Google Drive to the Colab runtime's filesystem.
from google.colab import drive

MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT)

In [None]:
# Move into the course folder on the mounted Drive; the relative ./data paths
# used when loading the CSVs resolve from this directory.
%cd /content/gdrive/MyDrive/2021_2학기/기계학습

In [None]:
# Sanity check: show the current working directory.
!pwd

In [None]:
# Sanity check: list the working directory (the loading cell reads from ./data).
!ls

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import urllib.request
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# **데이터 분석 및 정제**

In [None]:
# Load the train and test CSVs (paths are relative to the current working directory).
train_data = pd.read_csv('./data/train.csv', encoding='utf-8')
test_data = pd.read_csv('./data/test.csv', encoding='utf-8')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print('train data info :')
train_data.info()
print('test data info :')
test_data.info()

In [None]:
# Report how many rows each frame holds (shape[0] is the row count).
print('train sample count : ', train_data.shape[0])
print('test sample count : ', test_data.shape[0])

In [None]:
# Peek at the first three rows of the training frame.
train_data.head(3)

In [None]:
# Peek at the first three rows of the test frame.
test_data.head(3)

In [None]:
# Remove training rows whose 'mail' text repeats an earlier row
# (drop_duplicates keeps the first occurrence by default).
train_data = train_data.drop_duplicates(subset=['mail'])
print('train sample count (drop dup) : ', len(train_data.index))

In [None]:
# Pull the individual columns out of the frames (DataFrame -> Series).
label_data = train_data['label']
mail_data = train_data['mail']
mail_test = test_data['mail']

# Print the type and length of each extracted Series under its own heading.
for _heading, _series in (
    ('train label data info :', label_data),
    ('train mail data info :', mail_data),
    ('test mail data info :', mail_test),
):
    print(_heading)
    print(type(_series))
    print(len(_series))

In [None]:
# Hold out 20% of the training mails as a validation (cv) split.
# stratify=label_data keeps the label proportions the same in both splits,
# and the fixed random_state makes the split repeatable.
mail_train, mail_cv, label_train, label_cv = train_test_split(
    mail_data,
    label_data,
    test_size=0.2,
    stratify=label_data,
    random_state=0,
)

### **토큰화 및 정수 인코딩**

In [None]:
# Build the vocabulary from the TRAINING split only, then encode every split
# with this same fitted tokenizer. A Tokenizer fitted on a different corpus
# assigns different integers to the same words (ids follow word frequency in
# the fitted corpus), so re-fitting on cv/test would make those encodings
# incompatible with anything learned from mail_train.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(mail_train)

word_to_index = tokenizer.word_index
# +1 because index 0 is never assigned to a word by the Keras Tokenizer; it is
# the value pad_sequences fills with by default.
vocab_size = len(word_to_index) + 1

# --- train ---
mail_train_encoded = tokenizer.texts_to_sequences(mail_train)
print(type(mail_train_encoded))
# Preview a few encoded mails instead of dumping the entire list of lists.
print(mail_train_encoded[:3])

# Common sequence length for all splits: the longest training mail, in tokens.
max_len = max(len(seq) for seq in mail_train_encoded)
mail_train_padded = pad_sequences(mail_train_encoded, maxlen=max_len)
# mail_train_padded : class 'numpy.ndarray'
print(type(mail_train_padded))
print(mail_train_padded)

# --- cv ---
# Words that were not seen during fitting are skipped by texts_to_sequences
# (no oov_token is configured).
mail_cv_encoded = tokenizer.texts_to_sequences(mail_cv)
mail_cv_padded = pad_sequences(mail_cv_encoded, maxlen=max_len)

# --- test ---
mail_test_encoded = tokenizer.texts_to_sequences(mail_test)
# Longest test mail in tokens; kept for inspection only. Padding uses max_len
# so the test matrix has the same width as the training matrix. Sequences
# longer than max_len are truncated by pad_sequences (from the front by default).
max_len_test = max(len(seq) for seq in mail_test_encoded)
mail_test_padded = pad_sequences(mail_test_encoded, maxlen=max_len)
print(type(mail_test_padded))
print(mail_test_padded)

# All three matrices must share one sequence length.
assert mail_train_padded.shape[1] == mail_cv_padded.shape[1] == mail_test_padded.shape[1] == max_len

In [None]:
# Show the word -> integer id mapping and the resulting vocabulary size.
print(word_to_index)
print('vocabulary size : {}'.format(vocab_size))

# Shapes of the padded matrices for each split.
for _caption, _matrix in (
    ('train data shape :', mail_train_padded),
    ('cv data shape :', mail_cv_padded),
    ('test data shape :', mail_test_padded),
):
    print(_caption, _matrix.shape)