In [None]:
import numpy as np      
import pandas as pd       
import matplotlib.pyplot as plt   
import seaborn as sns
import time
import re
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Read the three competition CSVs (train, test, sample submission) in one pass.
# NOTE(review): `path` is not defined in the visible cells -- confirm it is set
# (with a trailing separator) before this cell runs on a fresh kernel.
train, test, submission = (
    pd.read_csv(path + csv_name)
    for csv_name in ('news_train.csv', 'news_test.csv', 'sample_submission.csv')
)

In [None]:
def _build_review(frame):
    """Return one text per row: ``title + ' ' + content`` with outer whitespace stripped.

    Missing values in either column are treated as empty strings. Previously only
    ``title`` was cast to ``str``: a missing ``content`` turned the concatenation
    into NaN (a float) and the subsequent ``.strip()`` call raised
    ``AttributeError``, while a missing ``title`` leaked the literal token
    ``'nan'`` into the text.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``title`` and ``content``.

    Returns
    -------
    pandas.Series
        String series aligned with ``frame.index``.
    """
    title = frame['title'].fillna('').astype(str)
    content = frame['content'].fillna('').astype(str)
    # Vectorised strip; also removes the dangling separator when one side is empty.
    return (title + ' ' + content).str.strip()


# Same construction for both frames so train and test features stay consistent.
train['review'] = _build_review(train)
test['review'] = _build_review(test)

In [None]:
# Preview the first rows of `train`, including the newly added `review` column.
train.head()

Unnamed: 0,n_id,date,title,content,ord,info,review
0,NEWS02580,20200605,[마감]코스닥 기관 678억 순매도,[이데일리 MARKETPOINT]15:32 현재 코스닥 기관 678억 순매도,1,0,[마감]코스닥 기관 678억 순매도 [이데일리 MARKETPOINT]15:32 현재...
1,NEWS02580,20200605,[마감]코스닥 기관 678억 순매도,"""실적기반"" 저가에 매집해야 할 8월 급등유망주 TOP 5 전격공개",2,1,"[마감]코스닥 기관 678억 순매도 ""실적기반"" 저가에 매집해야 할 8월 급등유망주..."
2,NEWS02580,20200605,[마감]코스닥 기관 678억 순매도,"하이스탁론, 선취수수료 없는 월 0.4% 최저금리 상품 출시",3,1,"[마감]코스닥 기관 678억 순매도 하이스탁론, 선취수수료 없는 월 0.4% 최저금..."
3,NEWS02580,20200605,[마감]코스닥 기관 678억 순매도,종합 경제정보 미디어 이데일리 - 무단전재 & 재배포 금지,4,0,[마감]코스닥 기관 678억 순매도 종합 경제정보 미디어 이데일리 - 무단전재 & ...
4,NEWS09727,20200626,"롯데·공영 등 7개 TV 홈쇼핑들, 동행세일 동참",전국적인 소비 붐 조성에 기여할 예정,1,0,"롯데·공영 등 7개 TV 홈쇼핑들, 동행세일 동참 전국적인 소비 붐 조성에 기여할 예정"


In [None]:
# Build the model inputs from the combined `review` text.
# The test texts are only transformed later; they take no part in the split.
X_test = test['review'].values.astype('str')

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical across runs. Labels come from the `info` column.
X_train, X_val, y_train, y_val = train_test_split(
    train['review'].values.astype('str'),
    train['info'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: print the shape of each feature/label partition, features first.
for split in (X_train, X_val, y_train, y_val):
    print(split.shape)

(94996,)
(23749,)
(94996,)
(23749,)


## TfidfVectorizer - PassiveAggressiveClassifier

In [None]:
# Time the whole cell (vectorising, grid search, prediction, export).
# perf_counter is monotonic, so the elapsed value cannot be skewed by clock adjustments.
start = time.perf_counter()

# TF-IDF over unigrams and bigrams; terms whose document frequency exceeds 80% are ignored.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, ngram_range=(1, 2))

# Learn vocabulary and IDF weights on the training split only, then reuse them
# unchanged for the validation and test texts.
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_val = tfidf_vectorizer.transform(X_val)
tfidf_test = tfidf_vectorizer.transform(X_test)

# Base estimator for the search. The classifier shuffles the data and, with
# early_stopping=True, holds out a random validation fraction, so random_state
# is pinned to make the selected parameters and the submission reproducible.
pa_classifier = PassiveAggressiveClassifier(
    C=0.01,
    max_iter=1000,
    loss='squared_hinge',
    early_stopping=True,
    validation_fraction=0.1,
    random_state=42,
)

# Grid over the classifier's hyper-parameters only: the vectoriser above is
# fitted once and is not part of the search.
# NOTE(review): because the vectoriser has already seen the full training split,
# the CV folds share vocabulary/IDF statistics -- wrap both steps in a Pipeline
# if a leak-free cross-validation estimate is required.
param_grid = {'C': [0.003, 0.01, 0.03, 0.1],
              'max_iter': [100, 1000, 10000],
              'loss': ['hinge', 'squared_hinge'],
              'validation_fraction': [0.1, 0.2, 0.3]}

# n_jobs=-1 evaluates candidates in parallel; the fixed seed keeps results identical.
model = GridSearchCV(pa_classifier, param_grid, scoring='accuracy', n_jobs=-1)
model.fit(tfidf_train, y_train)
print(model.best_score_)
print(model.best_params_)

# Evaluate the refitted best estimator on the held-out validation split.
pred_val = model.predict(tfidf_val)
print(f'Validation accuracy: {accuracy_score(y_val, pred_val)}')

# Accuracy recomputed from the confusion matrix: correct predictions lie on the
# diagonal, so trace / total holds for any number of classes (the previous
# 1 - (m[0][1] + m[1][0]) / n form was only valid for a 2x2 matrix).
matrix = confusion_matrix(y_val, pred_val)
score = np.trace(matrix) / matrix.sum()
print(matrix)
print(f'대회 Accuracy: {round(score*100,3)}%')

# Predict the test set and write the submission file.
y_pred = model.predict(tfidf_test)
df = pd.DataFrame({'id': test['id'], 'info': y_pred})
df.to_csv(path + "sample_submission_whole.csv", mode='w', index=False)

# Elapsed time in seconds.
print("time: {:.4f}".format(time.perf_counter() - start))

0.9807149589590587
{'C': 0.1, 'loss': 'squared_hinge', 'max_iter': 100, 'validation_fraction': 0.1}
Validation accuracy: 0.9852625373699945
[[14352   121]
 [  229  9047]]
대회 Accuracy: 99.705%
time: 127.7239
