## Lgistic Regression with Word2Vec(+stemming)

In [1]:
import os
import re

import pandas as pd
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
data_in_path = './data_in/'
data_out_path = './data_out/'
train_data = 'train_data.csv'
test_data = 'test_data.csv'

random_seed = 10
test_split = 0.2

In [3]:
train_data = pd.read_csv(data_in_path + train_data)

In [4]:
train_clause = list(train_data['clauses'])
train_label = list(train_data['label'])

In [5]:
sentences = []
for clauses in train_clause:
    sentences.append(clauses.split())

In [6]:
num_features = 100
min_word_count = 40
num_workers = 6
context = 2
downsampling = 1e-3

In [8]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

In [9]:
from gensim.models import word2vec

model = word2vec.Word2Vec(sentences, workers=num_workers, \
           size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

2019-11-27 16:10:28,120 : INFO : collecting all words and their counts
2019-11-27 16:10:28,122 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-27 16:10:28,130 : INFO : collected 2139 word types from a corpus of 24305 raw words and 1659 sentences
2019-11-27 16:10:28,131 : INFO : Loading a fresh vocabulary
2019-11-27 16:10:28,133 : INFO : effective_min_count=40 retains 131 unique words (6% of original 2139, drops 2008)
2019-11-27 16:10:28,134 : INFO : effective_min_count=40 leaves 12552 word corpus (51% of original 24305, drops 11753)
2019-11-27 16:10:28,135 : INFO : deleting the raw counts dictionary of 2139 items
2019-11-27 16:10:28,135 : INFO : sample=0.001 downsamples 131 most-common words
2019-11-27 16:10:28,136 : INFO : downsampling leaves estimated 5907 word corpus (47.1% of prior 12552)
2019-11-27 16:10:28,136 : INFO : estimated required memory for 131 words and 100 dimensions: 170300 bytes
2019-11-27 16:10:28,137 : INFO : resetting layer weight

In [10]:
model_name = '1121-stem_word2vec_model'
model.save(data_out_path+model_name)

2019-11-27 16:10:28,244 : INFO : saving Word2Vec object under ./data_out/1121-stem_word2vec_model, separately None
2019-11-27 16:10:28,245 : INFO : not storing attribute vectors_norm
2019-11-27 16:10:28,245 : INFO : not storing attribute cum_table
2019-11-27 16:10:28,249 : INFO : saved ./data_out/1121-stem_word2vec_model


In [11]:
def get_features(words, model, num_features):
    feature_vector = np.zeros((num_features),dtype=np.float64)

    num_words = 0
    index2word_set = set(model.wv.index2word)

    for w in words:
        if w in index2word_set:
            num_words += 1
            feature_vector = np.add(feature_vector, model[w])

    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [12]:
def get_dataset(clauses, model, num_features):
    dataset = list()

    for s in clauses:
        dataset.append(get_features(s, model, num_features))

    clauseFeatureVecs = np.stack(dataset)
    
    return clauseFeatureVecs

In [13]:
train_data_vecs = get_dataset(sentences, model, num_features)

  # Remove the CWD from sys.path while we load stuff.


In [14]:
from sklearn.model_selection import train_test_split
import numpy as np

vec = train_data_vecs
lab = np.array(train_label)

vec = np.nan_to_num(vec)  # NaN 값을 0으로 모두 치환

vec_train, vec_eval, lab_train, lab_eval = train_test_split(vec, lab, test_size=test_split, random_state=random_seed)

In [15]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(class_weight = 'balanced')

log.fit(vec_train, lab_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
print("Accuracy of training: %f" % log.score(vec_train, lab_train))
print("Accuracy: %f" % log.score(vec_eval, lab_eval))

Accuracy of training: 0.596835
Accuracy: 0.596386


## 여기서부터 테스트

In [17]:
test_data = pd.read_csv(data_in_path + test_data)

test_clause = list(test_data['clauses'])

In [18]:
test_clauses = []

for clauses in test_clause:
    test_clauses.append(clauses.split())

In [19]:
test_data_vecs = get_dataset(test_clauses, model, num_features)
test_vecs = np.nan_to_num(test_data_vecs)

  # Remove the CWD from sys.path while we load stuff.


In [20]:
# 위에서 만든 랜덤 포레스트 분류기를 통해 예측값을 가져온다.
result = log.predict(test_vecs)

In [21]:
import os
# 테스트 데이터 파일에 쓰기 -

test_clauses = list(test_data['clauses'])
test_label = list(test_data['label'])

if not os.path.exists(data_out_path):
    os.makedirs(data_out_path)

# 판다스 데이터 프레임을 통해 데이터를 구성해서 output에 넣는다.
output = pd.DataFrame( data={"label": test_label ,  "predict": result} )

# 이제 csv파일로 만든다.
output.to_csv(data_out_path + "lg-w2v-predict-test-stem.csv", index=False, quoting=3 )

In [22]:
from sklearn import metrics

fpr, tpr, _ = metrics.roc_curve(lab_eval, (log.predict_proba(vec_eval)[:, 1]))
auc = metrics.auc(fpr, tpr)

print("------------")
print("Accuracy: %f" % log.score(test_vecs, test_data['label']))  #checking the accuracy
print("Precision: %f" % metrics.precision_score(test_data['label'], result))
print("Recall: %f" % metrics.recall_score(test_data['label'], result))
print("F1-Score: %f" % metrics.f1_score(test_data['label'], result))

------------
Accuracy: 0.567633
Precision: 0.615702
Recall: 0.634043
F1-Score: 0.624738
