### 基于机器学习的多标签分类任务与情感二分类任务
#### 1. 线性核SVM
#### 2. 随机森林
#### 3. 逻辑回归

In [7]:
import numpy as np
import pandas as pd
import pickle
import scipy

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from Embedding import tfidf, word2vec
from sklearn.metrics import accuracy_score, f1_score

### Multilabel Classification

In [2]:
# Load the preprocessed review table and split it into train / test partitions.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load data/teatext_preprocessed.pkl when it comes from a trusted source.
with open('data/teatext_preprocessed.pkl', 'rb') as f:
    teatext_preprocessed = pickle.load(f)

# A fixed random_state makes the 80/20 split -- and every score derived from
# it further down -- reproducible across kernel restarts.
train_data, test_data = train_test_split(teatext_preprocessed, test_size=0.2,
                                         random_state=1)

# The "rateContent" column is the model input.
x_train, x_test = train_data["rateContent"], test_data["rateContent"]
# Every column except the text and the sentiment column is a multilabel target.
y_train, y_test = train_data.drop(["rateContent","sentiment"], axis=1), test_data.drop(["rateContent","sentiment"], axis=1)
# The "sentiment" column is the target of the sentiment task.
y_sentiment_train,y_sentiment_test  = train_data["sentiment"], test_data["sentiment"]

# Feature extraction with the project-local Embedding helpers.
# NOTE(review): presumably tfidf fits on x_train and transforms both inputs,
# and word2vec embeds each input independently -- confirm in Embedding.py.
tfidf_train, tfidf_test = tfidf(x_train, x_test)
word2vec_train = word2vec(x_train)
word2vec_test = word2vec(x_test)

In [3]:
def estimate(x_test, y_test, classifier):
    """
    Score a fitted multilabel classifier on held-out data.

    :param x_test: feature input accepted by ``classifier.predict`` and
        ``classifier.score``
    :param y_test: DataFrame of 0/1 label indicators, one column per label
    :param classifier: fitted classifier exposing ``predict`` and ``score``
    :return: tuple ``(acc, f1)`` where ``acc`` is
        ``classifier.score(x_test, y_test)`` and ``f1`` is the micro-averaged
        F1 pooled over every (sample, label) cell; ``f1`` is 0.0 when there
        are no true positives, false positives or false negatives at all
    :raises ValueError: if the prediction shape differs from ``y_test``'s shape
    """
    truth = np.asarray(y_test.values)
    y_pred = np.asarray(classifier.predict(x_test))
    if truth.shape != y_pred.shape:
        # Guard against silent numpy broadcasting between mismatched arrays.
        raise ValueError(
            "prediction shape %s does not match label shape %s"
            % (y_pred.shape, truth.shape)
        )

    # Only cells that are exactly 0 or 1 take part in the counts.
    true_pos, pred_pos = truth == 1, y_pred == 1
    true_neg, pred_neg = truth == 0, y_pred == 0
    TP = int(np.count_nonzero(true_pos & pred_pos))
    FP = int(np.count_nonzero(true_neg & pred_pos))
    FN = int(np.count_nonzero(true_pos & pred_neg))

    # With no positive anywhere the F1 ratio is 0/0; report 0.0 instead of
    # raising ZeroDivisionError.
    denominator = 2 * TP + FP + FN
    f1 = 2 * TP / denominator if denominator else 0.0

    return classifier.score(x_test,y_test), f1

In [4]:
# Multilabel task, linear-kernel SVM: MultiOutputClassifier wraps one SVC.
clf1 = MultiOutputClassifier(estimator=SVC(kernel='linear'))

# TF-IDF features (fit returns the fitted wrapper, so it is scored directly).
acc_1_tfidf, f1_1_tfidf = estimate(
    tfidf_test, y_test, clf1.fit(tfidf_train, y_train)
)

# word2vec features -- refits the same clf1 object.
acc_1_word2vec, f1_1_word2vec = estimate(
    word2vec_test, y_test, clf1.fit(word2vec_train, y_train)
)

In [5]:
# Multilabel task, random forest wrapped in MultiOutputClassifier.
clf2 = MultiOutputClassifier(
    estimator=RandomForestClassifier(random_state=1),
    n_jobs=-1,
)

# TF-IDF features (fit returns the fitted wrapper, so it is scored directly).
acc_2_tfidf, f1_2_tfidf = estimate(
    tfidf_test, y_test, clf2.fit(tfidf_train, y_train)
)

# word2vec features -- refits the same clf2 object.
acc_2_word2vec, f1_2_word2vec = estimate(
    word2vec_test, y_test, clf2.fit(word2vec_train, y_train)
)

In [6]:
# Logistic Regression (multilabel task, wrapped in MultiOutputClassifier)
# max_iter is raised above the library default because the recorded run of
# this cell printed "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the
# solver stopped before converging and the reported scores came from
# unconverged models.
# NOTE(review): if the warning persists, scale the features as the warning
# message itself suggests.
clf3 = MultiOutputClassifier(LogisticRegression(C=5, max_iter=1000))

# TF-IDF features
clf3.fit(tfidf_train,y_train)
acc_3_tfidf, f1_3_tfidf = estimate(tfidf_test,y_test,clf3)

# word2vec features -- refits the same clf3 object
clf3.fit(word2vec_train,y_train)
acc_3_word2vec, f1_3_word2vec = estimate(word2vec_test,y_test,clf3)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### Sentiment Classification

In [9]:
# Sentiment task, linear-kernel SVM.
# TF-IDF features: fit returns the estimator itself, so construct and fit in one step.
clf4 = SVC(kernel='linear').fit(tfidf_train, y_sentiment_train)
f1_4_tfidf = f1_score(y_sentiment_test, clf4.predict(tfidf_test))
acc_4_tfidf = clf4.score(tfidf_test, y_sentiment_test)

# word2vec features -- refits the same clf4 object.
clf4.fit(word2vec_train, y_sentiment_train)
f1_4_word2vec = f1_score(y_sentiment_test, clf4.predict(word2vec_test))
acc_4_word2vec = clf4.score(word2vec_test, y_sentiment_test)

In [10]:
# Sentiment task, random forest with a fixed seed.
# TF-IDF features: fit returns the estimator itself, so construct and fit in one step.
clf5 = RandomForestClassifier(random_state=1).fit(tfidf_train, y_sentiment_train)
f1_5_tfidf = f1_score(y_sentiment_test, clf5.predict(tfidf_test))
acc_5_tfidf = clf5.score(tfidf_test, y_sentiment_test)

# word2vec features -- refits the same clf5 object.
clf5.fit(word2vec_train, y_sentiment_train)
f1_5_word2vec = f1_score(y_sentiment_test, clf5.predict(word2vec_test))
acc_5_word2vec = clf5.score(word2vec_test, y_sentiment_test)

In [11]:
# Logistic Regression (sentiment task)
# max_iter is raised above the library default because the recorded run of
# this cell printed "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the
# solver stopped before converging and the reported scores came from
# unconverged models.
# NOTE(review): if the warning persists, scale the features as the warning
# message itself suggests.
clf6 = LogisticRegression(C=5, max_iter=1000)

# TF-IDF features
clf6.fit(tfidf_train,y_sentiment_train)
acc_6_tfidf = clf6.score(tfidf_test,y_sentiment_test)
f1_6_tfidf = f1_score(y_sentiment_test,clf6.predict(tfidf_test))

# word2vec features -- refits the same clf6 object
clf6.fit(word2vec_train,y_sentiment_train)
acc_6_word2vec = clf6.score(word2vec_test,y_sentiment_test)
f1_6_word2vec = f1_score(y_sentiment_test,clf6.predict(word2vec_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
