In [21]:
import re
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE=42
target = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\u1128714\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
def metrics_classification(forest, X_test, y_test, val=None):
    print(val, ":")
    print("====================model valuation==================")

    #print("\n混淆矩阵为：")
    #print(metrics.confusion_matrix(y_test, forest.predict(X_test)))

    #print("\n准确率、召回率和F1值为：")
    #print(metrics.classification_report(y_test, forest.predict(X_test)))
        
    print("\nAUC Score为：")
    y_predprob = forest.predict_proba(X_test)[:,1]
    print("%.2f" % roc_auc_score(y_test, y_predprob, multi_class='ovo'))

    print("\naccuracy:")
    print("%.2f" % accuracy_score(y_test, forest.predict(X_test)))
    print("\ncross_val_score:")
    print("%.2f" % cross_val_score(forest, X_test, y_test, cv=3).mean())
    print("\n")

In [23]:
def model_training(X, target, df, test_size=0.2, RANDOM_STATE=42): 
    if isinstance(target, list):
        print('multi-target')
    #multi-target
        for val in target:
            X_train, X_test, y_train, y_test = train_test_split(X, df[val], test_size=test_size, random_state=RANDOM_STATE)
            forest = RandomForestClassifier(oob_score=True,n_estimators = 600, max_depth=7).fit(X_train, y_train)
            XGBoost = GradientBoostingClassifier().fit(X_train, y_train)
            for model in [forest, XGBoost]:
                print('model:', model)          
                metrics_classification(val=val, forest=model, X_test=X_test, y_test=y_test)        
    else:
        print('single-target')
        X_train, X_test, y_train, y_test = train_test_split(X, df[target], test_size=test_size, random_state=RANDOM_STATE)
        forest = RandomForestClassifier(oob_score=True,n_estimators = 600, max_depth=7).fit(X_train, y_train)
        XGBoost = GradientBoostingClassifier().fit(X_train, y_train)
        for model in [forest, XGBoost]:
            print('model:', model)          
            metrics_classification(forest=model, X_test=X_test, y_test=y_test)

In [24]:
forest = RandomForestClassifier(oob_score=True,n_estimators = 600, max_depth=7)
XGBoost = GradientBoostingClassifier()
for model in [forest, XGBoost]:
    print('model:', model)

model: RandomForestClassifier(max_depth=7, n_estimators=600, oob_score=True)
model: GradientBoostingClassifier()


# Importing Data

In [25]:
# escapechar='\\'用来去掉转义字符'\'
data_raw = pd.read_csv('essays.csv')
print('Number of reviews: {}'.format(len(data_raw)))
data_raw = data_raw[['text', 'cEXT','cNEU','cAGR','cCON','cOPN']]
data_raw.head()

Number of reviews: 2467


Unnamed: 0,text,cEXT,cNEU,cAGR,cCON,cOPN
0,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,I can't believe it! It's really happening! M...,y,n,y,y,n
4,"Well, here I go with the good old stream of co...",y,n,y,n,y


# Data Preprocessing

In [26]:
data = data_raw.copy()
eng_stopwords = stopwords.words('english')
# 去掉html标签, 非英文字符, 去停用词, 重新组合为句子
def clean_text(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]
    return ' '.join(words)

data['clean_review'] = data.text.apply(clean_text)
data.head()

Unnamed: 0,text,cEXT,cNEU,cAGR,cCON,cOPN,clean_review
0,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y,well right woke mid day nap sort weird ever si...
1,"Well, here we go with the stream of consciousn...",n,n,y,n,n,well go stream consciousness essay used things...
2,An open keyboard and buttons to push. The thin...,n,y,n,y,y,open keyboard buttons push thing finally worke...
3,I can't believe it! It's really happening! M...,y,n,y,y,n,believe really happening pulse racing like mad...
4,"Well, here I go with the good old stream of co...",y,n,y,n,y,well go good old stream consciousness assignme...


In [27]:
# VSM抽取文本特征
# 统计词频，作为文本特征，计算文本-单词矩阵
def VSM_text_feature(X, max_features = 800):
    vectorizer_freq = CountVectorizer(max_features = max_features) 
    train_vsm_freq = vectorizer_freq.fit_transform(X).toarray()
    print("以词频为元素的文本-单词矩阵的维度是: ", train_vsm_freq.shape)
    return train_vsm_freq

In [28]:
def tfidf_text_feature(X, max_features = 800):
    vectorizer_tfidf = TfidfVectorizer(max_features = max_features) 
    train_vsm_tfidf = vectorizer_tfidf.fit_transform(X).toarray()
    print("以tfidf为元素的文本-单词矩阵的维度是: ", train_vsm_tfidf.shape)
    return train_vsm_tfidf

In [29]:
def LSA_text_feature(X, n_components=300):
    # 对以词频为特征的单词-文本矩阵进行NMF分解
    nmf = NMF(n_components=n_components)
    # 得到话题-文本矩阵，注意如果输入进行了转置，那么得到的是单词-话题矩阵
    train_lsa_freq = nmf.fit_transform(VSM_text_feature(X)) 
    print("话题-文本矩阵的维度是: ",train_lsa_freq.shape)
    return train_lsa_freq

In [30]:
"""使用sklearn计算2-gram，得到词语-文本矩阵"""
def Ngram_text_feature(X):
    # token_pattern的作用是，出现"bi-gram"、"two:three"这种时，可以切成"bi gram"、"two three"的形式
    vectorizer_2gram = CountVectorizer(ngram_range=(2,2),token_pattern=r'\b\w+\b',max_features=700) 
    train_vsm_2gram = vectorizer_2gram.fit_transform(X).toarray()
    print("2-gram构成的语料库中前10个元素为：")
    print(vectorizer_2gram.get_feature_names()[:10])
    return train_vsm_2gram

In [31]:
vsm = VSM_text_feature(X=data.clean_review)
tf_idf = tfidf_text_feature(X=data.clean_review)
lsa = LSA_text_feature(X=data.clean_review, n_components=300)
n_gram =  Ngram_text_feature(X=data.clean_review)

以词频为元素的文本-单词矩阵的维度是:  (2467, 800)
以tfidf为元素的文本-单词矩阵的维度是:  (2467, 800)
以词频为元素的文本-单词矩阵的维度是:  (2467, 800)
话题-文本矩阵的维度是:  (2467, 300)
2-gram构成的语料库中前10个元素为：
['able get', 'able go', 'act like', 'almost done', 'also need', 'also want', 'always feel', 'always get', 'always seem', 'always think']


In [36]:
list_text_feature = [vsm, tf_idf, lsa, n_gram]
for text_feature_method in list_text_feature:
    print('=========================================================')
    model_training(text_feature_method, target, df=data)
    print('=========================================================')

multi-target
model: RandomForestClassifier(max_depth=7, n_estimators=600, oob_score=True)
cEXT :

AUC Score为：
0.41

accuracy:
0.59

cross_val_score:
0.55


model: GradientBoostingClassifier()
cEXT :

AUC Score为：
0.46

accuracy:
0.53

cross_val_score:
0.59


model: RandomForestClassifier(max_depth=7, n_estimators=600, oob_score=True)
cNEU :

AUC Score为：
0.63

accuracy:
0.60

cross_val_score:
0.53


model: GradientBoostingClassifier()
cNEU :

AUC Score为：
0.59

accuracy:
0.55

cross_val_score:
0.55


model: RandomForestClassifier(max_depth=7, n_estimators=600, oob_score=True)
cAGR :

AUC Score为：
0.53

accuracy:
0.54

cross_val_score:
0.55


model: GradientBoostingClassifier()
cAGR :

AUC Score为：
0.52

accuracy:
0.52

cross_val_score:
0.55


model: RandomForestClassifier(max_depth=7, n_estimators=600, oob_score=True)
cCON :

AUC Score为：
0.57

accuracy:
0.54

cross_val_score:
0.56


model: GradientBoostingClassifier()
cCON :

AUC Score为：
0.56

accuracy:
0.54

cross_val_score:
0.55


model: 

In [37]:
#test single-target
target = 'cEXT'
model_training(lsa, target, df=data)

single-target
model: RandomForestClassifier(max_depth=7, n_estimators=600, oob_score=True)
None :

AUC Score为：
0.40

accuracy:
0.61

cross_val_score:
0.50


model: GradientBoostingClassifier()
None :

AUC Score为：
0.42

accuracy:
0.57

cross_val_score:
0.53


