#資料集準備

In [None]:
import json
from pandas.io.json import json_normalize
import numpy as np
import pandas as pd

In [None]:
# Load the raw JSON export (data file location is built from data_path).
# The context manager closes the file handle as soon as parsing is finished;
# the original left the handle open for the lifetime of the kernel.
with open(data_path + 'Data/' + 'df_depressive.json', 'r', encoding='utf-8') as input_file:
    data = json.load(input_file)
df_depressive = json_normalize(data)  # json -> dataframe
df_depressive.head()

In [None]:
# Drop the columns that are not needed for the analysis:
# keep only the two label columns and the text column.
df_depressive = df_depressive.loc[:, ['Depressive', 'Emotion', 'Format_Content']]
df_depressive.head()

In [None]:
# Boolean mask: rows with Depressive == 1 that also have a non-missing Emotion value.
myfilter = df_depressive['Depressive'].eq(1) & df_depressive['Emotion'].notna()
df_depressive.loc[myfilter].head()

In [None]:
# Emotion label distribution among the rows selected by myfilter.
df_depressive.loc[myfilter, 'Emotion'].value_counts()

0.0    8859
1.0    2910
Name: Emotion, dtype: int64

In [None]:
# Keep only the rows selected by myfilter from here on.
df_depressive = df_depressive.loc[myfilter]

In [None]:
# After filtering, the Depressive flag is no longer needed; remove the column.
df_depressive = df_depressive.drop(columns='Depressive')

#資料描述

In [None]:
# The two classes differ considerably in sample size, so the models below are
# trained with the parameter class_weight='balanced'.
df_depressive.loc[:, 'Emotion'].value_counts()

0.0    8859
1.0    2910
Name: Emotion, dtype: int64

#SVM

In [None]:
# Feature: the Format_Content column; target: the Emotion label.
x = df_depressive['Format_Content']
y = df_depressive['Emotion']

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset: 70 % train / 30 % test, stratified on the label so both
# splits keep the same class ratio; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import metrics

In [None]:
# SVC reference: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
# Pipeline: word-level TF-IDF features -> RBF-kernel SVM with balanced class weights.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    lowercase=False,
    use_idf=True,
)
rbfsvm_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', SVC(kernel='rbf', class_weight='balanced', random_state=0)),
])

# Search space: 2 * 3 * 3 * 2 * 3 = 108 combinations of vectorizer settings and SVM C.
param_grid = [dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    vect__max_df=[0.98, 0.95, 0.90],
    vect__min_df=[0.1, 0.005, 0.0025],
    vect__norm=['l1', 'l2'],
    clf__C=[1.0, 10.0, 100.0],
)]

# 10-fold cross-validated grid search, ranked by ROC AUC, using all available cores.
gs_rbfsvm_tfidf = GridSearchCV(
    estimator=rbfsvm_tfidf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the training split only.
# Long-running: the recorded run (1080 fits, 2 workers) took about 268 minutes.
gs_rbfsvm_tfidf.fit(X=x_train, y=y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 80.2min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 148.5min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 245.2min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed: 267.9min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        

In [None]:
# Report the winning hyper-parameters and their mean cross-validated ROC AUC.
print('Best parameter set: %s ' % (gs_rbfsvm_tfidf.best_params_,))
print('CV AUC: %.3f' % (gs_rbfsvm_tfidf.best_score_,))

Best parameter set: {'clf__C': 1.0, 'vect__max_df': 0.98, 'vect__min_df': 0.0025, 'vect__ngram_range': (1, 1), 'vect__norm': 'l2'} 
CV AUC: 0.868


In [None]:
# Evaluate the refit best pipeline on the held-out test split.
clf = gs_rbfsvm_tfidf.best_estimator_
# Pipeline.score() delegates to the classifier's score(), i.e. mean accuracy,
# so the value previously printed under the "Test AUC" label was accuracy.
# ROC AUC needs continuous scores, so it is computed from the SVM decision values.
test_auc = metrics.roc_auc_score(y_test, clf.decision_function(x_test))
print('Test AUC: %.3f' % test_auc)

Test AUC: 0.830


In [None]:
# Hard class predictions on the test split, summarised as a confusion matrix
# (rows: true labels, columns: predicted labels).
pred = clf.predict(x_test)
print("confusion matrix:")
print(metrics.confusion_matrix(y_true=y_test, y_pred=pred))

confusion matrix:
[[2380  278]
 [ 322  551]]


#儲存和呼叫模型

In [None]:
# Record the installed package versions so the saved model can be matched
# to the environment it was trained in.
!pip freeze

absl-py==0.12.0
alabaster==0.7.12
albumentations==0.1.12
altair==4.1.0
appdirs==1.4.4
argon2-cffi==20.1.0
arviz==0.11.2
astor==0.8.1
astropy==4.2.1
astunparse==1.6.3
async-generator==1.10
atari-py==0.2.9
atomicwrites==1.4.0
attrs==21.2.0
audioread==2.1.9
autograd==1.3
Babel==2.9.1
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==3.3.0
blis==0.4.1
bokeh==2.3.3
Bottleneck==1.3.2
branca==0.4.2
bs4==0.0.1
CacheControl==0.12.6
cached-property==1.5.2
cachetools==4.2.2
catalogue==1.0.0
certifi==2021.5.30
cffi==1.14.6
cftime==1.5.0
chardet==3.0.4
charset-normalizer==2.0.2
click==7.1.2
cloudpickle==1.3.0
cmake==3.12.0
cmdstanpy==0.9.5
colorcet==2.0.6
colorlover==0.3.0
community==1.0.0b1
contextlib2==0.5.5
convertdate==2.3.2
coverage==3.7.1
coveralls==0.5
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.2.6
cvxpy==1.0.31
cycler==0.10.0
cymem==2.0.5
Cython==0.29.23
daft==0.0.4
dask==2.12.0
datascience==0.10.6
debugpy==1.0.0
decorator==4.4.2
defusedxml==0.7.1
descartes==1.1.0
dill==0.3.4
distributed==1.25.3
d

In [None]:
import pickle

In [None]:
# Save the model: serialise the current clf to <data_path>mymodel.pkl
# using pickle protocol 4.
pkl_filename = data_path + "mymodel.pkl"
with open(pkl_filename, mode='wb') as file:
  pickle.dump(clf, file, protocol=4)

In [None]:
# Load the model back from <data_path>mymodel.pkl.
# The context manager closes the file handle; the original passed a bare
# open(...) to pickle.load and never closed it.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this notebook or another trusted source.
with open(data_path + 'mymodel.pkl', 'rb') as file:
  clf = pickle.load(file)

#其他演算法

In [None]:
# Feature: the Format_Content column; target: the Emotion label.
x = df_depressive['Format_Content']
y = df_depressive['Emotion']

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset: same stratified 70/30 split and seed as the SVM section.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import metrics

In [None]:
# Pipeline: word-level TF-IDF features -> logistic regression (liblinear solver,
# balanced class weights).
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    lowercase=False,
    use_idf=True,
)
lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='liblinear', class_weight='balanced', random_state=0)),
])

# Search space: 2 * 4 * 2 * 2 * 2 * 3 = 192 combinations.
param_grid = [dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    vect__max_df=[0.95, 0.90, 0.85, 0.80],
    vect__min_df=[0.01, 0.005],
    vect__norm=['l1', 'l2'],
    clf__penalty=['l1', 'l2'],
    clf__C=[1.0, 10.0, 100.0],
)]

# 10-fold cross-validated grid search ranked by accuracy.
# NOTE(review): the RBF-SVM search in this notebook is ranked by roc_auc instead,
# so the CV scores of the two searches are not directly comparable.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the logistic-regression grid search on the training split only.
# The recorded run (1920 fits, 2 workers) took about 49 minutes.
gs_lr_tfidf.fit(X=x_train, y=y_train)

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 24.9min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 36.5min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 49.0min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        

In [None]:
# Report the winning hyper-parameters and their mean cross-validated accuracy.
print('Best parameter set: %s ' % (gs_lr_tfidf.best_params_,))
print('CV Accuracy: %.3f' % (gs_lr_tfidf.best_score_,))

Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'vect__max_df': 0.95, 'vect__min_df': 0.005, 'vect__ngram_range': (1, 2), 'vect__norm': 'l2'} 
CV Accuracy: 0.798


In [None]:
# Accuracy of the refit best logistic-regression pipeline on the held-out test split.
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % (clf.score(x_test, y_test),))

Test Accuracy: 0.804


In [None]:
# Pipeline: word-level TF-IDF features -> linear-kernel SVM with balanced class weights.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    lowercase=False,
    use_idf=True,
)
lrsvm_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', SVC(kernel='linear', class_weight='balanced', random_state=0)),
])

# Search space: max_df and min_df are pinned to single values, so only
# ngram_range, norm and C vary (2 * 2 * 3 = 12 combinations).
param_grid = [dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    vect__max_df=[0.95],
    vect__min_df=[0.005],
    vect__norm=['l1', 'l2'],
    clf__C=[1.0, 10.0, 100.0],
)]

# 10-fold cross-validated grid search ranked by accuracy.
gs_lrsvm_tfidf = GridSearchCV(
    estimator=lrsvm_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the linear-SVM grid search on the training split only.
# The recorded run (120 fits, 2 workers) took about 39 minutes.
gs_lrsvm_tfidf.fit(X=x_train, y=y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 38.8min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        

In [None]:
# Report the winning hyper-parameters and their mean cross-validated accuracy.
print('Best parameter set: %s ' % (gs_lrsvm_tfidf.best_params_,))
print('CV Accuracy: %.3f' % (gs_lrsvm_tfidf.best_score_,))

Best parameter set: {'clf__C': 1.0, 'vect__max_df': 0.95, 'vect__min_df': 0.005, 'vect__ngram_range': (1, 2), 'vect__norm': 'l1'} 
CV Accuracy: 0.797


In [None]:
# Accuracy of the refit best linear-SVM pipeline on the held-out test split.
clf = gs_lrsvm_tfidf.best_estimator_
print('Test Accuracy: %.3f' % (clf.score(x_test, y_test),))

Test Accuracy: 0.804


In [None]:
# Pipeline: word-level TF-IDF features -> multinomial naive Bayes.
# MultinomialNB() is used with its defaults; unlike the other classifiers in this
# notebook, no class_weight is set here.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    lowercase=False,
    use_idf=True,
)
nbM_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', MultinomialNB()),
])

# Search space: only ngram_range and norm vary (2 * 2 = 4 combinations).
param_grid = [dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    vect__max_df=[0.95],
    vect__min_df=[0.005],
    vect__norm=['l1', 'l2'],
)]

# 10-fold cross-validated grid search ranked by accuracy.
gs_nbM_tfidf = GridSearchCV(
    estimator=nbM_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the naive-Bayes grid search on the training split only.
# The recorded run (40 fits, 2 workers) took about one minute.
gs_nbM_tfidf.fit(X=x_train, y=y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   54.3s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   59.1s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        

In [None]:
# Report the winning hyper-parameters and their mean cross-validated accuracy.
print('Best parameter set: %s ' % (gs_nbM_tfidf.best_params_,))
print('CV Accuracy: %.3f' % (gs_nbM_tfidf.best_score_,))

Best parameter set: {'vect__max_df': 0.95, 'vect__min_df': 0.005, 'vect__ngram_range': (1, 2), 'vect__norm': 'l2'} 
CV Accuracy: 0.793


In [None]:
# Accuracy of the refit best naive-Bayes pipeline on the held-out test split.
clf = gs_nbM_tfidf.best_estimator_
print('Test Accuracy: %.3f' % (clf.score(x_test, y_test),))

Test Accuracy: 0.787


#加上時間特徵和發文字數

In [None]:
import json
from pandas.io.json import json_normalize
import numpy as np
import pandas as pd

In [None]:
# Load the JSON export that carries the extra time / word-count columns
# (data file location is built from data_path).
# The context manager closes the file handle as soon as parsing is finished;
# the original left the handle open for the lifetime of the kernel.
with open(data_path + 'Data/' + 'df_SpecificFeature.json', 'r', encoding='utf-8') as input_file:
    data = json.load(input_file)
df = json_normalize(data)  # json -> dataframe
df.head(2)

In [None]:
# Keep the label columns, the time-related columns, the word count and the text.
df = df.loc[:, [
    'Depressive', 'Emotion',
    'Format_Time', 'Hour', 'Week', 'Month', 'Season',
    'WordCount', 'Format_Content',
]]
df.head()

In [None]:
# Keep rows with Depressive == 1 and a non-missing Emotion label, then renumber
# the index from 0 so it is contiguous again.
myfilter = df['Depressive'].eq(1) & df['Emotion'].notna()
df = df.loc[myfilter].reset_index(drop=True)
df.head(2)

In [None]:
# Emotion label distribution after filtering.
df.loc[:, 'Emotion'].value_counts()

0.0    8859
1.0    2910
Name: Emotion, dtype: int64

In [None]:
# Add time-cycle columns.
# (math / datetime imports are kept in case other cells rely on them.)
import math
import datetime

# Format_Time holds epoch milliseconds; convert it to pandas timestamps.
df['Format_Time'] = pd.to_datetime(df['Format_Time'], unit='ms')

# Calendar components via the vectorised .dt accessor instead of formatting every
# row with strftime and parsing the string back to an integer.
stamp = df['Format_Time'].dt
df['pickup_year'] = stamp.year.astype('int64')
df['pickup_month'] = stamp.month.astype('int64')
df['pickup_day'] = stamp.day.astype('int64')
df['pickup_hour'] = stamp.hour.astype('int64')
# Minute and second are only needed for day_cycle, so they stay local
# rather than being added as columns and dropped again.
minute = stamp.minute.astype('int64')
second = stamp.second.astype('int64')

# Time of day as a fraction of 12 h, mapped through sin(pi * x): one period per 24 h.
df['day_cycle'] = np.sin(np.pi * (df['pickup_hour'] / 12 + minute / 720 + second / 43200))
# Month/day as a fraction of 6 months, mapped through cos(pi * x): one period per 12 months.
df['year_cycle'] = np.cos(np.pi * (df['pickup_month'] / 6 + df['pickup_day'] / 180))
# Week value plus hour as a fraction of 3.5 days, mapped through sin(pi * x).
# NOTE(review): assumes Week is a numeric day-of-week code; confirm against the data export.
df['week_cycle'] = np.sin(np.pi * (df['Week'] / 3.5 + df['pickup_hour'] / 84))

# Drop the raw time columns that the derived features replace. Reassigning
# (instead of inplace=True) leaves the resulting frame with the same columns.
df = df.drop(columns=['Format_Time', 'Hour', 'Month'])
df.head(2)

In [None]:
# Target: the Emotion label; features: every remaining column except the two labels.
y = df['Emotion']
x = df.drop(columns=['Depressive', 'Emotion'])

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset: stratified 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
# Names of the non-text feature columns: everything except the text column
# and the two label columns, in their original order.
NumCol = df.columns.drop(['Format_Content', 'Depressive', 'Emotion'])
print(NumCol)

Index(['Week', 'Season', 'WordCount', 'pickup_year', 'pickup_month',
       'pickup_day', 'pickup_hour', 'day_cycle', 'year_cycle', 'week_cycle'],
      dtype='object')


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics

In [None]:
# TF-IDF settings are fixed to the values the text-only RBF-SVM grid search
# reported as best (ngram (1, 1), max_df 0.98, min_df 0.0025, l2 norm).
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    lowercase=False,
    use_idf=True,
    ngram_range=(1, 1),
    max_df=0.98,
    min_df=0.0025,
    norm='l2',
)
scaler = MinMaxScaler()

# Route the text column through TF-IDF and the columns in NumCol through min-max scaling.
preprocessor = ColumnTransformer(transformers=[
    ('text', tfidf, 'Format_Content'),
    ('num', scaler, NumCol),
])
rbfsvm_tfidf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', SVC(kernel='rbf', class_weight='balanced', random_state=0)),
])

# Only the SVM regularisation strength C is searched here.
param_grid = [dict(clf__C=[1.0, 10.0, 100.0])]

# 10-fold cross-validated grid search ranked by ROC AUC.
gs_rbfsvm_tfidf = GridSearchCV(
    estimator=rbfsvm_tfidf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the training split only.
# The recorded run (30 fits, 2 workers) took about 6 minutes.
gs_rbfsvm_tfidf.fit(X=x_train, y=y_train)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.0min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('text',
                                                                         TfidfVectorizer(analyzer='word',
                                                                                         binary=False,
                                                                                         decode_error='strict',
                                                                                         dtype=<class 'numpy.float64'>,
                             

In [None]:
# Report the winning hyper-parameters and their mean cross-validated ROC AUC.
print('Best parameter set: %s ' % (gs_rbfsvm_tfidf.best_params_,))
print('CV AUC: %.3f' % (gs_rbfsvm_tfidf.best_score_,))

Best parameter set: {'clf__C': 1.0} 
CV AUC: 0.922


In [None]:
# Evaluate the refit best pipeline on the held-out test split.
clf = gs_rbfsvm_tfidf.best_estimator_
# Pipeline.score() delegates to the classifier's score(), i.e. mean accuracy,
# so the value previously printed under the "Test AUC" label was accuracy.
# ROC AUC needs continuous scores, so it is computed from the SVM decision values.
test_auc = metrics.roc_auc_score(y_test, clf.decision_function(x_test))
print('Test AUC: %.3f' % test_auc)

Test AUC: 0.889


In [None]:
# Hard class predictions on the test split, summarised as a confusion matrix
# (rows: true labels, columns: predicted labels).
pred = clf.predict(x_test)
print("confusion matrix:")
print(metrics.confusion_matrix(y_true=y_test, y_pred=pred))

confusion matrix:
[[2452  206]
 [ 185  688]]
