In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import json
from collections import Counter

import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV, PredefinedSplit,ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import ndcg_score, classification_report, f1_score
from tqdm import tqdm

In [2]:
# data_dir = '2021-07-19'

# method = 'scibert'
# # method = 'tfidf'
# # method = 'glove'

# # time_window = 1
# time_window = 6

# contain_zero = False
# # contain_zero = True

In [3]:
def classifier(model):
    """Return the estimator class and hyper-parameter grid for a named model.

    Parameters
    ----------
    model : str
        One of 'SVM', 'Logistic', 'DecisionTree', 'RandomForest',
        'GaussianNB' or 'MLP'.

    Returns
    -------
    tuple
        ``(estimator_class, param_grid)``. The class is returned
        uninstantiated. ``param_grid`` is a dict (a one-element list of
        dicts for 'Logistic') mapping constructor argument names to the
        candidate values to try.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. Previously the
        function fell through and returned ``None``, which only surfaced
        later as an unpacking error at the call site.
    """
    if model == 'SVM':
        param_grid = {'kernel':['linear', 'poly', 'rbf', 'sigmoid',],
                      'tol':[1e-1, 1e-2,1e-3, 1e-4, 1e-5],
                      'C': [0.1, 1, 10, 100],
                      'gamma': [1, 0.1, 0.01, 0.001, 0.0001,'scale', 'auto']}
        return SVC, param_grid
    if model == 'Logistic':
        param_grid = [{'penalty':['l1', 'l2'],
                      'solver':['liblinear', 'saga'],
                      'tol':[1e-1, 1e-2,1e-3, 1e-4, 1e-5],
                      'C': [0.1, 1, 10, 100]},]
        return LogisticRegression, param_grid
    if model == 'DecisionTree':
        param_grid = {
            'criterion':['gini','entropy'],
            'splitter':['best','random'],
            'max_depth':[2,3,4,5,6,7,8,9,10],
            'min_samples_split':[2,3,4,5,6,7,8,9,10],
        }
        return DecisionTreeClassifier, param_grid
    if model == 'RandomForest':
        # 'min_samples_split' used to be listed twice with the same values;
        # a dict literal keeps only the last occurrence, so dropping the
        # duplicate leaves the grid unchanged.
        param_grid = {
            'n_estimators':[50,100,150,200,250,300],
            'min_samples_split':[2,3,4,5,6,7,8,9,10],
            'criterion':['gini','entropy'],
        }
        return RandomForestClassifier, param_grid
    if model == 'GaussianNB':
        param_grid = {
            'var_smoothing':[1e-9,1e-8,1e-7,1e-6,1e-5],
        }
        return GaussianNB, param_grid
    if model == 'MLP':
        param_grid = {
            'hidden_layer_sizes':[4,8,16,32,64],
            'activation':['identity', 'logistic', 'tanh', 'relu'],
            'alpha':[1e-5,1e-4,1e-3,1e-2],
            'learning_rate_init':[1e-3,1e-2,1e-1],
            'early_stopping':[True],
        }
        return MLPClassifier, param_grid
    raise ValueError(
        f"Unknown model name {model!r}; expected one of 'SVM', 'Logistic', "
        "'DecisionTree', 'RandomForest', 'GaussianNB', 'MLP'."
    )

In [4]:
def train_(data_dir,method,time_window,contain_zero,class_threshold,model_name,fw,subsampling = False):
    """Grid-search one classifier for a single experiment setting.

    Reads the tab-separated sample file for ``data_dir`` / ``method`` /
    ``time_window``, labels each row as positive when ``count`` exceeds
    ``class_threshold``, fits every hyper-parameter combination returned by
    ``classifier(model_name)`` on the training months, and keeps the
    combination with the highest F1 on the test month.

    Parameters
    ----------
    data_dir : str
        Date tag used in the input file name.
    method : str
        Embedding method tag used in the input file name.
    time_window : int
        Window tag used in the input file name; also determines which months
        form the train and test splits.
    contain_zero : bool
        When false, rows whose ``count`` is 0 are dropped before splitting.
    class_threshold : int
        A row is labelled 1 when ``count > class_threshold``, else 0.
    model_name : str
        Name understood by ``classifier``.
    fw : object
        Unused; kept so existing callers that pass it keep working.
    subsampling : bool, default False
        When true, the larger training class is randomly down-sampled to the
        size of the smaller one (seeded, without replacement).

    Returns
    -------
    dict
        The experiment settings plus a ``'results'`` entry: the
        ``classification_report`` dict of the best configuration on the test
        split, or ``None`` when the training split has fewer than two classes.

    Notes
    -----
    NOTE(review): hyper-parameters are selected by F1 on the same split the
    final report is computed on, so the reported metrics are optimistic.
    """
    samples = pd.read_csv(
        f'指标初步探索/valid_samples_{data_dir}_{method}_timewindow_{time_window}.csv',
        sep='\t',
    )
    if not contain_zero:
        samples = samples.loc[samples['count']>0]

    all_year_month = ['2020-01','2020-02','2020-03','2020-04','2020-05','2020-06','2020-07','2020-08','2020-09','2020-10',
                     '2020-11','2020-12','2021-01','2021-02','2021-03']
    # Test on the single month that sits `time_window` months before the end
    # of the list; train on the months before it, starting from the second
    # entry. NOTE(review): '2020-01' is never used for training - confirm
    # this is intended.
    year_months_train = all_year_month[1:-time_window-1]
    year_months_test = all_year_month[-time_window-1:-time_window]

    # Copy the slices so that adding the label column does not write into a
    # view of `samples`.
    train_df = samples.loc[samples['publish_year_month'].isin(year_months_train)].copy()
    test_df = samples.loc[samples['publish_year_month'].isin(year_months_test)].copy()
    train_df['clf_label'] = (train_df['count']>class_threshold).astype(int)
    test_df['clf_label'] = (test_df['count']>class_threshold).astype(int)

    exp_setting = {
        'method':method,
        'time_window':time_window,
        'contain_zero':contain_zero,
        'class_threshold':class_threshold,
        'model_name':model_name
    }

    # A classifier cannot be trained on fewer than two classes; this also
    # covers an empty training split.
    if train_df['clf_label'].nunique() < 2:
        exp_setting['results'] = None
        return exp_setting

    if subsampling:
        negatives = train_df[train_df['clf_label'] == 0]
        positives = train_df[train_df['clf_label'] == 1]
        # Down-sample whichever class is larger. The original always sampled
        # class 0 down to the size of class 1, which raises when class 1 is
        # the larger one (sampling without replacement cannot draw more rows
        # than exist). Behaviour is unchanged when class 0 is at least as
        # large as class 1.
        if len(negatives) >= len(positives):
            negatives = negatives.sample(n=len(positives),replace=False,random_state=0,axis=0)
        else:
            positives = positives.sample(n=len(negatives),replace=False,random_state=0,axis=0)
        train_df = pd.concat([negatives, positives],axis=0).sample(frac=1.,replace=False,random_state=0,axis=0)

    # NOTE(review): the same column is selected twice, so both features are
    # identical. Kept as-is because changing the feature count would change
    # the fitted models - confirm whether a second, different feature was
    # intended.
    feature_cols = ['min_cos_distance','min_cos_distance']
    X_train, X_test = train_df[feature_cols].values, test_df[feature_cols].values
    Y_train, Y_test = train_df[['clf_label']].values.flatten(), test_df[['clf_label']].values.flatten()

    model, params_grid = classifier(model_name)
    # GaussianNB is the one model here that is built without a random_state.
    fixed_kwargs = {} if model_name == 'GaussianNB' else {'random_state': 0}

    best_score = 0
    best_pred = None
    for params in tqdm(ParameterGrid(params_grid)):
        clf = model(**params, **fixed_kwargs)
        clf.fit(X_train, Y_train)
        pred = clf.predict(X_test)
        score = f1_score(Y_test, pred)
        # `>=` keeps the last of several equally good configurations.
        if score>=best_score:
            best_score = score
            best_pred = pred

    # The original refitted the winning configuration with the same seed;
    # reusing its predictions avoids that extra fit. The report is always
    # returned as a dict (the GaussianNB branch used to return the text
    # form, which the metric-unpacking code cannot index).
    exp_setting['results'] = classification_report(Y_test,best_pred,digits=4,output_dict=True)
    return exp_setting

In [5]:
# Run train_ for every combination of the settings below and store each
# returned record under a running integer key.
results = {}
global_index = 0

model_names = ['RandomForest']  # also available: 'GaussianNB','Logistic','MLP','DecisionTree','SVM'
data_dirs = ['2021-07-19']
methods = ['scibert','tfidf','glove']
time_windows = [1,6]
contain_zero_options = [False,True]
class_thresholds = [0,1,2,3]

for model_name in model_names:
    for data_dir in data_dirs:
        for method in methods:
            for time_window in time_windows:
                for contain_zero in contain_zero_options:
                    for class_threshold in class_thresholds:
                        results[global_index] = train_(
                            data_dir,
                            method,
                            time_window,
                            contain_zero,
                            class_threshold,
                            model_name,
                            fw=None,
                            subsampling=True,
                        )
                        global_index += 1

100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:22<00:00,  4.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:18<00:00,  5.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:18<00:00,  5.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:52<00:00,  2.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:25<00:00,  4.22it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:22<00:00,  4.84it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:21<00:00,  5.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:53<00:00,  2.02it/s]
100%|███████████████████████████████████

In [6]:
# `results` maps a run index to one settings/report record; transposing puts
# one run on each row and one record field in each column.
results_df = pd.DataFrame(results).transpose()

In [7]:
# Create the flat metric columns as floats; they are filled in per row below.
for metric_col in ('accuracy', 'macro_avg', 'precision', 'recall', 'f1'):
    results_df[metric_col] = 0.

In [8]:
# Unpack each run's classification report into the flat metric columns.
# The per-row report is bound to `report`, not `results`: `results` is the
# dict of run records that `results_df` is built from, and rebinding it here
# would make re-running the cell that builds `results_df` produce a wrong frame.
for row_idx, row_content in results_df.iterrows():
    report = row_content['results']

    if report is None:
        # -1 marks runs that produced no report.
        accuracy, macro_avg, precision, recall, f1 = -1,-1,-1,-1,-1
    else:
        positive_class = report['1']
        accuracy = report['accuracy']
        macro_avg = report['macro avg']['f1-score']
        precision = positive_class['precision']
        recall = positive_class['recall']
        f1 = positive_class['f1-score']

    results_df.at[row_idx,'accuracy'] = accuracy
    results_df.at[row_idx,'macro_avg'] = macro_avg
    results_df.at[row_idx,'precision'] = precision
    results_df.at[row_idx,'recall'] = recall
    results_df.at[row_idx,'f1'] = f1

In [10]:
# Write the results table to disk without the index column.
# NOTE(review): `data_dir` is not defined in this cell; presumably it is the
# value left bound by the experiment loop - confirm before running in isolation.
output_path = f'final_results_{data_dir}.csv'
results_df.to_csv(output_path, index=False)

In [24]:
# For every metric, list the settings in which scibert scores strictly higher
# than both tfidf and glove.
for metric in ['accuracy', 'macro_avg', 'precision', 'recall', 'f1']:
    print(f"*************************{metric}************************")
    for time_window in [1,6]:
        for contain_zero in [False,True]:
            for class_threshold in [0,1,2,3]:
                same_setting = (
                    (results_df['time_window']==time_window)
                    & (results_df['contain_zero']==contain_zero)
                    & (results_df['class_threshold']==class_threshold)
                )
                setting_rows = results_df.loc[same_setting]
                # First matching row per method, read in the order
                # scibert, tfidf, glove.
                scibert, tfidf, glove = (
                    setting_rows.loc[setting_rows['method']==method_name][metric].values[0]
                    for method_name in ('scibert', 'tfidf', 'glove')
                )
                if all(scibert > other for other in (tfidf, glove)):
                    print(f"time_window:{time_window}, contain_zero:{contain_zero}, class_threshold:{class_threshold}")

*************************accuracy************************
*************************macro_avg************************
time_window:6, contain_zero:False, class_threshold:1
*************************precision************************
time_window:6, contain_zero:False, class_threshold:1
time_window:6, contain_zero:False, class_threshold:2
time_window:6, contain_zero:False, class_threshold:3
time_window:6, contain_zero:True, class_threshold:1
*************************recall************************
time_window:1, contain_zero:False, class_threshold:2
time_window:1, contain_zero:True, class_threshold:0
time_window:6, contain_zero:False, class_threshold:1
time_window:6, contain_zero:False, class_threshold:2
time_window:6, contain_zero:False, class_threshold:3
time_window:6, contain_zero:True, class_threshold:0
time_window:6, contain_zero:True, class_threshold:1
time_window:6, contain_zero:True, class_threshold:3
*************************f1************************
time_window:6, contain_zero:Fals