In [3]:
import pandas as pd
import numpy as np
import os 
import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

## Preparation

In [4]:
# Directory (relative to the notebook's working directory) holding the feature
# CSVs and test_node.txt that the cells below read.
data_folder = '../dataset'

In [5]:
def split_by_test_id(df, test_ids):
    """Split a feature table into train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table that contains an ``education_type`` label column.
    test_ids : sequence of int
        Zero-based row *positions* of ``df`` that make up the test set.

    Returns
    -------
    train_x, train_y, test_x, test_y : pandas.DataFrame
        ``*_x`` hold every column except ``education_type``; ``*_y`` are
        single-column frames holding only ``education_type``. Test rows keep
        the order given by ``test_ids``; train rows keep their order in ``df``.

    Raises
    ------
    IndexError
        If any entry of ``test_ids`` is out of range for ``df``.
    """
    test_pos = np.asarray(list(test_ids), dtype=int)

    # Use positions for BOTH halves. Selecting the test rows with ``iloc`` but
    # removing them with the label-based ``drop`` only agrees when ``df`` has a
    # default RangeIndex; with any other index it raises or leaks test rows.
    is_test = np.zeros(len(df), dtype=bool)
    is_test[test_pos] = True

    test_set = df.iloc[test_pos]
    train_set = df.iloc[~is_test]

    test_y = test_set[['education_type']]
    test_x = test_set.drop('education_type', axis=1)

    train_y = train_set[['education_type']]
    train_x = train_set.drop('education_type', axis=1)

    return train_x, train_y, test_x, test_y

## Global Info

In [6]:
# Load the feature table used for the "global info" experiments.
global_df = pd.read_csv(os.path.join(data_folder, 'feature_df_2.csv'))

# test_node.txt lists one integer id per line. Blank lines (e.g. a trailing
# newline at the end of the file) are skipped so int() never sees an empty string.
# NOTE(review): the ids are used as row positions of the feature table -- confirm.
with open(os.path.join(data_folder, 'test_node.txt')) as f:
    test_ids = [int(line) for line in f if line.strip()]

train_x, train_y, test_x, test_y = split_by_test_id(global_df, test_ids)

In [8]:
# Linear Model -- Softmax

# train_y is a single-column DataFrame; scikit-learn expects a 1-D label vector.
train_labels = train_y.values.ravel()

# 'saga' is a stochastic solver, so it is seeded to make the scores repeatable.
lr = LogisticRegression(penalty='l2', solver='saga', multi_class='multinomial',
                        random_state=0)
lr.fit(train_x, train_labels)
y_pred = lr.predict(test_x)
accu = accuracy_score(test_y, y_pred)
print('train_test:', accu)

# cross_val_score clones the estimator for every fold, so the fitted model
# above can be passed directly without constructing a second one.
cv_val = cross_val_score(lr, train_x, train_labels, cv=5)
print('cv:', cv_val)
print('cv:', np.average(cv_val))

train_test: 0.8761904761904762
[0.84731774 0.87052342 0.87327824 0.85931034 0.83287293]
0.8566605340102003


In [10]:
# XGBoost Classifier

# train_y is a single-column DataFrame; pass the labels as a 1-D vector.
train_labels = train_y.values.ravel()

xgb_clf = XGBClassifier(max_depth=3,
                        n_estimators=1000,
                        objective='multi:softprob',
                        seed=0,
                        silent=True,
                        learning_rate=0.001)
xgb_clf.fit(train_x, train_labels)
y_pred = xgb_clf.predict(test_x)
accu = accuracy_score(test_y, y_pred)
print('train_test:', accu)

# cross_val_score clones and refits the estimator on each fold, so neither a
# second estimator nor another full fit on the training set is needed here.
cv_val = cross_val_score(xgb_clf, train_x, train_labels, cv=5)
print('cv:', cv_val)
print('cv:', np.average(cv_val))

train_test: 0.8833333333333333
cv: [0.80055021 0.86501377 0.87878788 0.8662069  0.84116022]
cv: 0.8503437953532268


## Global with local Info

In [6]:
# Load the feature table used for the "global with local info" experiments.
global_local_df = pd.read_csv(os.path.join(data_folder, 'feature_df_3.csv'))

# test_node.txt lists one integer id per line. Blank lines (e.g. a trailing
# newline at the end of the file) are skipped so int() never sees an empty string.
# NOTE(review): the ids are used as row positions of the feature table -- confirm.
with open(os.path.join(data_folder, 'test_node.txt')) as f:
    test_ids = [int(line) for line in f if line.strip()]

train_x, train_y, test_x, test_y = split_by_test_id(global_local_df, test_ids)

In [7]:
# Linear Model -- Softmax

# train_y is a single-column DataFrame; scikit-learn expects a 1-D label vector.
train_labels = train_y.values.ravel()

# 'saga' is a stochastic solver, so it is seeded to make the scores repeatable.
lr = LogisticRegression(penalty='l2', solver='saga', multi_class='multinomial',
                        random_state=0)
lr.fit(train_x, train_labels)
y_pred = lr.predict(test_x)
accu = accuracy_score(test_y, y_pred)
print('train_test:', accu)

# cross_val_score clones the estimator for every fold, so the fitted model
# above can be passed directly without constructing a second one.
cv_val = cross_val_score(lr, train_x, train_labels, cv=5)
print('cv:', cv_val)
print('cv:', np.average(cv_val))

train_test: 0.8761904761904762
cv: [0.84731774 0.87052342 0.87327824 0.85931034 0.83287293]
cv: 0.8566605340102003


In [8]:
# XGBoost Classifier

# train_y is a single-column DataFrame; pass the labels as a 1-D vector.
train_labels = train_y.values.ravel()

xgb_clf = XGBClassifier(max_depth=3,
                        n_estimators=1000,
                        objective='multi:softprob',
                        seed=0,
                        silent=True,
                        learning_rate=0.001)
xgb_clf.fit(train_x, train_labels)
y_pred = xgb_clf.predict(test_x)
accu = accuracy_score(test_y, y_pred)
print('train_test:', accu)

# cross_val_score clones and refits the estimator on each fold, so neither a
# second estimator nor another full fit on the training set is needed here.
cv_val = cross_val_score(xgb_clf, train_x, train_labels, cv=5)
print('cv:', cv_val)
print('cv:', np.average(cv_val))

train_test: 0.8833333333333333
cv: [0.80192572 0.86501377 0.87878788 0.8662069  0.83701657]
cv: 0.8497901692351453


In [None]:
## This shows that the local info mainly introduces noise.

## Global with Shortest Path Mat

In [None]:
# test_node.txt lists one integer id per line. Blank lines (e.g. a trailing
# newline at the end of the file) are skipped so int() never sees an empty string.
with open(os.path.join(data_folder, 'test_node.txt')) as f:
    test_ids = [int(line) for line in f if line.strip()]

global_df = pd.read_csv(os.path.join(data_folder, 'feature_df_2.csv'))

shortest_paths = pd.read_csv(os.path.join(data_folder, 'shortest_path.csv'))

# The feature table and the shortest-path table are joined row by row, so a
# length mismatch must fail loudly instead of being silently padded with NaN.
# NOTE(review): assumes both files list the nodes in the same order -- confirm.
if len(shortest_paths) != len(global_df):
    raise ValueError(
        'shortest_path.csv has {} rows but feature_df_2.csv has {}'.format(
            len(shortest_paths), len(global_df)))

# The randomized SVD solver is stochastic; seed it so the components (and every
# score computed from them) are repeatable across runs.
pca = PCA(n_components=10, svd_solver='randomized', random_state=0)
sp_components = pca.fit_transform(shortest_paths)

# Give the components string column names: mixing the default integer names
# with global_df's string names is rejected by recent scikit-learn versions.
pca_spm = pd.DataFrame(
    sp_components,
    index=global_df.index,
    columns=['sp_pc_{}'.format(i) for i in range(sp_components.shape[1])],
)

global_sp_df = pd.concat([global_df, pca_spm], axis=1)

In [11]:
# Split the combined global + shortest-path feature table using the ids read
# from test_node.txt; the result overwrites train_x/train_y/test_x/test_y.
train_x, train_y, test_x, test_y = split_by_test_id(global_sp_df, test_ids)

In [12]:
# Linear Model -- Softmax

# train_y is a single-column DataFrame; scikit-learn expects a 1-D label vector.
train_labels = train_y.values.ravel()

# 'saga' is a stochastic solver, so it is seeded to make the scores repeatable.
lr = LogisticRegression(penalty='l2', solver='saga', multi_class='multinomial',
                        random_state=0)
lr.fit(train_x, train_labels)
y_pred = lr.predict(test_x)
accu = accuracy_score(test_y, y_pred)
print('train_test:', accu)

# cross_val_score clones the estimator for every fold, so the fitted model
# above can be passed directly without constructing a second one.
cv_val = cross_val_score(lr, train_x, train_labels, cv=5)
print('cv:', cv_val)
print('cv:', np.average(cv_val))

train_test: 0.8785714285714286
cv: [0.46217331 0.74793388 0.71349862 0.85793103 0.67265193]
cv: 0.6908377580129181


In [13]:
# XGBoost Classifier

# train_y is a single-column DataFrame; pass the labels as a 1-D vector.
train_labels = train_y.values.ravel()

xgb_clf = XGBClassifier(max_depth=3,
                        n_estimators=1000,
                        objective='multi:softprob',
                        seed=0,
                        silent=True,
                        learning_rate=0.001)
xgb_clf.fit(train_x, train_labels)
y_pred = xgb_clf.predict(test_x)
accu = accuracy_score(test_y, y_pred)
print('train_test:', accu)

# cross_val_score clones and refits the estimator on each fold, so neither a
# second estimator nor another full fit on the training set is needed here.
cv_val = cross_val_score(xgb_clf, train_x, train_labels, cv=5)
print('cv:', cv_val)
print('cv:', np.average(cv_val))

train_test: 0.8833333333333333
cv: [0.57083906 0.86501377 0.87878788 0.8662069  0.81629834]
cv: 0.7994291913269931
