In [2]:
import pandas as pd
import numpy as np
import os 
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides library warnings (deprecations, solver diagnostics) that can be
# useful when a model behaves unexpectedly.
warnings.filterwarnings('ignore')

In [3]:
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA

In [31]:
def split_by_test_id(df, test_ids):
    """Split ``df`` into train/test features and labels by row position.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table that contains an ``'education_type'`` label column.
    test_ids : sequence of int
        Positional (0-based) row numbers that make up the test set.

    Returns
    -------
    train_x, train_y, test_x, test_y : pandas.DataFrame
        ``*_x`` hold every column except ``'education_type'``; ``*_y`` are
        single-column frames holding only ``'education_type'``. Test rows keep
        the order given in ``test_ids``; train rows keep their original order.

    Raises
    ------
    IndexError
        If any entry of ``test_ids`` is outside the row range of ``df``.
    """
    test_positions = np.asarray(test_ids, dtype=np.intp)

    # Build the complement positionally as well. Dropping by *label* only
    # matches ``iloc`` when the index is the default 0..n-1 range.
    is_test = np.zeros(len(df), dtype=bool)
    is_test[test_positions] = True

    test_set = df.iloc[test_positions]
    test_y = test_set[['education_type']]
    test_x = test_set.drop('education_type', axis=1)

    train_set = df.iloc[~is_test]
    train_y = train_set[['education_type']]
    train_x = train_set.drop('education_type', axis=1)

    return train_x, train_y, test_x, test_y

## Load Data

In [5]:
# Directory (relative to the working directory) from which every input file
# in this notebook is read.
data_folder = '../dataset'

In [5]:
# Load the matrix stored in total_matrix.csv (presumably the graph adjacency
# matrix -- confirm against the dataset); each column is used as a PCA feature.
adj_matrix = pd.read_csv(os.path.join(data_folder, 'total_matrix.csv'))

# Fit PCA with every component kept so the full explained-variance spectrum is
# available. The randomized solver is stochastic, so the seed is pinned to make
# `ratio` (and the component count derived from it) repeatable across runs.
pca = PCA(svd_solver='randomized', random_state=0)  # n_components=min(n_samples, n_features)
pca.fit(adj_matrix)
ratio = pca.explained_variance_ratio_

In [17]:
# Smallest number of leading components whose cumulative explained-variance
# ratio strictly exceeds the threshold. Accumulating from the first component
# (instead of a hand-picked starting offset) keeps the count correct even when
# the threshold is reached early or there are only a few components.
VARIANCE_THRESHOLD = 0.9

n_count = 0
sum_variance = 0.0
for n_count, r in enumerate(ratio, start=1):
    sum_variance += r
    if sum_variance > VARIANCE_THRESHOLD:
        break
# If the threshold is never exceeded, n_count ends up as len(ratio) and
# sum_variance as the total of all ratios.
print(n_count)
print(sum_variance)

877
0.9001068341588159


In [18]:
# Project the matrix onto the number of components selected in the previous
# cell (n_count) rather than a literal copied from its printed output, so the
# two cells cannot drift apart. The seed makes the randomized solver repeatable.
pca = PCA(n_components=n_count, svd_solver='randomized', random_state=0)
pca_adj_mat = pca.fit_transform(adj_matrix)

# Wrap the projected array in a DataFrame (default integer row/column labels).
pca_am_df = pd.DataFrame(pca_adj_mat)

In [33]:
# Per-row feature table; only its 'education_type' column is used below, as
# the classification label.
feature_global = pd.read_csv(os.path.join(data_folder, 'feature_df_2.csv'))

In [43]:
# Attach the label column to the PCA features. pandas aligns the assigned
# Series on the row index, so this assumes feature_global rows are in the same
# order as the rows of the matrix -- TODO confirm.
pca_am_df['education_type'] = feature_global['education_type']
# Ad-hoc check: number of rows with label 0.
# len(pca_am_df[pca_am_df['education_type'] ==0])

1031

In [27]:
# Read the held-out row ids, one integer per line. Blank lines (e.g. a trailing
# empty line at the end of the file) are skipped instead of crashing int();
# int() itself tolerates the surrounding whitespace/newline.
with open(os.path.join(data_folder, 'test_node.txt')) as f:
    test_ids = [int(line) for line in f if line.strip()]

In [None]:
# Hold out the rows listed in test_ids; the remaining rows form the training
# pool used for model selection below.
train_x, train_y, test_x, test_y = split_by_test_id(pca_am_df, test_ids)

In [59]:
# Imported here because this cell runs before the notebook's model-import cell;
# without it a fresh "Restart & Run All" fails with NameError.
from sklearn.model_selection import train_test_split

# Carve a 20% validation split out of the training pool (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

## Baseline 

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [64]:
# Multinomial (softmax) logistic regression with L2 regularisation.
# The saga solver shuffles the data internally, so the seed is pinned to make
# the fitted coefficients -- and the accuracy reported below -- repeatable.
lr = LogisticRegression(penalty='l2', solver='saga', multi_class='multinomial', random_state=0)
lr.fit(X_train, y_train)

In [68]:
# Score the fitted logistic regression on the validation split.
y_pred = lr.predict(X_test)
accu = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accu)

0.571625344352617


In [74]:
# Mean accuracy over 5 cross-validation folds of the whole training pool.
cross_val_score(lr, train_x, train_y, cv=5).mean()

0.37082837279160646

### XGBoost

In [81]:
# Shallow, slowly-learning boosted trees; hyper-parameters kept in one mapping.
xgb_params = {
    'max_depth': 3,
    'n_estimators': 1000,
    'objective': 'multi:softprob',
    'seed': 0,
    'silent': True,
    'learning_rate': 0.001,
}
xgb_clf = XGBClassifier(**xgb_params)

# Fit on the training split and measure accuracy on the validation split.
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
accu = accuracy_score(y_true=y_test, y_pred=y_pred)

In [82]:
# Display the XGBoost validation accuracy computed in the previous cell.
accu

0.6253443526170799

In [83]:
# Same XGBoost configuration as above, evaluated with 5-fold cross-validation
# on the whole training pool; the cell displays the mean fold accuracy.
xgb_cv_params = {
    'max_depth': 3,
    'n_estimators': 1000,
    'objective': 'multi:softprob',
    'seed': 0,
    'silent': True,
    'learning_rate': 0.001,
}
xgb_clf = XGBClassifier(**xgb_cv_params)
cross_val_score(xgb_clf, train_x, train_y, cv=5).mean()

0.4962693889761476

# Shortest Path Matrix

In [40]:
# Load the matrix stored in shortest_path.csv (presumably pairwise
# shortest-path lengths between nodes -- confirm against the dataset).
shortest_paths = pd.read_csv(os.path.join(data_folder, 'shortest_path.csv'))

In [41]:
# Full-spectrum PCA on the shortest-path matrix to inspect explained variance.
# The randomized solver is stochastic; pin the seed so `ratio` is repeatable.
pca = PCA(svd_solver='randomized', random_state=0)  # n_components=min(n_samples, n_features)
pca.fit(shortest_paths)
ratio = pca.explained_variance_ratio_

In [42]:
# Share of total variance captured by the first five principal components.
ratio[:5].sum()

0.9770768841021783

In [43]:
# Reduce the shortest-path matrix to 10 principal components. The seed makes
# the randomized solver repeatable across runs.
pca = PCA(n_components=10, svd_solver='randomized', random_state=0)

# Build the DataFrame in one step so `pca_spm` never holds a bare ndarray.
pca_spm = pd.DataFrame(pca.fit_transform(shortest_paths))

# Attach the label column; pandas aligns on the row index, so this assumes
# feature_global rows follow the same order as shortest_paths -- TODO confirm.
pca_spm['education_type'] = feature_global['education_type']

In [44]:
# Re-split using the shortest-path features; this rebinds train_x/train_y/
# test_x/test_y that earlier cells filled from the first PCA feature table.
train_x, train_y, test_x, test_y = split_by_test_id(pca_spm, test_ids)

In [45]:
# Linear Model -- Softmax

# Train on the full training pool and score on the held-out test rows.
# saga shuffles the data internally, so random_state is pinned to make the
# reported accuracy repeatable.
lr = LogisticRegression(penalty='l2', solver='saga', multi_class='multinomial', random_state=0)
lr.fit(train_x, train_y)
y_pred = lr.predict(test_x)
accu = accuracy_score(test_y, y_pred)
print('train_test:', accu)

# Fresh, unfitted estimator for 5-fold cross-validation on the training pool.
# The first 'cv:' line shows the per-fold scores, the second their mean.
lr = LogisticRegression(penalty='l2', solver='saga', multi_class='multinomial', random_state=0)
cv_val = cross_val_score(lr, train_x, train_y, cv=5)
print('cv:', cv_val)
print('cv:', np.average(cv_val))

train_test: 0.6547619047619048
cv: [0.08390646 0.64049587 0.62534435 0.6262069  0.07596685]
cv: 0.410384086538095


In [46]:
# XGBoost Classifier

# One shared hyper-parameter set so the hold-out run and the CV run cannot
# silently diverge.
xgb_params = dict(max_depth=3,
                  n_estimators=1000,
                  objective='multi:softprob',
                  seed=0,
                  silent=True,
                  learning_rate=0.001)

# Train on the full training pool and score on the held-out test rows.
xgb_clf = XGBClassifier(**xgb_params)
xgb_clf.fit(train_x, train_y)
y_pred = xgb_clf.predict(test_x)
accu = accuracy_score(test_y, y_pred)
print('train_test:', accu)

# 5-fold cross-validation on the training pool. cross_val_score clones and
# fits the estimator for every fold, so fitting it beforehand only wastes a
# full 1000-tree training run; the estimator is passed unfitted.
# The first 'cv:' line shows the per-fold scores, the second their mean.
xgb_clf = XGBClassifier(**xgb_params)
cv_val = cross_val_score(xgb_clf, train_x, train_y, cv=5)
print('cv:', cv_val)
print('cv:', np.average(cv_val))

train_test: 0.6642857142857143
cv: [0.38927098 0.62396694 0.62534435 0.63034483 0.3480663 ]
cv: 0.5233986794621639
