In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.cross_decomposition import CCA
from sklearn.manifold import spectral_embedding

In [2]:
# Load the raw inputs. The three feature files are comma-separated numeric
# text read into NumPy arrays (in the order listed); the partial-supervision
# labels are read into a DataFrame.
graph_feat, description_feat, social_feat = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        'release_1/graph.csv',
        'release_1/description.csv',
        'release_1/social_and_evolution.csv',
    )
)

semi_sup = pd.read_csv('release_1/success_partial_supervision.csv')

# Show the shape of each feature array as a quick sanity check.
tuple(feat.shape for feat in (graph_feat, description_feat, social_feat))

((368586, 3), (1416322, 3), (1829, 200))

In [3]:
# Display the partial-supervision table loaded above; a later cell uses it to
# decide whether the predicted labels have to be flipped.
semi_sup

Unnamed: 0,index,is_successful
0,12,1
1,1419,1
2,865,1
3,146,0
4,1653,0
5,1176,0


In [4]:
# convert sparse matrices
def _triplets_to_csr(triplets, shape):
    """Build a CSR matrix of the given shape from an array whose three columns
    are (row index, column index, value)."""
    row_idx, col_idx, values = triplets[:, 0], triplets[:, 1], triplets[:, 2]
    return sparse.csr_matrix((values, (row_idx, col_idx)), shape=shape)


# Both names are rebound from the dense triplet arrays to sparse matrices, so
# this cell expects the freshly loaded arrays as its input.
description_feat = _triplets_to_csr(description_feat, shape=(1829, 8000))
graph_feat = _triplets_to_csr(graph_feat, shape=(1829, 146983197))

In [5]:
# Get the adjacency matrix between any two projects: the sparse product of
# graph_feat with its own transpose. `.dot` is the explicit matrix product
# (for a csr_matrix it is the same operation as `*`).
adj = graph_feat.dot(graph_feat.T)
# Binarize the stored entries (1 where the product is positive, else 0).
# The builtin `int` is used because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24; it always meant the builtin int.
adj.data = (adj.data > 0).astype(int)
adj.shape

(1829, 1829)

In [6]:
# Compute the spectral embedding of the project adjacency matrix.
lapl_embed = spectral_embedding(
    adj,
    n_components=1200,  # number of embedding dimensions to keep
    random_state=0,     # fixed seed
    eigen_tol=1e-8,     # tolerance handed to the eigen-decomposition
)
lapl_embed.shape

(1829, 1200)

In [7]:
# Canonical correlation analysis between the Laplacian-embedding features (X)
# and the social features (Y): two components, with scale=False.
cca = CCA(n_components=2, scale=False)
cca = cca.fit(lapl_embed, social_feat)  # fit() returns the fitted estimator

In [8]:
# Use the first dimension of CCA to predict: a project is labelled 1 when its
# score on the first X-side canonical component is positive, otherwise 0.
# The training scores are obtained with transform() on the training data,
# because the `x_scores_` attribute of CCA was deprecated in scikit-learn 0.24
# and removed in 1.1; transform() is the documented replacement and is also
# available on older releases.
pred = (cca.transform(lapl_embed)[:, 0] > 0).astype(int)

In [9]:
# Flip the labels based on semi-supervision: thresholding the CCA score gives
# a 0/1 split whose orientation is not tied to the real labels, so the few
# known labels decide which side is "successful".
# Columns are selected by name instead of by position in `as_matrix()`:
# DataFrame.as_matrix() was removed in pandas 1.0, and name-based access stays
# correct even if the CSV carries an extra leading (unnamed) column.
sup_rows = semi_sup['index'].values            # project rows with a known label
sup_labels = semi_sup['is_successful'].values  # the known 0/1 labels
if (pred[sup_rows] == sup_labels).mean() < 0.5:
    # Fewer than half of the known labels agree, so invert every prediction.
    pred = 1 - pred

In [10]:
# Write the predictions to disk: one row per project, with the row number in
# 'index' and the 0/1 prediction in 'is_successful'.
pred_df = pd.DataFrame(
    {
        'index': range(1829),
        'is_successful': pred.astype(int),
    }
)
pred_df.to_csv('predictions.csv', index=False, sep=',')

# Evaluation

In [11]:
# Load the reference labels; the following cells read its 'Usage' and
# 'is_successful' columns.
labels = pd.read_csv('true_labels.csv')

In [12]:
# Boolean row masks (despite the *_indices names), one per value of the
# 'Usage' column.
public_indices = labels['Usage'].eq('Public')
private_indices = labels['Usage'].eq('Private')

In [13]:
# Accuracy on each split: the fraction of rows whose prediction equals the
# reference label. The comparison is row-by-row, so it presumes `labels` is in
# the same row order as `pred_df` -- TODO confirm against true_labels.csv.
is_correct = pred_df['is_successful'] == labels['is_successful']
print('Public score:', is_correct[public_indices].mean())
print('Private score:', is_correct[private_indices].mean())

Public score: 0.747264770241
Private score: 0.697267759563
