In [1]:
import os

import numpy as np
from pathlib import Path
import math
import dask.dataframe as dd
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier

# Display option for long string cells (the hex ids shown in result tables).
# NOTE(review): 0 is presumably meant to disable truncation; pandas documents
# `None` as the "unlimited" value -- confirm before changing.
pd.set_option('display.max_colwidth', 0)

# Root folder holding the parquet inputs and outputs of this notebook.
# pathlib never expands `~` by itself, so resolve it here; otherwise the path
# is only valid for consumers that happen to expand the home directory
# themselves (os/pathlib calls such as `.exists()` would look for a literal
# "~" folder in the working directory).
data_dir = Path('~/recsys2020').expanduser()

In [2]:
# Feature columns are named e000 ... e767 (768 columns in total).
train_cols = [f'e{idx:03d}' for idx in range(768)]

# Engagement types; each one has a matching `has_<type>` label column.
classes = ['retweet', 'reply', 'like', 'retweet_with_comment']
target_cols = [f'has_{name}' for name in classes]

# Features first, labels last.
all_cols = [*train_cols, *target_cols]


In [4]:
# Work on a reproducible 5% sample of the training set (fixed random_state).
df = (
    pd.read_parquet(str(data_dir / 'training_set.parquet'))
    .sample(frac=0.05, random_state=42)
)

# A row counts as "pos" when at least one of the four engagement flags is set.
num_pos = df[target_cols].any(axis=1).sum()
print(f"Training on {num_pos} pos and {len(df)-num_pos} neg samples")

print('Fitting classifier')
# One 5-nearest-neighbour classifier per target column, wrapped so that all
# four targets are fitted and predicted through a single estimator object.
knn_c = MultiOutputClassifier(
    KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    n_jobs=-1,
)
# Kept as the last expression so the fitted estimator's repr is displayed.
knn_c.fit(df[train_cols].to_numpy(), df[target_cols].to_numpy())

Training on 28699 pos and 10561 neg samples
Fitting classifier


MultiOutputClassifier(estimator=KNeighborsClassifier(n_jobs=-1), n_jobs=-1)

In [5]:
# Evaluate on a reproducible 10% sample of the test set (fixed random_state).
df_test = (
    pd.read_parquet(str(data_dir / 'test_set.parquet'))
    .sample(frac=0.1, random_state=42)
)

# Same "pos" definition as for training: any engagement flag set on the row.
num_pos_test = df_test[target_cols].any(axis=1).sum()
print(f"Testing on {num_pos_test} pos and {len(df_test) - num_pos_test} neg samples")

# Name of the dataset being scored; not read anywhere inside this cell.
ds = 'test'

Testing on 6347 pos and 3819 neg samples


In [8]:
# Class probabilities for the sampled test rows. The result is consumed below
# as a sequence holding one 2-D probability array per target column.
result = knn_c.predict_proba(
    df_test.loc[:, train_cols].to_numpy()
)


In [9]:
# Class labels seen by each per-target estimator, in target_cols order.
print(knn_c.classes_)

# Build an (n_samples, n_targets) matrix holding P(label == True) per target.
# The positive column is looked up through each estimator's `classes_`
# instead of being hard-coded as column 1: if a rare target had no positive
# example in the training sample its probability array has a single column
# and `[:, 1]` would raise IndexError. In that case the positive probability
# is 0 for every row.
res_data = np.array([
    proba[:, list(labels).index(True)] if True in labels else np.zeros(len(proba))
    for proba, labels in zip(result, knn_c.classes_)
]).T
res_data

[array([False,  True]), array([False,  True]), array([False,  True]), array([False,  True])]


array([[0.4, 0. , 0.8, 0. ],
       [0.2, 0. , 0.8, 0. ],
       [0.2, 0. , 0.8, 0. ],
       ...,
       [0.4, 0. , 0.8, 0. ],
       [0.2, 0. , 0.6, 0. ],
       [0. , 0. , 0.6, 0. ]])

In [12]:
# Labels for the probability columns of res_data; they follow the same
# retweet / reply / like / retweet_with_comment order as target_cols.
pred_cols = [
    'pred_retweet',
    'pred_reply',
    'pred_like',
    'pred_retweet_with_comment',
]

# Put ground truth and predictions side by side. Both frames carry
# df_test's index, so the column-wise concat lines the rows up one-to-one.
res_df = pd.concat(
    [
        df_test.loc[:, ["tweet_id", 'has_retweet', 'has_reply', 'has_like', 'has_retweet_with_comment']],
        pd.DataFrame.from_records(data=res_data, index=df_test.index, columns=pred_cols),
    ],
    axis="columns",
)
res_df.head()

Unnamed: 0_level_0,tweet_id,has_retweet,has_reply,has_like,has_retweet_with_comment,pred_retweet,pred_reply,pred_like,pred_retweet_with_comment
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AD12506ABB019ECCB7A90A223AD0A10C,5D8E95415F590141F331274BB27E217D,True,False,True,False,0.4,0.0,0.8,0.0
D2ED851A1D31ECF9EB15917FA6F893D1,7D687F605F2AC1577FB90677E5AE9CD2,False,False,False,False,0.2,0.0,0.8,0.0
D195D2BC1AA730A9DE5F3CD31433A621,DEAB8A8F70346DA64CEA71585876D47C,False,False,False,False,0.2,0.0,0.8,0.0
C3306463711A4167989E9771697D86F9,4F92B332702CC635778651E07A3BF0CD,False,False,True,False,0.0,0.0,0.6,0.0
A18B2F44324EE2B5A0638FFAB438B9AF,0C6EFA042B5EAA543641F2F954713C3A,False,False,False,False,0.2,0.0,0.4,0.0


In [13]:
# Persist the truth/prediction table next to the input data.
res_ds_name = 'knn_preds'
# The trailing slash in the name is collapsed by pathlib, so the resulting
# path ends in "knn_preds.parquet".
res_file = data_dir.joinpath(f"{res_ds_name}.parquet/")
res_df.to_parquet(os.fspath(res_file))