In [3]:
import numpy as np
import pandas as pd
import pickle as pkl
from itertools import chain
from collections import Counter, defaultdict

In [4]:
# Load data
train = pd.read_csv('../data/train.csv', index_col='ex_id')
val = pd.read_csv('../data/dev.csv', index_col='ex_id')

# Load tokenized data
train_data_tokens = pkl.load(open("../data/tokens/train_data_tokens.pkl", "rb"))
val_data_tokens = pkl.load(open("../data/tokens/val_data_tokens.pkl", "rb"))
all_train_tokens = list(chain.from_iterable(train_data_tokens))

# Get labels
y_train = train.label.values
y_val = val.label.values

#### User Features

In [9]:
from scipy.sparse import csr_matrix, coo_matrix, hstack

def combine_features(user_features, review_features):
    return hstack( [user_features, review_features], format='csr' )

In [10]:
def get_user_features(dataset):
    users = dataset['user_id']
    users_fake_cnts = Counter(train[train['label'] == 1]['user_id']) # 有过fake的users & 几次
    users_feature = [users_fake_cnts[user] if user in users_fake_cnts else 0 for user in users]
    return np.array(users_feature).reshape([len(users_feature), 1])

In [11]:
train_users_features = get_user_features(train)
val_users_features = get_user_features(val)

## Explore

In [13]:
users = val['user_id']
users_fake_cnts = Counter(train[train['label'] == 1]['user_id'])

In [14]:
users_feature = [users_fake_cnts[user] if user in users_fake_cnts else 0 for user in users]

In [16]:
np.array(users_feature).reshape([len(users_feature), 1])

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [7]:
from my_utils import *

In [8]:
u = UserFeature()
u.fit(train)

<my_utils.UserFeature at 0x1a604b9690>

In [9]:
u.get_user_features(train)

[(26.0, 0),
 (1.0, 0),
 (2.0, 0),
 (1.0, 0),
 (3.0, 0),
 (1.0, 0),
 (3.0, 0),
 (22.0, 0),
 (1.0, 0),
 (1.0, 0),
 (1.0, 0),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (9.0, 0),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291250119013615, 1),
 (1.2291