In [40]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy.stats import kurtosis
from sklearn.model_selection import train_test_split
import seaborn as sns

In [53]:
def preprocess(filename):
    data = np.load(filename)
    X = data['X']
    X = pd.DataFrame(X)
    X.rename(columns={0:'user', 1:'item', 2:'rating'}, inplace=True)
    X.sort_values(by=['user'], inplace=True)
    
    list_of_user_feature_data = []
    for user_id in sorted(X.user.unique()):
        user_feature_dict = dict()
        user_feature_dict['user'] = user_id
        
        rating_interactions_df = X[X['user'] == user_id]
        rating_interactions_df.loc[:, 'item'] = rating_interactions_df['item'].astype(str)
        rating_interactions_dict = dict(zip(rating_interactions_df['item'], rating_interactions_df['rating']))
        user_feature_dict.update(rating_interactions_dict)
        
        # manually engineer features
        # get the ratings from user with user id == user_id
        user_ratings = X[X['user'] == user_id]['rating']
        user_feature_dict['sum_of_ratings'] = sum(user_ratings)
        user_feature_dict['avg_of_ratings'] = np.mean(user_ratings)
        user_feature_dict['total_num_ratings'] = len(user_ratings)
        user_feature_dict['variance_of_ratings'] = np.var(user_ratings)
        user_feature_dict['std_dev_of_ratings'] = np.std(user_ratings)
        
        user_feature_dict['count_positives'] = sum(rating > 0 for rating in user_ratings)
        user_feature_dict['count_negatives'] = sum(rating < 0 for rating in user_ratings)
        user_feature_dict['count_neutrals'] = sum(rating == 0 for rating in user_ratings)
        
        user_feature_dict['proportion_of_positives'] = user_feature_dict['count_positives'] / user_feature_dict['total_num_ratings']
        user_feature_dict['proportion_of_negatives'] = user_feature_dict['count_negatives'] / user_feature_dict['total_num_ratings']
        user_feature_dict['proportion_of_neutrals'] = user_feature_dict['count_neutrals'] / user_feature_dict['total_num_ratings']
        user_feature_dict['outlier_count_kurtosis'] = kurtosis(user_ratings)
        list_of_user_feature_data.append(user_feature_dict)
        
    ret_df = pd.DataFrame(list_of_user_feature_data)
    
    # features to create
    X_item_attrs = X.groupby(['item']).agg({'rating': ['count', 'mean', 'median']})
    
    # flatten columns and rename
    X_item_attrs.columns = [f"{agg_type}_{agg_func}" for agg_func, agg_type in X_item_attrs.columns]
    X_item_attrs = X_item_attrs.rename(columns={'count_rating': 'rating_count', 'mean_rating': 'average_rating'}).reset_index()
    
    X['item_rating_count'] = X.apply(lambda row: X_item_attrs[X_item_attrs['item'] == row['item']]['rating_count'].item(), axis=1)
    X['item_average_rating'] = X.apply(lambda row: X_item_attrs[X_item_attrs['item'] == row['item']]['average_rating'].item(), axis=1)
    X['item_median_rating'] = X.apply(lambda row: X_item_attrs[X_item_attrs['item'] == row['item']]['median_rating'].item(), axis=1)

    X['rating_deviation_from_item_average_rating'] = abs(X['rating'] - X['item_average_rating'])
    X['rating_deviation_from_item_median_rating'] = abs(X['rating'] - X['item_median_rating'])
    X['rating_above_item_median_rating'] = (X['rating'] - X['item_median_rating']).clip(0, None)

    average_item = X.groupby(['user']).agg({'item_rating_count':'mean'})
    average_item = average_item.rename(columns={'item_rating_count':'avg_item_rating_count'})

    average_item['avg_item_avg_rating'] = X.groupby(['user']).agg({'item_average_rating':'mean'})

    average_item['avg_item_median_rating'] = X.groupby(['user']).agg({'item_median_rating':'mean'})

    average_item['avg_dev_from_item_median_rating'] = X.groupby(['user']).agg({'rating_deviation_from_item_median_rating':'mean'})
    average_item['avg_rating_above_item_median_rating'] = X.groupby(['user']).agg({'rating_above_item_median_rating':'mean'})

    average_item = average_item.reset_index()

    ret_df = ret_df.merge(average_item, on=['user'], how='left')
        
    ret_df = ret_df.drop(columns=['avg_item_rating_count'])
    
    ret_df = ret_df.sort_values('user')
    print('User order:', ret_df['user'])
    ret_df = ret_df.drop(columns=['user'])
    
    return ret_df

In [54]:
def get_training_data(train_filename):
    X = preprocess(train_filename)
    data=np.load(train_filename)
    yy=data["yy"]
    yy=pd.DataFrame(yy)
    yy.rename(columns={0:"user",1:"label"},inplace=True)
    yy.sort_values(by='user', inplace=True)
    X_train, X_test, y_train, y_test = train_test_split(X, yy['label'], test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test


In [55]:
train_filename = 'first_batch_multi_labels.npz'
X_train, X_test, y_train, y_test = get_training_data(train_filename)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_interactions_df.loc[:, 'item'] = rating_interactions_df['item'].astype(str)


User order: 0          0
1          1
2          2
3          3
4          4
        ... 
1095    1095
1096    1096
1097    1097
1098    1098
1099    1099
Name: user, Length: 1100, dtype: int64


In [56]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [57]:
X_train_copy = X_train.copy(deep=True)
X_test_copy = X_test.copy(deep=True) 

In [58]:
X_train_copy_cols = X_train_copy.columns.tolist()
X_test_copy_cols = X_test_copy.columns.tolist()

In [59]:
X_train_cols = X_train.columns.tolist()

X_test = X_test[X_train_cols]

In [60]:
print(X_train_scaled.shape, X_test_scaled.shape)
print(X_train_scaled.dtype, X_test_scaled.dtype)

(880, 1032) (220, 1032)
float64 float64


In [61]:
scaler = StandardScaler()

X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

dtrain = xgb.DMatrix(X_train_scaled, label=y_train)

# Set parameters
parameters = {
    'max_depth': 2,
    'eta': 0.3,
    'objective': 'multi:softprob',
    'eval_metric': 'auc',
    'num_class': 3,
}

# Train 
num_round = 1000
bst = xgb.train(parameters, dtrain, num_round, verbose_eval=False)

dpred = xgb.DMatrix(X_test_scaled)

y_pred_probs = bst.predict(dpred)

In [62]:
y_pred_classes = np.argmax(y_pred_probs, axis=1)

In [63]:
accuracy = accuracy_score(y_test, y_pred_classes)
accuracy

0.95

In [64]:
auc = roc_auc_score(y_test, y_pred_probs, multi_class='ovr')
auc

0.982252861350977