## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import itertools
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


from recsys_utils import recsys_load_training_df, recsys_evaluate, recsys_cv_split_single

%matplotlib inline
warnings.filterwarnings('ignore')

## Define Helper Functions

In [2]:
def replace_timestamp_with_probablity(data):
    """Turn engagement timestamps into 0.0 / 1.0 indicators.

    Args:
        data: array-like of datetime64/timedelta64 values (the only dtypes
            ``np.isnat`` accepts).

    Returns:
        numpy.ndarray of floats: 0.0 where the entry is NaT (no timestamp),
        1.0 where a timestamp is present.
    """
    timestamp_missing = np.isnat(data)
    return np.where(timestamp_missing, 0.0, 1.0)

## Load data

In [3]:
# Location of the sampled training data, relative to the notebook's working directory.
TRAINING_SAMPLE_PATH = './RecSys20/Data/training_sample.tsv'

# Raw interaction frame; every evaluation run below starts from this unmodified `df`.
df = recsys_load_training_df(TRAINING_SAMPLE_PATH)

## Create new features

# Exploratory preview of the engineered features.
#
# The result is stored in `df_features` and `df` is deliberately left untouched:
# feature_preparation() (used by every evaluation run below) derives the same
# features itself and needs the raw `present_media`, `tweet_type`, `hashtags`,
# `present_links` and `present_domains` columns to still be present in `df`.

# Create dummy variables for present media feature
# (assumes `present_media` holds list-like media labels per row -- TODO confirm)
media_columns = ['GIF', 'Photo', 'Video']
media_dummies = (
    pd.get_dummies(df['present_media'].apply(pd.Series).stack())
    .groupby(level=0)  # replaces .sum(level=0), which was removed in pandas 2.0
    .sum()
)
# Make sure all three media columns exist even if a type never occurs in the data.
media_dummies = media_dummies.reindex(
    columns=sorted(set(media_dummies.columns) | set(media_columns)),
    fill_value=0,
)
df_features = df.join(media_dummies)
# Rows without any media get NaN from the join -> treat them as "not present".
df_features[media_columns] = df_features[media_columns].fillna(0).astype(int)
df_features = df_features.rename(columns={
    'GIF': 'present_media_gif',
    'Photo': 'present_media_photo',
    'Video': 'present_media_video',
})

# Create dummy variables for tweet type
df_features = pd.concat(
    [df_features, pd.get_dummies(df_features['tweet_type'], prefix='tweet_type')],
    axis=1,
)

# Replace hashtags, present_links and present_domains with counts
count_columns = ['hashtags', 'present_links', 'present_domains']
for count_column in count_columns:
    df_features[count_column] = df_features[count_column].str.len()
# Missing entries yield NaN lengths -> count them as 0.
df_features[count_columns] = df_features[count_columns].fillna(0).astype(int)

# The raw columns are now represented by the derived features above.
df_features = df_features.drop(columns=[
    'tweet_type',
    'present_media'
])

## Model training

In [4]:
### Logistic Regression

In [5]:
def feature_preparation(df):
    """Derive the model features and binary engagement targets from a raw frame.

    The input frame is not modified; a new frame is returned.

    Steps:
      * one indicator column per media type (``present_media_gif`` /
        ``present_media_photo`` / ``present_media_video``),
      * one-hot encoding of ``tweet_type`` (columns prefixed ``tweet_type_``),
      * ``hashtags``, ``present_links`` and ``present_domains`` replaced by
        their lengths (0 when missing),
      * the four ``*_timestamp`` engagement columns replaced by 0.0/1.0
        ``*_prob`` columns via ``replace_timestamp_with_probablity``,
      * identifier, text, language and time columns dropped.

    Args:
        df: raw interaction frame containing the columns referenced below.
            ``present_media`` is assumed to hold list-like media labels per
            row -- TODO confirm against recsys_load_training_df.

    Returns:
        pandas.DataFrame with the engineered features and the four
        ``*_prob`` target columns.
    """
    media_columns = ['GIF', 'Photo', 'Video']
    count_columns = ['hashtags', 'present_links', 'present_domains']
    timestamp_to_target = {
        'reply_timestamp': 'reply_prob',
        'retweet_timestamp': 'retweet_prob',
        'retweet_with_comment_timestamp': 'retweet_with_comment_prob',
        'like_timestamp': 'like_prob',
    }

    # Create dummy variables for present media feature.
    # groupby(level=0).sum() replaces .sum(level=0), which was removed in pandas 2.0.
    media_dummies = (
        pd.get_dummies(df['present_media'].apply(pd.Series).stack())
        .groupby(level=0)
        .sum()
    )
    # A fold may contain no GIF/Photo/Video at all; add the missing columns as
    # zeros (in a deterministic order) instead of failing with a KeyError below.
    media_dummies = media_dummies.reindex(
        columns=sorted(set(media_dummies.columns) | set(media_columns)),
        fill_value=0,
    )
    df = df.join(media_dummies)
    # Rows without any media get NaN from the join -> treat them as "not present".
    df[media_columns] = df[media_columns].fillna(0).astype(int)
    df = df.rename(columns={
        'GIF': 'present_media_gif',
        'Photo': 'present_media_photo',
        'Video': 'present_media_video',
    })

    # Create dummy variables for tweet type
    df = pd.concat([df, pd.get_dummies(df['tweet_type'], prefix='tweet_type')], axis=1)

    # Replace hashtags, present_links and present_domains with counts
    for count_column in count_columns:
        df[count_column] = df[count_column].str.len()
    df[count_columns] = df[count_columns].fillna(0).astype(int)

    # Replace timestamps with classes (1.0 = engagement happened, 0.0 = NaT)
    for timestamp_column, target_column in timestamp_to_target.items():
        df[target_column] = replace_timestamp_with_probablity(df[timestamp_column])

    df = df.drop(columns=list(timestamp_to_target))

    # Drop the raw columns that were encoded above plus everything that is not
    # used as a model input.
    df = df.drop(columns=[
        'tweet_type',
        'present_media',
        'text_tokens',
        'tweet_id',
        'language',
        'tweet_timestamp',
        'engaged_with_user_id',
        'engaged_with_user_account_creation',
        'engaging_user_id',
        'engaging_user_account_creation'
    ])

    return df

In [6]:
def recommender_train_predict(df_train, df_test):
    """Fit one unscaled logistic regression per engagement type and predict the test fold.

    Both frames are passed through feature_preparation(); a separate
    LogisticRegression (default hyper-parameters) is then fitted for each of
    the four targets and applied to the test features.

    Args:
        df_train: raw training frame (as accepted by feature_preparation).
        df_test: raw test frame (as accepted by feature_preparation).

    Yields:
        One tuple per test row, in row order:
        (reply, retweet, retweet_with_comment, like) predictions as returned by
        ``LogisticRegression.predict``.
    """
    target_columns = ['reply_prob', 'retweet_prob', 'retweet_with_comment_prob', 'like_prob']

    df_train_prepared = feature_preparation(df_train)
    df_test_prepared = feature_preparation(df_test)

    y_train = df_train_prepared[target_columns]
    X_train = df_train_prepared.drop(columns=target_columns)
    X_test = df_test_prepared.drop(columns=target_columns)

    # Train and test are one-hot encoded independently, so a category missing
    # from one fold would give the two matrices different columns. Align the
    # test matrix to the training layout (absent dummies become 0).
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    predictions = []
    for target_column in target_columns:
        model = LogisticRegression()
        model.fit(X_train, y_train[target_column])
        predictions.append(model.predict(X_test))

    # NOTE(review): predict() returns hard class labels. If recsys_evaluate's
    # PRAUC expects scores, predict_proba(X_test)[:, 1] may be the better
    # output -- confirm against recsys_utils before changing.
    yield from zip(*predictions)

In [7]:
# Evaluate the unscaled logistic-regression baseline. The printed log shows the
# single_random, tweetid, userid and time cv-splits being run for 'all'.
df_results_plain = recsys_evaluate(df, recommender_train_predict, 'all')

Run RecSys recommender evaluation:
> cv-split (single_random)
.
> cv-split (tweetid)
..........
> cv-split (userid)
..........
> cv-split (time)
..........


### Logistic Regression (with scaling)

In [8]:
from sklearn.preprocessing import StandardScaler 

def logreg_preparation(df_train, df_test):
    """Build the feature matrices and training targets for the logistic-regression runs.

    Args:
        df_train: raw training frame (as accepted by feature_preparation).
        df_test: raw test frame (as accepted by feature_preparation).

    Returns:
        Tuple ``(X_train, X_test, y_train)``:
        ``X_train`` / ``X_test`` are the prepared frames without the four
        ``*_prob`` target columns, with ``X_test`` aligned to the column layout
        of ``X_train``; ``y_train`` holds the four target columns of the
        training frame.
    """
    target_columns = ['reply_prob', 'retweet_prob', 'retweet_with_comment_prob', 'like_prob']

    df_train_prepared = feature_preparation(df_train)
    df_test_prepared = feature_preparation(df_test)

    y_train = df_train_prepared[target_columns]
    X_train = df_train_prepared.drop(columns=target_columns)
    X_test = df_test_prepared.drop(columns=target_columns)

    # Train and test are one-hot encoded independently, so a category missing
    # from one fold would give the two matrices different columns. Align the
    # test matrix to the training layout (absent dummies become 0) so that the
    # scaler and the models always see the same features in the same order.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    return X_train, X_test, y_train

def logreg_model_building(X_train, y_train, C=1.0):
    """Fit one logistic regression per engagement type.

    Args:
        X_train: training feature matrix.
        y_train: frame with the columns ``reply_prob``, ``retweet_prob``,
            ``retweet_with_comment_prob`` and ``like_prob``.
        C: inverse regularisation strength passed to every LogisticRegression.
            Defaults to 1.0, scikit-learn's own default, so existing two-argument
            calls behave exactly as before.

    Returns:
        Tuple ``(model_reply, model_retweet, model_retweet_wc, model_like)`` of
        fitted LogisticRegression instances.
    """
    target_columns = ('reply_prob', 'retweet_prob', 'retweet_with_comment_prob', 'like_prob')
    # fit() returns the estimator itself, so the tuple holds the fitted models
    # in the same order as target_columns.
    return tuple(
        LogisticRegression(C=C).fit(X_train, y_train[target_column])
        for target_column in target_columns
    )
    
def logreg_train_predict_ws(df_train, df_test):
    """Logistic regression with standardised features (default regularisation).

    Prepares both folds with logreg_preparation(), standardises the features
    with a StandardScaler fitted on the training fold only, fits the four
    models via logreg_model_building() and predicts the test fold.

    Yields:
        One tuple per test row, in row order:
        (reply, retweet, retweet_with_comment, like) predictions.
    """
    features_train, features_test, targets_train = logreg_preparation(df_train, df_test)

    # Scaling statistics come from the training fold and are reused for the test fold.
    standardizer = StandardScaler()
    features_train = standardizer.fit_transform(features_train)
    features_test = standardizer.transform(features_test)

    # build models for each interaction type
    fitted_models = logreg_model_building(features_train, targets_train)

    # make predictions (same order as the models: reply, retweet, retweet_wc, like)
    # NOTE(review): predict() returns hard class labels, not scores -- confirm
    # this is what recsys_evaluate expects.
    per_target_predictions = [model.predict(features_test) for model in fitted_models]

    yield from zip(*per_target_predictions)

In [9]:
# Evaluate logistic regression on standardised features (LogisticRegression defaults).
df_results_scaled = recsys_evaluate(df, logreg_train_predict_ws, 'all')

Run RecSys recommender evaluation:
> cv-split (single_random)
.
> cv-split (tweetid)
..........
> cv-split (userid)
..........
> cv-split (time)
..........


### Logistic Regression (C = 0.5)

In [10]:
def logreg_model_building_c05(X_train, y_train):
    """Fit one LogisticRegression(C=0.5) per engagement type.

    Returns:
        Tuple ``(model_reply, model_retweet, model_retweet_wc, model_like)`` of
        fitted models.
    """
    target_columns = ('reply_prob', 'retweet_prob', 'retweet_with_comment_prob', 'like_prob')

    fitted_models = []
    for target_column in target_columns:
        estimator = LogisticRegression(C=0.5)
        estimator.fit(X_train, y_train[target_column])
        fitted_models.append(estimator)

    return tuple(fitted_models)

def logreg_train_predict_ws_c05(df_train, df_test):
    """Logistic regression with standardised features and C=0.5.

    Prepares both folds with logreg_preparation(), standardises the features
    with a StandardScaler fitted on the training fold only, fits the four
    models via logreg_model_building_c05() and predicts the test fold.

    Yields:
        One tuple per test row, in row order:
        (reply, retweet, retweet_with_comment, like) predictions.
    """
    features_train, features_test, targets_train = logreg_preparation(df_train, df_test)

    # Scaling statistics come from the training fold and are reused for the test fold.
    standardizer = StandardScaler()
    features_train = standardizer.fit_transform(features_train)
    features_test = standardizer.transform(features_test)

    # build models for each interaction type
    fitted_models = logreg_model_building_c05(features_train, targets_train)

    # make predictions (same order as the models: reply, retweet, retweet_wc, like)
    per_target_predictions = [model.predict(features_test) for model in fitted_models]

    yield from zip(*per_target_predictions)

In [11]:
# Evaluate logistic regression on standardised features with C=0.5.
df_results_scaled_c05 = recsys_evaluate(df, logreg_train_predict_ws_c05, 'all')

Run RecSys recommender evaluation:
> cv-split (single_random)
.
> cv-split (tweetid)
..........
> cv-split (userid)
..........
> cv-split (time)
..........


### Logistic Regression (C=10)

In [12]:
def logreg_model_building_c10(X_train, y_train):
    """Fit one LogisticRegression(C=10.0) per engagement type.

    Returns:
        Tuple ``(model_reply, model_retweet, model_retweet_wc, model_like)`` of
        fitted models.
    """
    target_columns = ('reply_prob', 'retweet_prob', 'retweet_with_comment_prob', 'like_prob')

    fitted_models = []
    for target_column in target_columns:
        estimator = LogisticRegression(C=10.0)
        estimator.fit(X_train, y_train[target_column])
        fitted_models.append(estimator)

    return tuple(fitted_models)

def logreg_train_predict_ws_c10(df_train, df_test):
    """Logistic regression with standardised features and C=10.0.

    Prepares both folds with logreg_preparation(), standardises the features
    with a StandardScaler fitted on the training fold only, fits the four
    models via logreg_model_building_c10() and predicts the test fold.

    Yields:
        One tuple per test row, in row order:
        (reply, retweet, retweet_with_comment, like) predictions.
    """
    features_train, features_test, targets_train = logreg_preparation(df_train, df_test)

    # Scaling statistics come from the training fold and are reused for the test fold.
    standardizer = StandardScaler()
    features_train = standardizer.fit_transform(features_train)
    features_test = standardizer.transform(features_test)

    # build models for each interaction type
    fitted_models = logreg_model_building_c10(features_train, targets_train)

    # make predictions (same order as the models: reply, retweet, retweet_wc, like)
    per_target_predictions = [model.predict(features_test) for model in fitted_models]

    yield from zip(*per_target_predictions)

In [13]:
# Evaluate logistic regression on standardised features with C=10.0.
df_results_scaled_c10 = recsys_evaluate(df, logreg_train_predict_ws_c10, 'all')

Run RecSys recommender evaluation:
> cv-split (single_random)
.
> cv-split (tweetid)
..........
> cv-split (userid)
..........
> cv-split (time)
..........


In [14]:
def logreg_model_building_c01(X_train, y_train):
    """Fit one LogisticRegression(C=0.1) per engagement type.

    Returns:
        Tuple ``(model_reply, model_retweet, model_retweet_wc, model_like)`` of
        fitted models.
    """
    target_columns = ('reply_prob', 'retweet_prob', 'retweet_with_comment_prob', 'like_prob')

    fitted_models = []
    for target_column in target_columns:
        estimator = LogisticRegression(C=0.1)
        estimator.fit(X_train, y_train[target_column])
        fitted_models.append(estimator)

    return tuple(fitted_models)

def logreg_train_predict_ws_c01(df_train, df_test):
    """Logistic regression with standardised features and C=0.1.

    Prepares both folds with logreg_preparation(), standardises the features
    with a StandardScaler fitted on the training fold only, fits the four
    models via logreg_model_building_c01() and predicts the test fold.

    Yields:
        One tuple per test row, in row order:
        (reply, retweet, retweet_with_comment, like) predictions.
    """
    features_train, features_test, targets_train = logreg_preparation(df_train, df_test)

    # Scaling statistics come from the training fold and are reused for the test fold.
    standardizer = StandardScaler()
    features_train = standardizer.fit_transform(features_train)
    features_test = standardizer.transform(features_test)

    # build models for each interaction type
    fitted_models = logreg_model_building_c01(features_train, targets_train)

    # make predictions (same order as the models: reply, retweet, retweet_wc, like)
    per_target_predictions = [model.predict(features_test) for model in fitted_models]

    yield from zip(*per_target_predictions)

In [15]:
# Evaluate logistic regression on standardised features with C=0.1.
df_results_scaled_c01 = recsys_evaluate(df, logreg_train_predict_ws_c01, 'all')

Run RecSys recommender evaluation:
> cv-split (single_random)
.
> cv-split (tweetid)
..........
> cv-split (userid)
..........
> cv-split (time)
..........


In [16]:
def logreg_model_building_c001(X_train, y_train):
    """Fit one LogisticRegression(C=0.01) per engagement type.

    Returns:
        Tuple ``(model_reply, model_retweet, model_retweet_wc, model_like)`` of
        fitted models.
    """
    target_columns = ('reply_prob', 'retweet_prob', 'retweet_with_comment_prob', 'like_prob')

    fitted_models = []
    for target_column in target_columns:
        estimator = LogisticRegression(C=0.01)
        estimator.fit(X_train, y_train[target_column])
        fitted_models.append(estimator)

    return tuple(fitted_models)

def logreg_train_predict_ws_c001(df_train, df_test):
    """Logistic regression with standardised features and C=0.01.

    Prepares both folds with logreg_preparation(), standardises the features
    with a StandardScaler fitted on the training fold only, fits the four
    models via logreg_model_building_c001() and predicts the test fold.

    Yields:
        One tuple per test row, in row order:
        (reply, retweet, retweet_with_comment, like) predictions.
    """
    features_train, features_test, targets_train = logreg_preparation(df_train, df_test)

    # Scaling statistics come from the training fold and are reused for the test fold.
    standardizer = StandardScaler()
    features_train = standardizer.fit_transform(features_train)
    features_test = standardizer.transform(features_test)

    # build models for each interaction type
    fitted_models = logreg_model_building_c001(features_train, targets_train)

    # make predictions (same order as the models: reply, retweet, retweet_wc, like)
    per_target_predictions = [model.predict(features_test) for model in fitted_models]

    yield from zip(*per_target_predictions)

In [17]:
# Evaluate logistic regression on standardised features with C=0.01.
df_results_scaled_c001 = recsys_evaluate(df, logreg_train_predict_ws_c001, 'all')

Run RecSys recommender evaluation:
> cv-split (single_random)
.
> cv-split (tweetid)
..........
> cv-split (userid)
..........
> cv-split (time)
..........


In [18]:
# Results without scaling, C=1.0
# (recommender_train_predict; one row per cv split type, one PRAUC column per engagement type)
df_results_plain

Unnamed: 0_level_0,PRAUC(like),PRAUC(reply),PRAUC(retweet),PRAUC(retweet_wc)
split_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
single_random,0.636793,0.513122,0.557537,0.503701
tweetid,0.652951,0.512665,0.558783,0.503701
userid,0.648163,0.512648,0.558577,0.503689
time,0.640275,0.512384,0.558608,0.50361


In [19]:
# Results with scaling, C=1.0
# (logreg_train_predict_ws; one row per cv split type, one PRAUC column per engagement type)
df_results_scaled

Unnamed: 0_level_0,PRAUC(like),PRAUC(reply),PRAUC(retweet),PRAUC(retweet_wc)
split_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
single_random,0.665161,0.513122,0.558184,0.503701
tweetid,0.667883,0.51267,0.184291,0.503702
userid,0.668312,0.51266,0.302348,0.453715
time,0.667762,0.512384,0.167484,0.45361


In [20]:
# Results with scaling, C=10.0
# (logreg_train_predict_ws_c10; one row per cv split type, one PRAUC column per engagement type)
df_results_scaled_c10

Unnamed: 0_level_0,PRAUC(like),PRAUC(reply),PRAUC(retweet),PRAUC(retweet_wc)
split_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
single_random,0.665161,0.513122,0.558184,0.503701
tweetid,0.667783,0.512669,0.171784,0.503702
userid,0.667058,0.512695,0.345075,0.453707
time,0.667786,0.512384,0.167484,0.40361


In [21]:
# Results with scaling, C=0.5
# (logreg_train_predict_ws_c05; one row per cv split type, one PRAUC column per engagement type)
df_results_scaled_c05

Unnamed: 0_level_0,PRAUC(like),PRAUC(reply),PRAUC(retweet),PRAUC(retweet_wc)
split_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
single_random,0.665161,0.513122,0.558184,0.503701
tweetid,0.668487,0.51267,0.150141,0.503701
userid,0.667754,0.512662,0.270939,0.453717
time,0.667822,0.512384,0.167484,0.45361


In [22]:
# Results with scaling, C=0.1
# (logreg_train_predict_ws_c01; one row per cv split type, one PRAUC column per engagement type)
df_results_scaled_c01

Unnamed: 0_level_0,PRAUC(like),PRAUC(reply),PRAUC(retweet),PRAUC(retweet_wc)
split_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
single_random,0.665048,0.513122,0.558184,0.503701
tweetid,0.668342,0.512669,0.16931,0.503701
userid,0.667059,0.512653,0.243049,0.453715
time,0.667651,0.512384,0.167484,0.45361


In [23]:
# Results with scaling, C=0.01
# (logreg_train_predict_ws_c001; one row per cv split type, one PRAUC column per engagement type)
df_results_scaled_c001

Unnamed: 0_level_0,PRAUC(like),PRAUC(reply),PRAUC(retweet),PRAUC(retweet_wc)
split_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
single_random,0.665854,0.513122,0.558184,0.503701
tweetid,0.668334,0.512669,0.1318,0.503702
userid,0.667017,0.512663,0.302508,0.453699
time,0.668234,0.512384,0.217484,0.45361
