#### Importing the Relevant Libraries

In [1]:


import warnings
warnings.filterwarnings('ignore')

import os

import random

import numpy as np

import pandas as pd

# Fix every source of randomness we control so that results are repeatable.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects subprocesses - confirm.
os.environ['PYTHONHASHSEED']=str(1)

# Seed Python's built-in `random` module.
random.seed(1)

# Seed NumPy's legacy global random state.
np.random.seed(1)


from tensorflow import keras


from scipy import stats

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from lightgbm import LGBMRegressor
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor,  StackingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn import metrics

import category_encoders as ce

from sklearn.model_selection import GridSearchCV

# Input Preprocessing & Feature Engineering

#### Preprocess the input

In [2]:
def preprocess_data(data):
    """Clean the raw profile dataframe and derive the modelling features.

    The function reads the module-level collections ``frequent_theme_colors``,
    ``frequent_page_colors``, ``frequent_text_colors``, ``common_languages``
    and ``time_zones``; they must be defined before it is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw profile rows. The first few steps assign columns directly on this
        frame, so pass a copy if the original must stay untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of the processed frame, containing the cleaned original
        columns plus the derived indicator / ``*_TME`` / log-scaled columns.
    """
    # Whether there is a personal URL or not (1 = present, 0 = missing).
    data["Personal URL"] = 1 - data["Personal URL"].isna()

    # Months elapsed between each profile's creation and the most recent
    # creation date found in *this* frame. Series.max() skips NaT, unlike the
    # builtin max().
    created = pd.to_datetime(data["Profile Creation Timestamp"])
    data["Profile Creation Timestamp"] = created
    latest = created.max().tz_localize(None)
    data["Months Since Creation"] = (
        (latest - created.dt.tz_localize(None)).dt.days / 30
    ).round()

    # Verification status as a 0/1 flag.
    data["Profile Verification Status"] = np.where(
        data["Profile Verification Status"] == "Verified", 1, 0)

    # Assign the filled column back: an in-place fillna on a column selection
    # is a chained assignment and is not guaranteed to reach the frame.
    data["Profile Cover Image Status"] = data["Profile Cover Image Status"].fillna('NA')

    # Keep only the frequent colours (prefixed per column), bucket the rest as
    # "others", then append one-hot columns for each colour column.
    # NOTE(review): the "others" label is shared by the three columns, so the
    # successive merges presumably produce suffixed duplicates such as
    # "others_x"/"others_y" - confirm if those columns are ever used.
    for column, prefix, frequent in (
        ("Profile Theme Color", "thc-", frequent_theme_colors),
        ("Profile Page Color", "pc-", frequent_page_colors),
        ("Profile Text Color", "txc-", frequent_text_colors),
    ):
        data[column] = data[column].apply(
            lambda x: prefix + x if x in frequent else "others")
        data = data.merge(pd.get_dummies(data[column]), left_index=True, right_index=True)

    # Bool to int because we want it to remain a one-hot style indicator.
    data["Is Profile View Size Customized?"] = data["Is Profile View Size Customized?"].astype(int)

    # Remove capitalisation.
    data["Location Public Visibility"] = data["Location Public Visibility"].str.lower()

    # Locales are frequent elements from a bag of words extracted from Location.
    data["Location"] = data["Location"].fillna("unknown")
    locale_list = ['unknown',  'Canada', 'Chile', 'Jakarta', 'Arabia',  'Colombia', 'Venezuela',
       'Argentina', 'Australia','España',
       'England', 'France', 'Paris', 'London',
    'Worldwide',
       'Indonesia', 'México', 'United', 'California', 'Washington','Brasil', 'India', 'New York', 'Los Angeles']
    location_lower = data["Location"].str.lower()
    for locale in locale_list:
        # Case-insensitive substring match, stored as a 0/1 indicator column.
        data[locale] = location_lower.str.contains(locale.lower(), regex=False).astype(int)

    # Prepare location for target mean encoding.
    locale_keys = [name.lower() for name in locale_list]

    def locale_cat(location):
        """Return the last locale of ``locale_list`` found in ``location``, else 'other'."""
        replacement = 'other'
        for key in locale_keys:
            # No break on purpose: a later match overrides an earlier one.
            if key in location:
                replacement = key
        return replacement

    data["Locale_TME"] = location_lower.apply(locale_cat)

    # Prepare language for target mean encoding.
    for language in common_languages:
        data[language] = (data["User Language"] == language).astype(int)

    data["Language_TME"] = data["User Language"].apply(
        lambda x: x if x in common_languages else "others")

    # Prepare time zones for target mean encoding.
    for zone in time_zones:
        data["tz - " + zone] = (data["User Time Zone"] == zone).astype(int)

    data["TZ_TME"] = data["User Time Zone"].apply(
        lambda x: x if x in time_zones else "others")

    # UTC Offset is treated as categorical: missing values get the sentinel
    # 0.5 and the "aaa" suffix turns every value into a non-numeric string.
    data["UTC Offset"] = data["UTC Offset"].fillna(0.5)
    data["UTC Offset"] = data["UTC Offset"].astype(str) + "aaa"

    # We noticed " " was different from "unknown" in terms of average/median.
    data.loc[data["Profile Category"] == " ", "Profile Category"] = "empty"

    # For average visit duration and clicks, fill NaN with the median of the
    # row's profile category.
    # NOTE(review): rows whose Profile Category is itself missing belong to no
    # group - confirm the column has no NaN.
    for column in ("Avg Daily Profile Visit Duration in seconds", "Avg Daily Profile Clicks"):
        data[column] = data.groupby(["Profile Category"])[column].transform(
            lambda x: x.fillna(x.median()))

    # Estimated clicks = avg daily clicks times days; we use months instead,
    # which is effectively the same after scaling.
    data["Estimated Clicks"] = data["Avg Daily Profile Clicks"] * data["Months Since Creation"]

    # Log scales, log(x + 1), for the count-like columns.
    for column in ("Num of Direct Messages", "Num of Status Updates",
                   "Num of Followers", "Num of People Following",
                   "Avg Daily Profile Clicks", "Estimated Clicks"):
        data[column] = np.log(data[column] + 1)

    return data.copy()

#### Use the preprocess_data() function on the relevant columns of the dataframe and select the relevant features.

In [3]:
target = "Num of Profile Likes"

# Load the train and test sets, using the first CSV column as the index.
df = pd.read_csv("train.csv", index_col=0)
test = pd.read_csv("test.csv", index_col=0)

# Remember the original test order for the final submission, in case the
# order of the indices matters.
final_index = test.index

# Work on index-sorted frames because it is less confusing that way.
df = df.sort_index()
test = test.sort_index()

# Most frequent colours / languages / time zones, taken from the training set.
frequent_theme_colors = set(df["Profile Theme Color"].value_counts().head(10).index)
frequent_page_colors = set(df["Profile Page Color"].value_counts().head(10).index)
frequent_text_colors = set(df["Profile Text Color"].value_counts().head(10).index)
common_languages = df["User Language"].value_counts().head(20).index
time_zones = df["User Time Zone"].value_counts().head(25).index

# Preprocessing: every column except the target is a candidate feature.
feature_columns = df.columns.difference([target])
y = df.loc[:, target]
X = preprocess_data(df.loc[:, feature_columns].copy())
X_test = preprocess_data(test.copy())

# Subset of handpicked features we care about.
subset1 = [
    'Personal URL',
    'Profile Cover Image Status',
    'Profile Verification Status',
    'Is Profile View Size Customized?',
    'UTC Offset',
    'Location Public Visibility',
    'Num of Followers',
    'Num of People Following',
    'Num of Status Updates',
    'Num of Direct Messages',
    'Avg Daily Profile Clicks',
    'Months Since Creation',
    'Estimated Clicks',
    'Profile Category',
    'Profile Theme Color',
    'Locale_TME',
    "Language_TME",
    "TZ_TME",
]

X1 = X.loc[:, subset1]
X1_test = X_test.loc[:, subset1]

#### Function to preprocess the train and the validation set.  

In [4]:
#lim indicates outlier removal through winsorization. Ended up not used in final submission
def process_train_test_winz(X_train, y_train, X_test, lim = 0.0):
    """Impute, target-encode, winsorize and scale a train/test pair.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames with identical columns. They are not modified: the
        imputation step creates new frames that the later steps work on.
    y_train : pandas.Series
        Raw (non-log) target aligned with ``X_train``; ``log(y + 1)`` is used
        as the target for the categorical encoders.
    lim : float, default 0.0
        Passed as ``limits`` to ``scipy.stats.mstats.winsorize``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as produced by ``RobustScaler.fit_transform`` /
        ``RobustScaler.transform`` (fitted on the training data only).
    """
    # numeric_only=True: the frames still contain string columns at this
    # point, and recent pandas raises instead of silently skipping them.
    medians = X_train.median(axis=0, numeric_only=True)
    X_train = X_train.fillna(medians)
    X_test = X_test.fillna(medians)

    cat_cols = X_train.select_dtypes(exclude=np.number).columns
    log_target = np.log(y_train.values+1)

    for cat in cat_cols:
        # One encoder per column, fitted on the training data only.
        encoder = ce.TargetEncoder()
        X_train[cat] = encoder.fit_transform(X_train[cat], log_target)
        X_test[cat] = encoder.transform(X_test[cat])#In case of missing category (from X_train), the target mean.

    # NOTE(review): the test columns are winsorized on their own, so for
    # lim > 0 their cut-offs come from the test data, not the training data.
    for col in ['Num of Followers',
          'Num of People Following', 'Num of Status Updates',
          'Num of Direct Messages', 'Avg Daily Profile Clicks']:
        X_train[col] = stats.mstats.winsorize(X_train[col], limits=lim)
        X_test[col] = stats.mstats.winsorize(X_test[col], limits=lim)

    scaler = RobustScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test

In [5]:
# Re-seed the random number generators so that re-running this cell alone
# gives the same model.
os.environ['PYTHONHASHSEED']=str(1)

random.seed(1)

np.random.seed(1)

x, x_test = process_train_test_winz(X1, y, X1_test)

# Base learners of the stack; all of them are trained on log(y + 1).
estimators = [('lgb', LGBMRegressor(
                          learning_rate=0.02,
                          max_bin=135,
                          min_data_in_leaf=25,
                          num_iterations=500,
                          num_leaves=11,
                          reg_alpha=0.2,
                          min_child_samples=None)),
              ('svr', SVR(C=1.9, epsilon=0.8, kernel='rbf')),
              ('rf', RandomForestRegressor(
                         bootstrap=True,
                         ccp_alpha=0.0,
                         max_depth=12,
                         # 1.0 (= all features) is what 'auto' meant for a
                         # regressor; 'auto' itself is rejected by recent
                         # scikit-learn releases.
                         max_features=1.0,
                         max_leaf_nodes=None,
                         min_impurity_decrease=0.0,
                         min_samples_leaf=2,
                         min_samples_split=3,
                         min_weight_fraction_leaf=0.0,
                         n_estimators=100)),
              ('xgb', XGBRegressor(
                          objective='reg:squarederror',
                          learning_rate=0.05,
                          gamma=0.1,
                          max_depth=5,
                          min_child_weight=1,
                          subsample=1,
                          n_estimators=180,
                          colsample_bytree=0.33,
                          alpha=2.5))]

m = StackingRegressor(estimators=estimators, cv=50)
m.fit(x, np.log(y+1))

# Undo the log(y + 1) transform and clip negatives, since likes cannot be < 0.
pred_train = (np.exp(m.predict(x))-1).clip(min=0)
pred_test = (np.exp(m.predict(x_test))-1).clip(min=0)

# This is the error on the training data itself, not a validation score.
print("Training RMSLE :", np.sqrt(metrics.mean_squared_log_error(y.values, pred_train)))

Training RMSLE : 1.4131976664135983


#### Some final checks to make sure the predictions are in the right format and close to what we should expect.

In [6]:
# Sanity checks: same number of predictions as test rows, and a prediction
# median in the same range as the training median.
print("Check diff of length in test inputs and preds:", len(X_test) - len(pred_test))
print("Median number of likes in training:  ", np.median(y))
print("Median number of likes in prediction:", np.median(pred_test))

# Attach the predictions to the (index-sorted) test frame, then restore the
# original row order saved in final_index.
result = test.assign(Predicted=pred_test).loc[final_index]
print("Check min predicted value:", result[["Predicted"]].min())

Check diff of length in test inputs and preds: 0
Median number of likes in training:   1370.0
Median number of likes in prediction: 1362.7082050432873
Check min predicted value: Predicted    0.0
dtype: float64


#### Writing the predictions dataframe to a file for submission on Kaggle.

In [7]:
# Export only the prediction column (plus the frame's index) to pred.csv.
result.loc[:, ["Predicted"]].to_csv("pred.csv")