# This notebook is created based on the ideas, discussion and effort of Balaji, Mimi and Arjun

In [None]:
# Import libraries
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import os
import random
import warnings
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor

In [None]:
# Set random seeds (for reproducibility requirement)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
# here presumably only affects child processes, not this running kernel — confirm.
os.environ['PYTHONHASHSEED']=str(1)
# Seed NumPy's global random state and Python's built-in `random` module.
np.random.seed(1)
random.seed(1)

# 1. Loading the Datasets

In [None]:
# Read the training and test tables from the competition input folder
INPUT_DIR = '../input/ift6758-a20'
train = pd.read_csv(f'{INPUT_DIR}/train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')

In [None]:
# Keep the test-set ids as a NumPy array for the submission file; the 'Id' column
# itself is removed from `test` later by drop_columns.
tid=test['Id']
test_id=tid.to_numpy()

In [None]:
# First 24 columns by position. This slice still contains 'Num of Profile Likes',
# which is dropped from the feature frame further down.
train_x=train.iloc[:,:24]
# The column at position 23 is used as the regression target.
# NOTE(review): presumably this is 'Num of Profile Likes' — confirm against train.csv.
train_y=train.iloc[:,23]

In [None]:
# Work on a copy so the in-place preprocessing below does not modify train_x.
train_loc=train_x.copy()
# Preview the first rows.
train_loc.head()

In [None]:
# List the column names available before preprocessing.
train_loc.columns

**Columns**

Id - Useful for submission, Removed during training

User Name - Removed

Personal URL - 0 for Nan,1 otherwise

User Name - The screen name of the user

Profile Cover Image Status - 0/1

Profile Verification Status - 0/1

Profile Text Color - Removed

Profile Page Color - Removed

Profile Theme Color - Removed

Is Profile View Size Customized? - 0/1

UTC Offset - Removed

Location - Removed

Location Public Visibility - 0/1

User Language - converted to few unique values

Profile Creation Timestamp - converted to months from date

User Time Zone - Converted to 7 values

Num of Followers - log transform

Num of People Following - log transform

Num of Status Updates - log transform

Num of Direct Messages - log transform

Profile Category - label encoding

Avg Daily Profile Visit Duration in seconds - log transform

Avg Daily Profile Clicks - log transform

Profile Image - Removed

Num of Profile Likes - Label

# 2. Data Preprocessing and Engineering

In [None]:
# Drop the 8 columns that are not used by the model
def drop_columns(df):
    """Remove the identifier, free-text, image and colour columns from ``df`` in place.

    Nothing is returned; the caller's frame is modified directly.
    """
    unused = [
        'Id', 'User Name', 'Location', 'UTC Offset', 'Profile Image',
        'Profile Text Color', 'Profile Page Color', 'Profile Theme Color',
    ]
    df.drop(columns=unused, inplace=True)

# 15 feature columns are left after these columns are removed

In [None]:
# Both frames are modified in place; drop_columns returns nothing.
drop_columns(train_loc)
drop_columns(test)

In [None]:
# Lookup table from a raw 'User Time Zone' value to a coarse region label.
# Each key appears exactly once (the original listed 'Central Time (US & Canada)'
# and 'Berlin' twice), and 'Mexico City' / 'Buenos Aires' are assigned to
# 'Latin America' instead of 'Europe'.
# NOTE(review): 'Sydney'/'Melbourne' stay under 'Asia', 'Greenland' under 'Europe',
# 'Moscow' under 'Asia' and 'Atlantic Time (Canada)' under 'USA', as before; the
# downstream encoding only knows the six regions used here plus 'Others'.
time_zone_dict = {
# USA (North America)
'Eastern Time (US & Canada)':'USA',
'Pacific Time (US & Canada)':'USA',
'Central Time (US & Canada)':'USA',
'Mountain Time (US & Canada)':'USA',
'Atlantic Time (Canada)':'USA',
'Hawaii':'USA',
'Alaska':'USA',
'Arizona':'USA',
# Europe
'London':'Europe',
'Paris':'Europe',
'Amsterdam':'Europe',
'Madrid':'Europe',
'Rome':'Europe',
'Greenland':'Europe',
'Athens':'Europe',
'Bern':'Europe',
'Berlin':'Europe',
'Dublin':'Europe',
'Brussels':'Europe',
'Ljubljana':'Europe',
# Latin America
'Brasilia':'Latin America',
'Quito':'Latin America',
'Mexico City':'Latin America',
'Santiago':'Latin America',
'Buenos Aires':'Latin America',
'Caracas':'Latin America',
'Bogota':'Latin America',
'Central America':'Latin America',
# Asia
'Jakarta':'Asia',
'New Delhi':'Asia',
'Tokyo':'Asia',
'Mumbai':'Asia',
'India':'Asia',
'Hong Kong':'Asia',
'Seoul':'Asia',
'Sydney':'Asia',
'Chennai':'Asia',
'Kuala Lumpur':'Asia',
'Moscow':'Asia',
'Singapore':'Asia',
'Melbourne':'Asia',
# Middle East
'Istanbul':'Middle East',
'Riyadh':'Middle East',
'Muscat':'Middle East',
'Baghdad':'Middle East',
'Abu Dhabi':'Middle East',
'Kuwait':'Middle East',
# Africa
'Pretoria':'Africa',
'Casablanca':'Africa',
'Cairo':'Africa'}

def location_fix(df, mapping=None):
    '''
    Collapse the 'User Time Zone' column of ``df`` into coarse regions, in place.

    Every value that is a key of ``mapping`` is replaced by the mapped region.
    Anything that is not one of the six known regions afterwards (including
    missing values) becomes 'Others'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'User Time Zone' column; modified in place.
    mapping : dict, optional
        Raw time zone -> region lookup. Defaults to the module-level
        ``time_zone_dict``.

    Returns
    -------
    None
    '''
    if mapping is None:
        mapping = time_zone_dict

    top_used_loc=['USA','Europe','Latin America','Asia','Middle East','Africa']

    # One dict-based replace instead of one pass over the column per key.
    regions = df['User Time Zone'].replace(mapping)
    # Assign the whole column back: the previous chained assignment
    # (df[col][mask] = ...) is not guaranteed to write through to df.
    df['User Time Zone'] = regions.where(regions.isin(top_used_loc), 'Others')

In [None]:
# Replace raw time zones by region labels in both frames (in place).
location_fix(train_loc)
location_fix(test)

In [None]:
def preprocessing_num(df):
    """Clean and transform the raw feature columns of ``df`` in place.

    Steps, each applied by assigning the whole column back to ``df`` (chained
    assignment and ``fillna(inplace=True)`` on a column selection are not
    guaranteed to modify the frame):

    * 'Personal URL' -> 1 when a value is present, 0 when it is missing.
    * 'Location Public Visibility' -> lower-cased, with '??' treated as 'enabled'.
    * 'User Language' -> kept for 'en', 'es', 'pt', 'fr'; everything else 'others'.
    * 'Profile Category' -> a single-space value becomes 'unknown'.
    * Count columns -> log10(1 + x).
    * The two 'Avg Daily ...' columns -> log10(1 + x), then missing values are
      filled with the mean of the transformed column of this same frame.
    * 'Profile Cover Image Status' -> missing values become 'Not set'.

    Returns
    -------
    None
    """
    # Converting personal url to binary: 0 for NaN, 1 otherwise
    df['Personal URL'] = df['Personal URL'].notna().astype(int)

    # Converting '??' from the Location Public Visibility to enabled
    visibility = df['Location Public Visibility'].str.lower()
    df['Location Public Visibility'] = visibility.replace('??', 'enabled')

    # These four languages are the most common. Other languages (and missing
    # values) are converted to 'others'
    top_used_lang = ['en', 'es', 'pt', 'fr']
    language = df['User Language']
    df['User Language'] = language.where(language.isin(top_used_lang), 'others')

    # ' ' value in Profile Category column is converted to 'unknown'
    df['Profile Category'] = df['Profile Category'].replace(' ', 'unknown')

    # Log transform of the four count features to reduce their skew
    count_columns = (
        'Num of Followers',
        'Num of People Following',
        'Num of Status Updates',
        'Num of Direct Messages',
    )
    for col in count_columns:
        df[col] = np.log10(1 + df[col])

    # Log transform the two daily-average features, then impute NaN with the
    # mean of the transformed column
    average_columns = (
        'Avg Daily Profile Visit Duration in seconds',
        'Avg Daily Profile Clicks',
    )
    for col in average_columns:
        logged = np.log10(1 + df[col])
        df[col] = logged.fillna(logged.mean())

    # We fill the NaN values in 'Profile Cover Image Status' column by 'Not set'
    df['Profile Cover Image Status'] = df['Profile Cover Image Status'].fillna('Not set')


In [None]:
# Each frame is processed independently and in place, so missing values in the
# test frame are imputed with the test frame's own column means.
preprocessing_num(train_loc)
preprocessing_num(test)

In [None]:
def preprocessing_category(df):
    """Encode the categorical text columns of ``df`` as integer codes.

    Four status/flag columns are first cast to ``str`` on ``df`` itself (the
    caller's frame is modified by this step); a boolean ``True`` becomes
    ``'True'``, which is the form used by the lookup keys. A recoded copy
    produced by ``DataFrame.replace`` is then returned, so the caller must
    keep the return value.

    Values that have no entry in the lookup tables are left unchanged.
    """
    # Cast the status/flag columns to text so they can match the string keys below
    for col in ('Profile Cover Image Status',
                'Profile Verification Status',
                'Is Profile View Size Customized?',
                'Location Public Visibility'):
        df[col] = df[col].astype(str)

    # Per-column lookup tables: text value -> numeric code
    encodings = {
        "Personal URL": {"0": 0, "1": 1},
        "Profile Cover Image Status": {"Not set": 0, "Set": 1},
        "Profile Verification Status": {"Not verified": 0, "Pending": 1, "Verified": 2},
        "Is Profile View Size Customized?": {"False": 0, "True": 1},
        "Location Public Visibility": {'disabled': 0, 'enabled': 1},
        "Profile Category": {'unknown': 0, 'government': 1, "business": 2, 'celebrity': 3},
        "User Time Zone": {'Others': 0, 'Africa': 1, 'Middle East': 2, 'Asia': 3,
                           'Latin America': 4, 'Europe': 5, 'USA': 6},
        'User Language': {'others': 0, 'fr': 2, 'pt': 3, 'es': 4, 'en': 5},
    }

    return df.replace(encodings)


In [None]:
# preprocessing_category returns the encoded frame, so the result is re-assigned.
train_loc=preprocessing_category(train_loc)
test=preprocessing_category(test)

Add extra columns derived from existing features

- We extract the number of months on social media from `Profile Creation Timestamp` and store its log-transform as `MonthsInSocialMedia`.
- A new column, `Months follower ratio`, is `Num of Followers` divided by `MonthsInSocialMedia`.
- A new column, `Months following ratio`, is `Num of People Following` divided by `MonthsInSocialMedia`.
- A new column, `Months status ratio`, is `Num of Status Updates` divided by `MonthsInSocialMedia`.
- A new column, `Months messages ratio`, is `Num of Direct Messages` divided by `MonthsInSocialMedia`.
- A new column, `group_sum`, is the sum of the six log-transformed numerical features, divided by 6 to adjust the scale.
- A new column, `Total Activity`, is the sum of `Num of Status Updates` and `Num of Direct Messages`.
- A new column, `Total clicks from inception`, is `Avg Daily Profile Clicks` multiplied by the number of days in a month (30) and by `MonthsInSocialMedia`.

In [None]:
def new_columns(df, ref_year=2020, ref_month=11):
    """Add the engineered feature columns to ``df`` in place.

    'Profile Creation Timestamp' is parsed to datetimes (and stored back), then
    these columns are appended, in this order:

    * 'MonthsInSocialMedia' - log10(1 + months between creation and the
      reference month).
    * 'Months follower ratio', 'Months following ratio', 'Months status ratio',
      'Months messages ratio' - the respective count column divided by
      'MonthsInSocialMedia'.
    * 'group_sum' - sum of six numeric feature columns divided by 6.
    * 'Total Activity' - status updates plus direct messages.
    * 'Total clicks from inception' - daily clicks * 30 * 'MonthsInSocialMedia'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the timestamp and numeric feature columns; modified in place.
    ref_year, ref_month : int, optional
        Reference date used to measure account age (defaults: November 2020).

    Returns
    -------
    None
    """
    # Convert the time stamp column into the number of months the person has
    # been on social media
    created = pd.to_datetime(df['Profile Creation Timestamp'].astype(str))
    df['Profile Creation Timestamp'] = created
    months = (ref_year - created.dt.year) * 12 + (ref_month - created.dt.month)

    # NOTE(review): a profile created in the reference month gives months == 0,
    # hence log10(1) == 0 and a division by zero in the ratios below.
    df['MonthsInSocialMedia'] = np.log10(1 + months)

    df['Months follower ratio'] = df['Num of Followers'] / df['MonthsInSocialMedia']
    df['Months following ratio'] = df['Num of People Following'] / df['MonthsInSocialMedia']
    df['Months status ratio'] = df['Num of Status Updates'] / df['MonthsInSocialMedia']
    df['Months messages ratio'] = df['Num of Direct Messages'] / df['MonthsInSocialMedia']

    group_col = df[['Num of Followers', 'Num of People Following', 'Num of Status Updates',
                    'Num of Direct Messages', 'Avg Daily Profile Visit Duration in seconds',
                    'Avg Daily Profile Clicks']]
    df['group_sum'] = group_col.sum(axis=1) / 6

    df['Total Activity'] = df['Num of Status Updates'] + df['Num of Direct Messages']
    # Use this frame's own 'MonthsInSocialMedia'. The previous code read the
    # global train_loc here, which fed training values into the test frame.
    df['Total clicks from inception'] = df['Avg Daily Profile Clicks'] * 30 * df['MonthsInSocialMedia']

In [None]:
# Add the engineered columns to both frames (in place).
new_columns(train_loc)
new_columns(test)

We drop the 'Profile Creation Timestamp' because we extracted the useful information from this column and stored it in 'MonthsInSocialMedia' column.

In [None]:

# Remove the raw timestamp column from both frames, in place
for frame in (train_loc, test):
    frame.drop(columns='Profile Creation Timestamp', inplace=True)

Drop Label

In [None]:
# The target must not remain among the training features
train_loc.drop(columns=['Num of Profile Likes'], inplace=True)


In [None]:
# Training data is stored in separate variables for convenience.
train_ax =train_loc.copy()
train_ay = train_y.copy()

In [None]:
# Full training features and target; the target is moved to log10(1 + y) space.
fit_x_all = train_ax
fit_y_all = np.log10(1+train_ay)
# Preprocessed test features to predict on.
pred = test

In [None]:
# Base learners for the stacked ensemble: two RBF support-vector regressors that
# differ only in epsilon, and two gradient-boosted tree models that differ in
# number of trees, depth, L1 penalty and seeds.
svr = SVR(kernel='rbf', epsilon=0.2,C=0.75)

# NOTE(review): both `seed` and `random_state` are passed, with different values;
# confirm which one the installed xgboost version honours.
xgboost = XGBRegressor(learning_rate=0.03,
                       n_estimators=250,
                       max_depth=3,
                       seed=27,
                       alpha=2,
                       random_state=1)

# NOTE(review): same `seed` / `random_state` ambiguity as above.
xgboost2 = XGBRegressor(learning_rate=0.03,
                       n_estimators=200,
                       max_depth=4,
                       seed=400,
                       alpha=0,
                       random_state=34)


svr2 = SVR(kernel='rbf', epsilon=0.1,C=0.75)

In [None]:
# Stack the four base learners; the meta-regressor also receives the original
# features (use_features_in_secondary=True).
# NOTE(review): `xgboost` is passed both as a base regressor and as the
# meta-regressor — presumably mlxtend clones the estimators internally; confirm.
stack = StackingCVRegressor(regressors=(xgboost, xgboost2,svr2, svr),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True,random_state=15)

In [None]:
# Standardise the features, then fit the stacked ensemble on the full training set
stack_te = make_pipeline(StandardScaler(), stack).fit(fit_x_all, fit_y_all)
# Predictions come out in log10(1 + y) space ...
test_predl = stack_te.predict(pred)
# ... so invert the transform and clip negative values to zero
test_pred =(10**test_predl) - 1
test_pred[test_pred < 0] = 0
# np.round replaces np.round_, which was removed in NumPy 2.0
output = np.round(test_pred)

In [None]:
# Write the submission file: a header line followed by one "Id,Predicted" row per
# test sample. The `with` block closes the file even if a write fails.
with open('final_stack_xgboost_svr.csv','w') as sub:
    sub.write('Id,Predicted\n')
    for index, prediction in zip(test_id,output):
        sub.write(str(index) + ',' + str(prediction) + '\n')