<a href="https://colab.research.google.com/github/MrJangoBox/Team-13-Padelol-Kaggle-IFT-6578/blob/main/Public_Score_1_66395_IFT_6578_Kaggle_Before_Shift.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import gdown
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import QuantileTransformer, PowerTransformer, FunctionTransformer
from sklearn.svm import SVR

### Load data

In [None]:
# Load data
if 'google.colab' in str(get_ipython()):
    # On Colab, download both CSVs from Google Drive into the working directory.
    files = [
        ('test.csv',
         'https://drive.google.com/uc?id=1gu6DXcEr4agiGttFj2ubToSICl5NDGKw'),
        ('train.csv',
         'https://drive.google.com/uc?id=1x-Baw1riR-7wKZTpAvXe9RCCC0zu1sXv'),
    ]

    for filename, url in files:
        gdown.download(url, filename, quiet=True)

    train_data = pd.read_csv('train.csv')
    test_data = pd.read_csv('test.csv')
else:
    # Otherwise read the CSVs from the parent directory.
    train_data = pd.read_csv('../train.csv')
    test_data = pd.read_csv('../test.csv')

In [None]:
# Rename columns: lower-case, spaces -> underscores, '?' removed.
# regex=False treats ' ' and '?' as literal text. The default of `regex`
# differs between pandas versions, and a bare '?' is not a valid regular
# expression, so the literal behaviour is requested explicitly.
train_cols = (train_data.columns
              .str.lower()
              .str.replace(' ', '_', regex=False)
              .str.replace('?', '', regex=False))
# The test frame receives the train names minus the last one, so this relies
# on the test columns being in the same order as the train columns with only
# the last train column absent.
test_cols = train_cols[:-1]
train_data.columns = train_cols
test_data.columns = test_cols
# Split train dataset to train and validation.
# Every column but the last is a feature; the last column is the target.
X_train, X_valid, y_train, y_valid = train_test_split(train_data.iloc[:, :-1],
                                                      train_data.iloc[:, -1],
                                                      test_size=0.3,
                                                      shuffle=True,
                                                      random_state=76)

### Loss

In [None]:
def rmsle(y_true, y_pred):
    ''' root mean squared logarithmic error: sqrt of mean_squared_log_error '''
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

### Reproduce initial results

In [None]:
# Baseline: ordinary least squares on a single feature, scored on the
# validation split.
subset_features = ['num_of_status_updates']
x_train, x_valid = X_train[subset_features], X_valid[subset_features]
regr = LinearRegression()
regr.fit(x_train, y_train)
predictions = regr.predict(x_valid)
# The printed value is computed directly; the displayed value comes from the
# rmsle helper.
print(np.sqrt(mean_squared_log_error(y_valid, predictions)))
rmsle(y_valid, predictions)

3.189712405243282


3.189712405243282

### Column transformers

## Numeric Features

In [None]:
# Columns routed through the numeric pipeline.
numeric_features = [
    'num_of_followers',
    'num_of_people_following',
    'num_of_status_updates',
    'num_of_direct_messages',
    'avg_daily_profile_visit_duration_in_seconds',
    'avg_daily_profile_clicks',
]

# Fill missing values with the column mean, apply log1p, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('log', FunctionTransformer(np.log1p)),
    ('scaler', StandardScaler()),
])

## Categorical Features

In [None]:
# Columns routed through the generic categorical pipeline.
categorical_features = ['user_language', 'user_time_zone']

# Missing values become the literal category 'missing'; categories not seen
# during fit are ignored by the encoder instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
def clean_profile_category(df):
    ''' replace blank (empty or whitespace-only) values with "unknown"

    Every column of df is cast to str first. Only values that consist of
    nothing but whitespace are replaced; spaces inside other values are left
    untouched. A new DataFrame is returned and df is not modified.
    '''
    def _blank_to_unknown(column):
        text = column.astype(str)
        # Compare the stripped value so ' ' and '  ' map to the same category.
        return text.mask(text.str.strip() == '', 'unknown')

    return df.apply(_blank_to_unknown)

In [None]:
# Profile category: normalise values with clean_profile_category, then
# one-hot encode (categories unseen during fit are ignored).
category = ['profile_category']
category_transformer = Pipeline([
    ('clean', FunctionTransformer(clean_profile_category)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
def lower_location_visibility(df):
    ''' cast every column of df to str and lowercase it; returns a new frame '''
    def _to_lower(column):
        return column.astype(str).str.lower()

    return df.apply(_to_lower)

In [None]:
# Location visibility: lowercase the values first, then one-hot encode
# (categories unseen during fit are ignored).
loc_visib = ['location_public_visibility']
loc_visib_transformer = Pipeline([
    ('lower', FunctionTransformer(lower_location_visibility)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

## Boolean Features

In [None]:
# Columns routed through the boolean pipeline.
boolean_features = [
    'is_profile_view_size_customized',
    'profile_cover_image_status',
    'profile_verification_status',
]

# Missing values become the literal category 'Not set' before one-hot
# encoding; categories unseen during fit are ignored.
boolean_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Not set')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Personal URL: keep only whether a value is present (pd.notna), not the URL.
personal_url = ['personal_url']
personal_url_transformer = Pipeline([
    ('not_na', FunctionTransformer(pd.notna)),
])

## Time Feature

In [None]:
def date2int(df):
    ''' convert the profile creation timestamp to an integer column

    df must contain a 'profile_creation_timestamp' column that
    pd.to_datetime can parse. Returns a one-column DataFrame with the same
    column name and index, holding the int64 representation of each
    timestamp (an offset from the Unix epoch in the resolution of the parsed
    datetimes; presumably nanoseconds, to be confirmed for the pandas
    version in use).
    '''
    timestamps = pd.to_datetime(df['profile_creation_timestamp'])
    # Ask for 'int64' explicitly: plain `int` can resolve to a 32-bit integer
    # on some platforms (e.g. Windows with NumPy < 2), which cannot hold
    # these values.
    return pd.DataFrame(timestamps.astype('int64'))

In [None]:
# Creation timestamp: convert to an integer with date2int, compress with
# log1p, then rescale with MinMaxScaler.
time = ['profile_creation_timestamp']
time_transformer = Pipeline([
    ('days', FunctionTransformer(date2int)),
    ('log', FunctionTransformer(np.log1p)),
    ('scaler', MinMaxScaler()),
])

## Combine Transformers

In [None]:
# Route each group of columns to its own pipeline. Each entry is
# (name, transformer, columns); columns not listed in any entry are dropped.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('bool', boolean_transformer, boolean_features),
    ('user_cat', category_transformer, category),
    ('loc', loc_visib_transformer, loc_visib),
    ('url', personal_url_transformer, personal_url),
    ('time', time_transformer, time),
])

### Transform target and couple it with a regressor

In [None]:
# Map the target to a normal distribution before fitting; predictions are
# mapped back by TransformedTargetRegressor.
# random_state is fixed (same seed as the train/validation split) so that any
# subsampling QuantileTransformer performs on large inputs is reproducible.
transformer = QuantileTransformer(output_distribution='normal',
                                  random_state=76)
# Note: a plain LinearRegression() in place of the SVR was recorded at
# "Validation: 2.181173029294426, 30%" (presumably the 30% validation split).
regressor = SVR(C=0.3)
regr = TransformedTargetRegressor(regressor=regressor,
                                  transformer=transformer)

### Use a pipeline

In [None]:
# Chain preprocessing and the target-transformed regressor, fit on the
# training split, and score both splits with RMSLE.
estimators = [('preprocessor', preprocessor), ('regressor', regr)]
pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)

predictions = pipe.predict(X_valid)
rmsle_train = rmsle(y_train, pipe.predict(X_train))
rmsle_valid = rmsle(y_valid, predictions)

print(f'Train error: {rmsle_train:.3f}')
print(f'Valid error: {rmsle_valid:.3f}')

Train error: 1.602
Valid error: 1.762


# Train on whole data

In [None]:
# Put the training and validation splits back together and refit the same
# pipeline on all labelled rows before predicting the test set.
X_train_whole = pd.concat([X_train, X_valid])
y_train_whole = pd.concat([y_train, y_valid])
pred_submission = pipe.fit(X_train_whole, y_train_whole).predict(test_data)
pred_submission

array([3770.06561117, 2497.72694285,  176.03515242, ..., 5957.59277906,
       1054.44735892,  230.00801223])

# Save Predictions

In [None]:
# save the submission file
# Predictions are rounded down to whole numbers before being written.
predicted_counts = np.floor(pred_submission).astype('int')
output = pd.DataFrame({'Id': test_data['id'],
                       'Predicted': predicted_counts})
output.to_csv('submission.csv', index=False)

In [None]:
# Display the pipeline's parameters, including those of the nested steps.
pipe.get_params()

{'memory': None,
 'preprocessor': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('num',
                                  Pipeline(memory=None,
                                           steps=[('imputer',
                                                   SimpleImputer(add_indicator=False,
                                                                 copy=True,
                                                                 fill_value=None,
                                                                 missing_values=nan,
                                                                 strategy='mean',
                                                                 verbose=0)),
                                                  ('log',
                                                   FunctionTransformer(accept_sparse=False,
                                                    