In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd

In [3]:
train_path = '/kaggle/input/adobetraindata/behaviour_simulation_train.csv'
test_path = '/kaggle/input/inter-iit-mid-prep-adobe/problem_1_test_dataset/behaviour_simulation_test_company.xlsx'

In [4]:
test_dataset = pd.read_excel(test_path)

In [5]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                10000 non-null  int64 
 1   date              10000 non-null  object
 2   content           10000 non-null  object
 3   username          10000 non-null  object
 4   media             10000 non-null  object
 5   inferred company  10000 non-null  object
dtypes: int64(1), object(5)
memory usage: 468.9+ KB


In [6]:
train_dataset = pd.read_csv(train_path)

In [7]:
# train_data['likes_binned'].value_counts()

In [8]:
# np.unique(y_binned,return_counts=True)

In [9]:
# le.classes_

In [10]:
# np.argmax(train_data['likes_binned'].isna())

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
import datetime

# Custom transformer for time features
class TimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        X_['date'] = pd.to_datetime(X_['date'])
        X_['hour'] = X_['date'].dt.hour
        X_['day_of_week'] = X_['date'].dt.dayofweek
        X_['month'] = X_['date'].dt.month
        X_['is_weekend'] = X_['day_of_week'].isin([5, 6]).astype(int)
        return X_.drop('date', axis=1)

# Load data
train_data = train_dataset.copy()
test_data = test_dataset.copy()

# Separate features and target
X = train_data.drop('likes', axis=1)
y = train_data['likes']

# Define categorical and numerical columns
cat_cols = ['inferred company', 'username']
num_cols = ['hour', 'day_of_week', 'month', 'is_weekend']  # These will be created by TimeFeatureExtractor

X = TimeFeatureExtractor().transform(X)
# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
#         ('time', TimeFeatureExtractor(), ['date']),
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])

import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Log transformation for target variable
def log_transform(x):
    return np.log1p(x)

def inverse_log_transform(x):
    return np.expm1(x)

# Create pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42))
])

# Split data for validation (stratify by binned likes to maintain distribution)
from sklearn.preprocessing import LabelEncoder

# Define custom bins
bins = [0, 100, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 600000]
labels = ['0-100', '101-500', '501-1k', '1k-2k', '2k-5k', '5k-10k', '10k-20k', '20k-50k', '50k-100k', '100k+']

# Bin the likes into categories
train_data['likes_binned'] = pd.cut(train_data['likes'], bins=bins, labels=labels, include_lowest=True)

# Convert bin labels to numeric categories
le = LabelEncoder()
y_binned = le.fit_transform(train_data['likes_binned'])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y_binned)

# Mark unseen companies in test data
def handle_unknown_companies(X, train_companies):
    X_ = X.copy()
    X_['inferred company'] = X_['inferred company'].apply(lambda x: x if x in train_companies else 'unknown_company')
    return X_['inferred company']

# Extract companies seen in training
train_companies = X_train['inferred company'].unique()

# Modify train-test split section to handle this:
X_train['inferred company'] = handle_unknown_companies(X_train, train_companies)
X_val['inferred company'] = handle_unknown_companies(X_val, train_companies)
test_data['inferred company'] = handle_unknown_companies(test_data, train_companies)

# Log transform the target variable
y_train_log = log_transform(y_train)
y_val_log = log_transform(y_val)

In [12]:
preprocessor.fit(X_train[num_cols+cat_cols])

In [13]:
X_train = preprocessor.transform(X_train[num_cols+cat_cols])
X_val = preprocessor.transform(X_val[num_cols+cat_cols])

In [14]:
# XGBoost model with evaluation metrics and early stopping
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,  # Higher number of trees
    max_depth=15,        # Deep trees for better learning
    learning_rate=0.05, # Smaller learning rate
    subsample=0.8,      # Subsampling for robust training
    colsample_bytree=0.8,  # Feature subsampling
    random_state=42
)

# Fit the model with early stopping on validation data
xgb_model.fit(
    X_train, y_train_log,
    eval_set=[(X_train, y_train_log), (X_val, y_val_log)],
    eval_metric="rmse",
    early_stopping_rounds=50,  # Stop if no improvement after 50 rounds
    verbose=True  # Show progress
)

# Make predictions and transform back to original scale
train_predictions_log = xgb_model.predict(X_train)
val_predictions_log = xgb_model.predict(X_val)

# Inverse log transform to get actual predictions
train_predictions = inverse_log_transform(train_predictions_log)
val_predictions = inverse_log_transform(val_predictions_log)

# Calculate metrics
train_mae = mean_absolute_error(y_train, train_predictions)
val_mae = mean_absolute_error(y_val, val_predictions)

train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))

print(f"Train MAE: {train_mae}, Validation MAE: {val_mae}")
print(f"Train RMSE: {train_rmse}, Validation RMSE: {val_rmse}")



[0]	validation_0-rmse:2.55417	validation_1-rmse:2.55184
[1]	validation_0-rmse:2.46814	validation_1-rmse:2.46608
[2]	validation_0-rmse:2.39369	validation_1-rmse:2.39158
[3]	validation_0-rmse:2.32148	validation_1-rmse:2.31924
[4]	validation_0-rmse:2.24804	validation_1-rmse:2.24587
[5]	validation_0-rmse:2.18817	validation_1-rmse:2.18582
[6]	validation_0-rmse:2.12814	validation_1-rmse:2.12576
[7]	validation_0-rmse:2.06708	validation_1-rmse:2.06468
[8]	validation_0-rmse:2.01227	validation_1-rmse:2.00987
[9]	validation_0-rmse:1.96846	validation_1-rmse:1.96564
[10]	validation_0-rmse:1.91804	validation_1-rmse:1.91506
[11]	validation_0-rmse:1.87865	validation_1-rmse:1.87559
[12]	validation_0-rmse:1.83294	validation_1-rmse:1.83001
[13]	validation_0-rmse:1.79389	validation_1-rmse:1.79079
[14]	validation_0-rmse:1.76038	validation_1-rmse:1.75705
[15]	validation_0-rmse:1.72640	validation_1-rmse:1.72336
[16]	validation_0-rmse:1.69228	validation_1-rmse:1.68926
[17]	validation_0-rmse:1.66221	validation

In [15]:
list(zip(train_predictions, y_train))

[(6.4733634, 3),
 (134.62898, 117),
 (7.693845, 10),
 (77.36346, 45),
 (26.450356, 20),
 (23.084866, 24),
 (0.16473344, 0),
 (0.17782705, 0),
 (0.2513021, 0),
 (0.020762889, 0),
 (132.6021, 89),
 (586.46735, 367),
 (320.93085, 248),
 (2.44558, 4),
 (596.7435, 789),
 (30.100367, 5241),
 (1.541543, 4),
 (50.41202, 56),
 (140.67093, 165),
 (278.9483, 185),
 (0.20806684, 0),
 (2.5373514, 2),
 (4.7010245, 4),
 (162.51788, 89),
 (92.41346, 81),
 (1029.2092, 1168),
 (0.21383555, 0),
 (1226.8353, 553),
 (104.319305, 103),
 (307.9415, 281),
 (1.4631144, 1),
 (6.412015, 12),
 (0.17236243, 0),
 (2.7330441, 1),
 (1.1059916, 1),
 (2790.8657, 5182),
 (0.013873989, 0),
 (121.14958, 52),
 (323.22473, 1287),
 (391.38455, 342),
 (0.22350517, 0),
 (82.01055, 51),
 (162.22179, 175),
 (2.459758, 5),
 (31.023897, 23),
 (0.058967154, 0),
 (82.37821, 99),
 (1.2222284, 0),
 (471.82382, 729),
 (0.51742476, 0),
 (166.97739, 151),
 (7.2784286, 1),
 (1642.4186, 923),
 (115.25516, 43),
 (102.25157, 130),
 (99.64461

In [16]:
# np.unique(y_binned,return_counts=True)

In [17]:
# X_train

In [18]:
# # Fit the model
# model.fit(X_train, y_train_log)

# # Evaluate the model
# val_predictions_log = model.predict(X_val)
# val_predictions = inverse_log_transform(val_predictions_log)

# mae = mean_absolute_error(y_val, val_predictions)
# rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
# print(f"Validation MAE: {mae}")
# print(f"Validation RMSE: {rmse}")

# # Evaluate the model
# train_predictions_log = model.predict(X_train)
# train_predictions = inverse_log_transform(train_predictions_log)

# mae = mean_absolute_error(y_train, train_predictions)
# rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
# print(f"Training MAE: {mae}")
# print(f"Training RMSE: {rmse}")

# # # Predict on test set
# # test_predictions_log = model.predict(test_data)
# # test_predictions = inverse_log_transform(test_predictions_log)

# # # Clip predictions to valid range
# # test_predictions = np.clip(test_predictions, 0, 500000)

# # # Save predictions
# # test_data['predicted_likes'] = test_predictions
# # test_data.to_csv('predictions.csv', index=False)

In [19]:
# train_predictions.max(),y_train.max()