In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
data = pd.read_parquet(r'/content/drive/MyDrive/Norvartis Datathon/train_data.parquet')

In [3]:
# Instantiate customized metric for model evaluation
def custom_metric(df, predictions):
    df = df.copy()
    df['prediction'] = predictions
    df["date"] = pd.to_datetime(df["date"])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['quarter'] = df['date'].dt.quarter
    df['sum_pred'] = df.groupby(['year', 'month', 'brand', 'country'])['prediction'].transform(sum)
    df['quarter_w'] = np.where(df['quarter'] == 1, 1, np.where(df['quarter'] == 2, 0.75, np.where(df['quarter'] == 3, 0.66, 0.5)))
    metric = np.sqrt((1 / len(df)) * sum(((df['phase'] - df['prediction'])**2) * df['quarter_w'] * df['monthly']))
    return metric

In [4]:
# Create unknown category to fill missing values with
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    if data[col].dtype.name == 'category':
        data[col] = data[col].cat.add_categories('Desconocido')

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

# Preprocessing of df
data[categorical_cols] = data[categorical_cols].fillna('Desconocido')
data['hospital_rate'] = data['hospital_rate'].fillna(-1)
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns.drop(['phase', 'monthly'])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Split data into features and target
X = data.drop('phase', axis=1)
y = data['phase']

# Split data into test and training data. Keep 2021 as test data
X_train = X[X.date.dt.year != 2021]
X_test = X[X.date.dt.year == 2021]
y_train = y[X_train.index]
y_test = y[X_test.index]

# Create training sets with and without monthly since its not used by the model but it will be used by the metric
train_monthly = X_train['monthly']
X_train = X_train.drop(['monthly'], axis=1)

# Instantiante voting regressor model using gradient boos and linear regression
ensemble_model = VotingRegressor(estimators=[('gb', GradientBoostingRegressor()), ('lr', LinearRegression())])

# Model to be tested
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "Voting Regressor": ensemble_model
}

# Make sure all needed columns are present
necessary_columns = data[['date', 'brand', 'country', 'monthly']]
X_test_merged = X_test.merge(necessary_columns, left_index=True, right_index=True, how='left')
X_test_with_phase = X_test_merged.assign(phase=y_test)

# Evaluate all models
predictions_compiled = []
for name, model in models.items():
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('feature_selection', SelectKBest(score_func=f_regression, k=4)),
                                     ('regressor', model)])
    model_pipeline.fit(X_train, y_train)
    predictions = model_pipeline.predict(X_test)
    score = custom_metric(X_test_with_phase.assign(date=data['date'], brand=data['brand'], country=data['country'], monthly=data['monthly']), predictions)
    predictions_compiled.append(predictions)
    print(f"The {name} model scored a custom metric of {score}")


The Linear Regression model scored a custom metric of 0.010320370898062717
The Random Forest model scored a custom metric of 0.010324680982136618
The Voting Regressor model scored a custom metric of 0.01030987597740202


In [6]:
# Check pipeline of the last model tested
display(model_pipeline)

In [7]:
# Define function to normalize phase values
def normalize_group(group):
    group_sum = group['phase'].sum()
    group['phase'] = group['phase'] / group_sum
    return group

# Create a copy of the test data for normalized predictions
predictions_nor = X_test.copy()

# Iterate through compiled predictions for different models
for i, pred in enumerate(predictions_compiled):
  # Identify the model for readability in the output
  if i == 0:
    model = "Linear Regression"
  elif i == 1:
    model = "Random Forest"
  else:
    model = "Voting Regressor"

  # Assign predictions to 'predictions_nor' DataFrame
  predictions_nor['phase'] = pred

  # Group by specified columns and apply the normalization function
  predictions_nor = predictions_nor.groupby(['brand', 'country', 'month', predictions_nor.date.dt.year], group_keys=False).apply(normalize_group)

  # Evaluate the custom metric after normalizing phase predictions
  score = custom_metric(X_test_with_phase.assign(date=data['date'], brand=data['brand'], country=data['country'], monthly=data['monthly']), predictions_nor['phase'])

  # Print the evaluation results
  print(f"The {model} model scored a custom metric of {score} after normalizing the phase predictions")


The Linear Regression model scored a custom metric of 0.010319750821393995 after normalizing the phase predictions
The Random Forest model scored a custom metric of 0.010323669312032565 after normalizing the phase predictions
The Voting Regressor model scored a custom metric of 0.010309276560724185 after normalizing the phase predictions


In [8]:
# Separar los datos en características y objetivo
X = data.drop(['phase'], axis=1)
y = data['phase']

# Create a pipeline with preprocessor, feature selection, and Linear Regression
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=4)),
    ('regressor', LinearRegression())
])

# Drop 'monthly' column from features
X_def = X.drop(columns=['monthly'])

# Extract target variable corresponding to the reduced feature set
y_def = y[X_def.index]

# Fit the pipeline to the reduced feature set and target variable
model_pipeline.fit(X_def, y_def)

In [9]:
# Import submission data for predictions
file_path = r'/content/drive/MyDrive/Norvartis Datathon/submission_data.parquet'
predict_data = pd.read_parquet(file_path)

In [10]:
# Fill missing values in data frame
categorical_cols = predict_data.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    if predict_data[col].dtype.name == 'category':
        predict_data[col] = predict_data[col].cat.add_categories('Desconocido')
predict_data[categorical_cols] = predict_data[categorical_cols].fillna('Desconocido')
predict_data['hospital_rate'] = predict_data['hospital_rate'].fillna(-1)

In [11]:
# Use the fitted pipeline to make predictions on new data
predictions = model_pipeline.predict(predict_data)

# Assign the predicted 'phase' values to the 'predict_data' DataFrame
predict_data['phase'] = predictions

# Group the DataFrame by specified columns and calculate the sum of 'phase' within each group
grouped_predictions = predict_data.groupby(['brand', 'country', 'month', predict_data.date.dt.year])['phase'].sum()

In [12]:
def normalize_group(group):
    group_sum = group['phase'].sum()
    group['normalized_phase'] = group['phase'] / group_sum
    return group

# Aplicar la normalización
predict_data = predict_data.groupby(['brand', 'country', 'month', predict_data.date.dt.year]).apply(normalize_group)

# Comprobar las sumas después de la normalización
check_sums = predict_data.groupby(['brand', 'country', 'month', predict_data.date.dt.year])['normalized_phase'].sum()

# Verificar si las sumas son cercanas a 1 con una tolerancia (por ejemplo, 1e-
assert predict_data.groupby(['brand', 'country', 'month', predict_data.date.dt.year])['normalized_phase'].sum().mean() == 1

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  predict_data = predict_data.groupby(['brand', 'country', 'month', predict_data.date.dt.year]).apply(normalize_group)


In [13]:
# Define the file path to the submission template CSV file
file_path = r'/content/drive/MyDrive/Norvartis Datathon/submission_template.csv'

# Read the submission template CSV file into a DataFrame
submission = pd.read_csv(file_path)

# Reset the index of the 'predict_data' DataFrame and sort it by specified columns
predict_data.reset_index(drop=True, inplace=True)
predict_data = predict_data.sort_values(by=['country', 'brand', 'date']).reset_index(drop=True)

# Add a new column 'prediction' to the 'submission' DataFrame and populate it with normalized phase values from 'predict_data'
submission['prediction'] = predict_data['normalized_phase']

# Set file name to save
file_name = "test_submission"

# Save the modified 'submission' DataFrame to a new CSV file
submission.to_csv(f'/content/drive/MyDrive/Norvartis Datathon/GitHub files/submission_files/{file_name}.csv')
