In [96]:
import numpy as np 
import pandas as pd 

In [97]:
# Load the labelled training data and the unlabelled test data from the local data folder
train = pd.read_csv('data/playground_train.csv')
test = pd.read_csv('data/playground_test.csv')

In [98]:
# Show the last rows of the training frame to check its columns and where the date range ends
train.tail()

Unnamed: 0,row_id,date,country,store,product,num_sold
70123,70123,2020-12-31,Spain,KaggleMart,Kaggle for Kids: One Smart Goose,614
70124,70124,2020-12-31,Spain,KaggleRama,Kaggle Advanced Techniques,215
70125,70125,2020-12-31,Spain,KaggleRama,Kaggle Getting Started,158
70126,70126,2020-12-31,Spain,KaggleRama,Kaggle Recipe Book,135
70127,70127,2020-12-31,Spain,KaggleRama,Kaggle for Kids: One Smart Goose,202


In [99]:
# Same check for the test frame: it has the same columns as train except the num_sold target
test.tail()

Unnamed: 0,row_id,date,country,store,product
17515,87643,2021-12-31,Spain,KaggleMart,Kaggle for Kids: One Smart Goose
17516,87644,2021-12-31,Spain,KaggleRama,Kaggle Advanced Techniques
17517,87645,2021-12-31,Spain,KaggleRama,Kaggle Getting Started
17518,87646,2021-12-31,Spain,KaggleRama,Kaggle Recipe Book
17519,87647,2021-12-31,Spain,KaggleRama,Kaggle for Kids: One Smart Goose


In [100]:
# Print dtypes and non-null counts per column; note that `date` is read in as object (string)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70128 entries, 0 to 70127
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   row_id    70128 non-null  int64 
 1   date      70128 non-null  object
 2   country   70128 non-null  object
 3   store     70128 non-null  object
 4   product   70128 non-null  object
 5   num_sold  70128 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 3.2+ MB


In [101]:
# Number of missing values in each training column
train.isna().sum()

row_id      0
date        0
country     0
store       0
product     0
num_sold    0
dtype: int64

# ***Building a Pipeline***

In [102]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor


In [103]:
# Strip the dashes from the ISO dates ('2020-12-31' -> '20201231') with the
# vectorised string accessor instead of a per-element lambda.
# The values are still strings (object dtype) after this step.
train['date'] = train['date'].str.replace('-', '', regex=False)
test['date'] = test['date'].str.replace('-', '', regex=False)

# row_id is not one of the model features, so remove it from the test frame.
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError once the column is already gone.
test = test.drop(columns='row_id', errors='ignore')
test.head()

Unnamed: 0,date,country,store,product
0,20210101,Belgium,KaggleMart,Kaggle Advanced Techniques
1,20210101,Belgium,KaggleMart,Kaggle Getting Started
2,20210101,Belgium,KaggleMart,Kaggle Recipe Book
3,20210101,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose
4,20210101,Belgium,KaggleRama,Kaggle Advanced Techniques


In [104]:
# Model inputs and the regression target
features = ['date', 'country', 'store', 'product']
y = train['num_sold']
X = train[features]

# Hold out part of the labelled data; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123
)



In [105]:
# Low-cardinality text columns (fewer than 10 distinct values) are treated as categorical
categorical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype == 'object' and X_train[cname].nunique() < 10
]

# Columns that are already stored as numbers
numerical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype in ['int64', 'float64']
]

# 'date' is an object column that fails the cardinality check above, so neither
# rule picks it up and it is added to the numeric group by hand. The guard
# avoids listing it twice should the column ever be numeric already.
if 'date' not in numerical_cols:
    numerical_cols.append('date')

print(f'Categorical columns: {categorical_cols}\n Numerical Columns:{numerical_cols}')

Categorical columns: ['country', 'store', 'product']
 Numerical Columns:['date']


In [106]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric branch: constant-fill imputation for any missing values
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical branch: fill gaps with the most frequent value, then one-hot encode;
# categories not seen during fit are ignored instead of raising an error
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Regressor created with the library's default hyperparameters
model = XGBRegressor()

In [107]:
# Chain preprocessing and the regressor so both are fitted in one call
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                              ])

my_pipeline.fit(X_train, y_train)

# Score on the hold-out split produced by train_test_split. The targets must be
# compared with predictions made for the same rows (X_test <-> y_test), not with
# predictions for the unlabelled competition test set.
val_preds = my_pipeline.predict(X_test)
score = mean_absolute_error(y_test, val_preds)
print('MAE:', score)

# Predictions for the competition test set, restricted to the model's feature columns
preds = my_pipeline.predict(test[features])

# ***Submission***

In [110]:
# Recover the ids that were dropped from `test` before modelling. Only the
# row_id column is read, and it goes into its own variable so the preprocessed
# `test` frame is not overwritten (re-running the prediction cell stays valid).
Id_list = pd.read_csv('data/playground_test.csv', usecols=['row_id'])['row_id']

# Each prediction must line up with exactly one id
assert len(Id_list) == len(preds), 'number of predictions does not match number of test rows'

submission = pd.DataFrame({'row_id': Id_list, 'num_sold': preds})

len(submission)

17520

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV
submission.to_csv("submission.csv", index=False)