In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from xgboost import XGBClassifier
import joblib

In [2]:
aisles_df = pd.read_csv('/kaggle/input/instacart/aisles.csv')
departments_df = pd.read_csv('/kaggle/input/instacart/departments.csv')
# order_products_prior_df = pd.read_csv('order_products__prior.csv.zip')
order_products_train_df = pd.read_csv('/kaggle/input/instacart/order_products__train.csv')
orders_df = pd.read_csv('/kaggle/input/instacart/orders.csv')
products_df = pd.read_csv('/kaggle/input/instacart/products.csv')

In [3]:
df = order_products_train_df.merge(products_df, on='product_id').merge(orders_df, on='order_id').merge(aisles_df, on='aisle_id').merge(departments_df, on='department_id')


In [9]:
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle,department
0,1,49302,1,1,Bulgarian Yogurt,120,16,112108,train,4,4,10,9.0,yogurt,dairy eggs
1,816049,49302,7,1,Bulgarian Yogurt,120,16,47901,train,14,4,6,16.0,yogurt,dairy eggs
2,1242203,49302,1,1,Bulgarian Yogurt,120,16,2993,train,15,0,7,7.0,yogurt,dairy eggs
3,1383349,49302,11,1,Bulgarian Yogurt,120,16,41425,train,4,3,8,14.0,yogurt,dairy eggs
4,1787378,49302,8,0,Bulgarian Yogurt,120,16,187205,train,5,4,14,30.0,yogurt,dairy eggs


In [10]:
df.shape

(1384617, 15)

In [11]:
df = df.drop(['aisle_id','department_id','product_id','eval_set'],axis=1)
df = df.drop_duplicates()

In [12]:
df.shape

(1384617, 11)

In [14]:
df = df.drop(['order_id'],axis=1)

In [15]:
df.describe()

Unnamed: 0,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,1384617.0,1384617.0,1384617.0,1384617.0,1384617.0,1384617.0,1384617.0
mean,8.758044,0.5985944,103112.8,17.09141,2.701392,13.57759,17.06613
std,7.423936,0.4901829,59487.15,16.61404,2.167646,4.238458,10.42642
min,1.0,0.0,1.0,4.0,0.0,0.0,0.0
25%,3.0,0.0,51732.0,6.0,1.0,10.0,7.0
50%,7.0,1.0,102933.0,11.0,3.0,14.0,15.0
75%,12.0,1.0,154959.0,21.0,5.0,17.0,30.0
max,80.0,1.0,206209.0,100.0,6.0,23.0,30.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 10 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   add_to_cart_order       1384617 non-null  int64  
 1   reordered               1384617 non-null  int64  
 2   product_name            1384617 non-null  object 
 3   user_id                 1384617 non-null  int64  
 4   order_number            1384617 non-null  int64  
 5   order_dow               1384617 non-null  int64  
 6   order_hour_of_day       1384617 non-null  int64  
 7   days_since_prior_order  1384617 non-null  float64
 8   aisle                   1384617 non-null  object 
 9   department              1384617 non-null  object 
dtypes: float64(1), int64(6), object(3)
memory usage: 105.6+ MB


In [17]:
# Define the data type mappings for the columns
data_type_mappings = {
    'add_to_cart_order': np.int16,
    'reordered':  np.int8,
    'product_name': str,
    'user_id': int,
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float32,  
    'aisle': str,
    'department': str
}

# Apply the data type mappings to the DataFrame
for column, data_type in data_type_mappings.items():
    df[column] = df[column].astype(data_type)


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 10 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   add_to_cart_order       1384617 non-null  int16  
 1   reordered               1384617 non-null  int8   
 2   product_name            1384617 non-null  object 
 3   user_id                 1384617 non-null  int64  
 4   order_number            1384617 non-null  int16  
 5   order_dow               1384617 non-null  int8   
 6   order_hour_of_day       1384617 non-null  int8   
 7   days_since_prior_order  1384617 non-null  float32
 8   aisle                   1384617 non-null  object 
 9   department              1384617 non-null  object 
dtypes: float32(1), int16(2), int64(1), int8(3), object(3)
memory usage: 56.8+ MB


In [19]:
df.isna().sum()

add_to_cart_order         0
reordered                 0
product_name              0
user_id                   0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
aisle                     0
department                0
dtype: int64

In [20]:
user_features = df.groupby('user_id').agg({
    'order_number': 'mean',
    'days_since_prior_order': 'mean'
}).reset_index()
data = df.merge(user_features, on='user_id', suffixes=('', '_user'))

In [22]:
data.shape

(1384617, 12)

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 12 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   add_to_cart_order            1384617 non-null  int16  
 1   reordered                    1384617 non-null  int8   
 2   product_name                 1384617 non-null  object 
 3   user_id                      1384617 non-null  int64  
 4   order_number                 1384617 non-null  int16  
 5   order_dow                    1384617 non-null  int8   
 6   order_hour_of_day            1384617 non-null  int8   
 7   days_since_prior_order       1384617 non-null  float32
 8   aisle                        1384617 non-null  object 
 9   department                   1384617 non-null  object 
 10  order_number_user            1384617 non-null  float64
 11  days_since_prior_order_user  1384617 non-null  float32
dtypes: float32(2), float64(1), int16(2), int64

In [141]:
df.head()

Unnamed: 0,add_to_cart_order,reordered,product_name,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle,department
0,1,1,Bulgarian Yogurt,112108,4,4,10,9.0,yogurt,dairy eggs
1,7,1,Bulgarian Yogurt,47901,14,4,6,16.0,yogurt,dairy eggs
2,1,1,Bulgarian Yogurt,2993,15,0,7,7.0,yogurt,dairy eggs
3,11,1,Bulgarian Yogurt,41425,4,3,8,14.0,yogurt,dairy eggs
4,8,0,Bulgarian Yogurt,187205,5,4,14,30.0,yogurt,dairy eggs


### ReOrder from the store Model

In [66]:
# Create train and test datasets
X = data[['order_number', 'days_since_prior_order', 'order_dow', 'order_hour_of_day', 'aisle', 'department', 'product_name']]
y = data['reordered']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Encode aisle, department, and product_name using Target Encoding after splitting
encoder = TargetEncoder(cols=['aisle','department','product_name'])
X_train[['aisle','department','product_name']] = encoder.fit_transform(X_train[['aisle','department','product_name']], y_train)
X_test[['aisle','department','product_name']] = encoder.transform(X_test[['aisle','department','product_name']])



In [61]:
X_train.head()

Unnamed: 0,order_number,days_since_prior_order,order_dow,order_hour_of_day,aisle,department,product_name
525159,18,10.0,1,17,0.791408,0.675031,0.870798
421659,20,17.0,4,9,0.737746,0.665112,0.88591
1300544,8,17.0,0,10,0.400649,0.425116,0.480045
314776,4,21.0,0,14,0.529728,0.659285,0.512561
749153,8,30.0,0,20,0.737746,0.665112,0.693274


In [62]:
X_test.head()

Unnamed: 0,order_number,days_since_prior_order,order_dow,order_hour_of_day,aisle,department,product_name
619147,15,6.0,2,14,0.606062,0.665112,0.553846
1208847,8,9.0,6,21,0.656726,0.665112,0.526971
42965,39,6.0,6,9,0.584979,0.56985,0.628116
760387,5,7.0,4,19,0.737746,0.665112,0.792741
416646,24,14.0,0,13,0.606062,0.665112,0.567029


In [67]:
# Define the model hyperparameters
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


model_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 4,
    'n_jobs': -1,
    'eta': 0.2,
    'early_stopping_rounds': 100
}

# Define the number of folds    'verbosity': 0,

n_splits = 5

# Initialize the StratifiedKFold object
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize the lists to store the ROC AUC scores for each fold
roc_auc_scores = []

# Loop over each fold
for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    # Get the training and validation data for this fold
    X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_valid_fold, y_valid_fold = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # Train the XGBoost model on the training data for this fold
    xgb_model = XGBClassifier(**model_params)
    xgb_model.fit(X_train_fold, y_train_fold, eval_set=[(X_valid_fold, y_valid_fold)], verbose=0)

    # Make predictions on the validation data for this fold
    y_pred = xgb_model.predict_proba(X_valid_fold)[:, 1]

    # Calculate the ROC AUC score for this fold
    roc_auc = roc_auc_score(y_valid_fold, y_pred)
    roc_auc_scores.append(roc_auc)

    # Print the ROC AUC score for this fold
    print(f'Fold {fold}: ROC AUC = {roc_auc}')

# Calculate and print the mean ROC AUC score across all folds
mean_roc_auc = sum(roc_auc_scores) / n_splits
print(f'Mean ROC AUC = {mean_roc_auc}')

Fold 0: ROC AUC = 0.7574679009123141
Fold 1: ROC AUC = 0.7559416356247333
Fold 2: ROC AUC = 0.7573801286793242
Fold 3: ROC AUC = 0.7596700975039944
Fold 4: ROC AUC = 0.755928555723051
Mean ROC AUC = 0.7572776636886834


In [68]:
joblib.dump(xgb_model,'xgb_model.pkl')

['xgb_model.pkl']

## Same Product ReOrdered

In [74]:
# Create train and test datasets
X = data[['user_id', 'product_name', 'order_number', 'days_since_prior_order', 'order_dow', 'order_hour_of_day']]
y = data['reordered']  # Now, 'reordered' is whether the user reordered a specific product
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
encoder2 = TargetEncoder(cols=['product_name'])
X_train[['product_name']] = encoder2.fit_transform(X_train[['product_name']], y_train)
X_test[['product_name']] = encoder2.transform(X_test[['product_name']])


In [78]:
model_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 4,
    'n_jobs': -1,
    'eta': 0.2,
    'early_stopping_rounds': 100
}

# Define the number of folds    'verbosity': 0,

n_splits = 5

# Initialize the StratifiedKFold object
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize the lists to store the ROC AUC scores for each fold
roc_auc_scores = []

# Loop over each fold
for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    # Get the training and validation data for this fold
    X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_valid_fold, y_valid_fold = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # Train the XGBoost model on the training data for this fold
    same_prod_model = XGBClassifier(**model_params)
    same_prod_model.fit(X_train_fold, y_train_fold, eval_set=[(X_valid_fold, y_valid_fold)], verbose=0)

    # Make predictions on the validation data for this fold
    y_pred = same_prod_model.predict_proba(X_valid_fold)[:, 1]

    # Calculate the ROC AUC score for this fold
    roc_auc = roc_auc_score(y_valid_fold, y_pred)
    roc_auc_scores.append(roc_auc)

    # Print the ROC AUC score for this fold
    print(f'Fold {fold}: ROC AUC = {roc_auc}')

# Calculate and print the mean ROC AUC score across all folds
mean_roc_auc = sum(roc_auc_scores) / n_splits
print(f'Mean ROC AUC = {mean_roc_auc}')

Fold 0: ROC AUC = 0.7555337260218458
Fold 1: ROC AUC = 0.7576412763894531
Fold 2: ROC AUC = 0.7597318906698497
Fold 3: ROC AUC = 0.7563830262633131
Fold 4: ROC AUC = 0.7569442960094304
Mean ROC AUC = 0.7572468430707783


In [79]:
joblib.dump(same_prod_model,'same_prod_model.pkl')

['same_prod_model.pkl']

In [130]:
joblib.dump(encoder,'target_encoder.pkl')

['target_encoder.pkl']

In [136]:
test_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   add_to_cart_order       1 non-null      int64  
 1   product_name            1 non-null      float64
 2   user_id                 1 non-null      int64  
 3   order_number            1 non-null      int64  
 4   order_dow               1 non-null      int64  
 5   order_hour_of_day       1 non-null      int64  
 6   days_since_prior_order  1 non-null      float64
 7   aisle                   1 non-null      float64
 8   department              1 non-null      float64
dtypes: float64(4), int64(5)
memory usage: 200.0 bytes


In [129]:
# cols1 = ['order_number', 'days_since_prior_order', 'order_dow', 'order_hour_of_day', 'aisle', 'department', 'product_name']
test_input = pd.DataFrame(data= filtered_df,index=range(0,1))

xgb_model = joblib.load('/kaggle/working/xgb_model.pkl')
same_prod_model = joblib.load('/kaggle/working/same_prod_model.pkl')
# reorder model
def predict(test_input):
    test_input[['product_name','aisle','department']] = encoder.transform(test_input[['product_name','aisle','department']])
    input1=test_input[['order_number', 'days_since_prior_order', 'order_dow', 'order_hour_of_day', 'aisle', 'department', 'product_name']]
    reorder_preds = xgb_model.predict(input1).item()
    
    input2 = test_input[['user_id', 'product_name', 'order_number', 'days_since_prior_order', 'order_dow', 'order_hour_of_day']]
    same_preds = np.max(same_prod_model.predict_proba(input2))
    
    return reorder_preds,same_preds

reorder_predis, same_preds = predict(test_input)
print(reorder_predis, same_preds)

1 0.9210505


In [140]:
test_input

Unnamed: 0,add_to_cart_order,product_name,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle,department
0,2,0.662688,199120,49,3,20,7.0,0.687523,0.675945


In [106]:
filtered_df = df[(df['user_id'] == 199120) & (df['add_to_cart_order'] == 2)].to_dict(orient='records')[0]



{'add_to_cart_order': 2,
 'reordered': 1,
 'product_name': 'Bulgarian Yogurt',
 'user_id': 199120,
 'order_number': 49,
 'order_dow': 3,
 'order_hour_of_day': 20,
 'days_since_prior_order': 7.0,
 'aisle': 'yogurt',
 'department': 'dairy eggs'}

In [107]:
del filtered_df['reordered']


In [132]:
print(joblib.__version__)
print(np.__version__)
print(pd.__version__)

1.3.2
1.23.5
2.0.3


In [135]:
filtered_df.info

AttributeError: 'dict' object has no attribute 'info'

In [139]:
import xgboost
xgboost.__version__

'1.7.6'

In [144]:
data.head()

Unnamed: 0,add_to_cart_order,reordered,product_name,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle,department,order_number_user,days_since_prior_order_user
0,1,1,Bulgarian Yogurt,112108,4,4,10,9.0,yogurt,dairy eggs,4.0,9.0
1,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,112108,4,4,10,9.0,other creams cheeses,dairy eggs,4.0,9.0
2,8,1,Organic Whole String Cheese,112108,4,4,10,9.0,packaged cheese,dairy eggs,4.0,9.0
3,3,0,Organic Celery Hearts,112108,4,4,10,9.0,fresh vegetables,produce,4.0,9.0
4,4,0,Cucumber Kirby,112108,4,4,10,9.0,fresh vegetables,produce,4.0,9.0


In [143]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [None]:
# # Load your data (sample data provided)
# data = pd.DataFrame({
#     'user_id': [112108, 47901],
#     'product_name': ['Bulgarian Yogurt', 'Bulgarian Yogurt']
## Create a user-product matrix with user interactions (e.g., 1 for purchased, 0 for not)

data = df[['user_id','product_name']]
user_product_matrix = data.pivot_table(index='user_id', columns='product_name', aggfunc='size', fill_value=0)

# # Calculate the cosine similarity between products
# product_similarity = cosine_similarity(user_product_matrix.T)

# # Recommend products for a given user based on product similarity
# def recommend_products(user_id, user_product_matrix, product_similarity, num_recommendations=5):
#     user_products = user_product_matrix.loc[user_id]
#     recommended_products = []

#     for product_name in user_products.index:
#         if user_products[product_name] == 1:
#             # Find similar products
#             similar_products = product_similarity[data['product_name'] == product_name]
#             similar_products = list(enumerate(similar_products[0]))

#             # Sort products by similarity in descending order
#             similar_products = sorted(similar_products, key=lambda x: x[1], reverse=True)[1:]

#             # Extract product names from the indices
#             recommended_products.extend([data['product_name'][i[0]] for i in similar_products[:num_recommendations]])

#     return recommended_products[:num_recommendations]

# # Example usage
# user_id_to_recommend = 112108
# recommendations = recommend_products(user_id_to_recommend, user_product_matrix, product_similarity)







  user_product_matrix = data.pivot_table(index='user_id', columns='product_name', aggfunc='size', fill_value=0)


In [6]:
def recommend_products_for_user(user_id, num_recommendations=5):
    user_data = df[df['user_id'] == user_id]
    user_products = user_data['product_name'].unique()

    # Calculate product scores based on user behavior or other criteria
    # In this simplified example, we recommend the most popular products
    product_scores = df['product_name'].value_counts().reset_index()
    product_scores.columns = ['product_name', 'popularity']

    # Filter out products the user has already purchased
    product_scores = product_scores[~product_scores['product_name'].isin(user_products)]

    # Sort products by popularity and recommend the top N products
    recommended_products = product_scores.nlargest(num_recommendations, 'popularity')['product_name']

    return recommended_products.tolist()

# Usage example:
user_id_to_recommend = 199120
recommended_products = recommend_products_for_user(user_id_to_recommend)
print(f"Recommended products for user {user_id_to_recommend}: {recommended_products}")


Recommended products for user 199120: ['Banana', 'Organic Strawberries', 'Organic Baby Spinach', 'Large Lemon', 'Organic Avocado']


In [7]:
df[['user_id','product_name']].to_csv('recommend_products.csv',index=False)