Corporación Favorita — a major Ecuadorian supermarket and retail chain
Headquartered in Quito, operating stores like Supermaxi and Megamaxi nationwide

# Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random

import osmnx as ox
import geopandas as gpd
from geopy.geocoders import Nominatim
from tqdm import tqdm
from geopy.extra.rate_limiter import RateLimiter
from shapely.geometry import Point

from time import sleep

import psycopg2
import warnings
import sqlite3
import sqlalchemy as sa

from sqlalchemy import text
from sqlalchemy import create_engine

import matplotlib.patches as mpatches

import contextily as ctx

import requests
import json
from pathlib import Path
import os, json, time

# opening the data
import re
from datetime import datetime, time

from itertools import combinations
import gc # gc.collect()

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [3]:
import joblib # Save the DataFrame to a compressed file

In [4]:
# from google.colab import files

In [5]:
# !pip install kagglehub
import kagglehub

---------------------------------------------------
---------------------------------------------------

# Data initial review and download

In [6]:
# Download latest version
path = kagglehub.dataset_download("siliconx/favoritagrocerysalesforecastingextracted")

print("Path to dataset files:", path)

Path to dataset files: /Users/georgeshmelin/.cache/kagglehub/datasets/siliconx/favoritagrocerysalesforecastingextracted/versions/1


## Data overview

In [7]:
files = os.listdir(path)
print("Files in dataset:", files)

Files in dataset: ['test.csv', 'train.csv', 'transactions.csv', 'items.csv', 'oil.csv', 'holidays_events.csv', 'sample_submission.csv', 'stores.csv']


In [8]:
def load_csv_to_sqlite(conn, path, filename, preview_rows=50):
    """
    Load up to `preview_rows` from a CSV into SQLite and print summary info.
    Keeps your original structure and logic — just limits rows for inspection.
    """
    file_path = os.path.join(path, filename)
    table_name = filename.replace('.csv', '')

    print(f"\n📂 Loading first {preview_rows} rows from '{filename}' into SQLite...")

    # Read just the first N rows
    df = pd.read_csv(file_path, nrows=preview_rows)

    # Write to SQLite (as a temporary table or small preview)
    df.to_sql(table_name, conn, if_exists="replace", index=False)
    print(f"✅ Loaded preview of '{filename}' into table '{table_name}'")

    # Row count (of the preview)
    query = f"SELECT COUNT(*) AS row_count FROM {table_name};"
    row_count_df = pd.read_sql(query, conn)
    row_count = row_count_df.iloc[0, 0]
    print(f"🔢 Table '{table_name}' currently has {row_count:,} preview rows.")

    # Inspect columns for key info
    columns = df.columns.tolist()
    primary_like = [col for col in columns if 'id' in col.lower() or 'nbr' in col.lower()]
    date_like = [col for col in columns if 'date' in col.lower()]
    possible_keys = primary_like + date_like

    # Show candidate keys
    if possible_keys:
        print(f"🔑 Candidate keys or join fields in '{table_name}': {possible_keys}")
    else:
        print(f"ℹ️ No obvious join keys found in '{table_name}'")

    return df

### oil

In [9]:
# Connect to SQLite
conn = sqlite3.connect("favorita.db")

# Load each CSV file (your KaggleHub path)
load_csv_to_sqlite(conn, path, "oil.csv")


📂 Loading first 50 rows from 'oil.csv' into SQLite...
✅ Loaded preview of 'oil.csv' into table 'oil'
🔢 Table 'oil' currently has 50 preview rows.
🔑 Candidate keys or join fields in 'oil': ['date']


Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2
5,2013-01-08,93.21
6,2013-01-09,93.08
7,2013-01-10,93.81
8,2013-01-11,93.6
9,2013-01-14,94.27


### items

In [10]:
load_csv_to_sqlite(conn, path, "items.csv")


📂 Loading first 50 rows from 'items.csv' into SQLite...
✅ Loaded preview of 'items.csv' into table 'items'
🔢 Table 'items' currently has 50 preview rows.
🔑 Candidate keys or join fields in 'items': ['item_nbr']


Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,1093,0
1,99197,GROCERY I,1067,0
2,103501,CLEANING,3008,0
3,103520,GROCERY I,1028,0
4,103665,BREAD/BAKERY,2712,1
5,105574,GROCERY I,1045,0
6,105575,GROCERY I,1045,0
7,105576,GROCERY I,1045,0
8,105577,GROCERY I,1045,0
9,105693,GROCERY I,1034,0


### holidays_events

In [11]:
load_csv_to_sqlite(conn, path, "holidays_events.csv")


📂 Loading first 50 rows from 'holidays_events.csv' into SQLite...
✅ Loaded preview of 'holidays_events.csv' into table 'holidays_events'
🔢 Table 'holidays_events' currently has 50 preview rows.
🔑 Candidate keys or join fields in 'holidays_events': ['date']


Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
5,2012-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo,False
6,2012-06-23,Holiday,Local,Guaranda,Cantonizacion de Guaranda,False
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False


### stores

In [12]:
load_csv_to_sqlite(conn, path, "stores.csv")


📂 Loading first 50 rows from 'stores.csv' into SQLite...
✅ Loaded preview of 'stores.csv' into table 'stores'
🔢 Table 'stores' currently has 50 preview rows.
🔑 Candidate keys or join fields in 'stores': ['store_nbr']


Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


### transactions

In [13]:
load_csv_to_sqlite(conn, path, "transactions.csv")


📂 Loading first 50 rows from 'transactions.csv' into SQLite...
✅ Loaded preview of 'transactions.csv' into table 'transactions'
🔢 Table 'transactions' currently has 50 preview rows.
🔑 Candidate keys or join fields in 'transactions': ['store_nbr', 'date']


Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903
6,2013-01-02,6,2143
7,2013-01-02,7,1874
8,2013-01-02,8,3250
9,2013-01-02,9,2940


### test

In [14]:
load_csv_to_sqlite(conn, path, "test.csv")


📂 Loading first 50 rows from 'test.csv' into SQLite...
✅ Loaded preview of 'test.csv' into table 'test'
🔢 Table 'test' currently has 50 preview rows.
🔑 Candidate keys or join fields in 'test': ['id', 'store_nbr', 'item_nbr', 'date']


Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion
0,125497040,2017-08-16,1,96995,False
1,125497041,2017-08-16,1,99197,False
2,125497042,2017-08-16,1,103501,False
3,125497043,2017-08-16,1,103520,False
4,125497044,2017-08-16,1,103665,False
5,125497045,2017-08-16,1,105574,False
6,125497046,2017-08-16,1,105575,False
7,125497047,2017-08-16,1,105576,False
8,125497048,2017-08-16,1,105577,False
9,125497049,2017-08-16,1,105693,False


### train

In [15]:
load_csv_to_sqlite(conn, path, "train.csv")


📂 Loading first 50 rows from 'train.csv' into SQLite...
✅ Loaded preview of 'train.csv' into table 'train'
🔢 Table 'train' currently has 50 preview rows.
🔑 Candidate keys or join fields in 'train': ['id', 'store_nbr', 'item_nbr', 'date']


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,
3,3,2013-01-01,25,108079,1.0,
4,4,2013-01-01,25,108701,1.0,
5,5,2013-01-01,25,108786,3.0,
6,6,2013-01-01,25,108797,1.0,
7,7,2013-01-01,25,108952,1.0,
8,8,2013-01-01,25,111397,13.0,
9,9,2013-01-01,25,114790,3.0,


In [16]:
# If you already had an old connection open
try:
    conn.close()
    print("🔒 Closed previous database connection.")
except:
    print("ℹ️ No previous connection found or already closed.")

🔒 Closed previous database connection.


### Data samples

### df_train - train.csv is a big in this dataset - like 125 million rows, so I need to download only a small sample

From SQL to DataFrame

In [17]:
# Check table names
conn = sqlite3.connect("favorita.db")
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)

print("✅ New connection established.")
print(tables)

✅ New connection established.
                name
0  sample_submission
1                oil
2              items
3    holidays_events
4             stores
5       transactions
6               test
7              train


In [18]:
database_connection_string = "sqlite:///favorita.db"  # Assuming the database file is named 'favorita.db'

# Create the engine
engine = sa.create_engine(database_connection_string)

In [19]:
# Reconnect to your database
conn = sqlite3.connect("favorita.db")

# Load CSVs with optional row limit (default = 200,000)
def load_csv_to_sqlite_limited(conn, path, filename, limit=200_000):
    file_path = os.path.join(path, filename)
    # Read only up to the given limit
    df = pd.read_csv(file_path, nrows=limit)
    table_name = filename.replace('.csv', '')
    df.to_sql(table_name, conn, if_exists="replace", index=False)
    print(f"✅ Loaded '{filename}' with {len(df):,} rows (max {limit:,}).")

# Load each table (limit to 200,000 rows)
load_csv_to_sqlite_limited(conn, path, "train.csv")
load_csv_to_sqlite_limited(conn, path, "transactions.csv")
load_csv_to_sqlite_limited(conn, path, "oil.csv")
load_csv_to_sqlite_limited(conn, path, "items.csv")
load_csv_to_sqlite_limited(conn, path, "stores.csv")
load_csv_to_sqlite_limited(conn, path, "holidays_events.csv")

✅ Loaded 'train.csv' with 200,000 rows (max 200,000).
✅ Loaded 'transactions.csv' with 83,488 rows (max 200,000).
✅ Loaded 'oil.csv' with 1,218 rows (max 200,000).
✅ Loaded 'items.csv' with 4,100 rows (max 200,000).
✅ Loaded 'stores.csv' with 54 rows (max 200,000).
✅ Loaded 'holidays_events.csv' with 350 rows (max 200,000).


In [20]:
# pd.read_sql("SELECT COUNT(*) AS total, COUNT(onpromotion) AS non_null FROM train;", conn)

In [21]:
query_train = """
WITH top_families AS (
    SELECT items.family
    FROM train
    JOIN items ON train.item_nbr = items.item_nbr
    WHERE train.unit_sales >= 0
    GROUP BY items.family
    ORDER BY SUM(train.unit_sales) DESC
    LIMIT 10
)

SELECT
    train.unit_sales,
    train.date AS train_date,
    train.store_nbr,
    train.item_nbr,
    train.onpromotion,
    oil.dcoilwtico,
    holidays_events.type AS holiday_type,
    holidays_events.locale,
    holidays_events.transferred,
    stores.city,
    stores.type AS store_type,
    stores.cluster AS store_cluster,
    items.family AS items_family,
    items.class AS item_class,
    items.perishable AS perishable_items,
    transactions.transactions
FROM train
LEFT JOIN oil
  ON DATE(train.date) = DATE(oil.date)
LEFT JOIN holidays_events
  ON DATE(train.date) = DATE(holidays_events.date)
INNER JOIN stores
  ON train.store_nbr = stores.store_nbr
INNER JOIN items
  ON train.item_nbr = items.item_nbr
LEFT JOIN transactions
  ON DATE(train.date) = DATE(transactions.date)
     AND train.store_nbr = transactions.store_nbr
WHERE train.unit_sales >= 0
LIMIT 200000;
"""

In [22]:
df_train_download = pd.read_sql_query(query_train, engine)

In [23]:
df_train_download.sample(5)

Unnamed: 0,unit_sales,train_date,store_nbr,item_nbr,onpromotion,dcoilwtico,holiday_type,locale,transferred,city,store_type,store_cluster,items_family,item_class,perishable_items,transactions
75275,62.0,2013-01-03,46,220435,,92.97,,,,Quito,A,14,GROCERY I,1080,0,3438
196131,2.0,2013-01-06,43,262991,,,,,,Esmeraldas,E,10,GROCERY I,1034,0,1194
195089,9.0,2013-01-06,41,114800,,,,,,Machala,D,4,PERSONAL CARE,4126,0,1142
176038,2.0,2013-01-06,11,1113847,,,,,,Cayambe,B,6,DELI,2654,1,3188
5096,10.0,2013-01-02,5,255161,,93.14,,,,Santo Domingo,D,4,GROCERY I,1034,0,1903


In [24]:
df_train = df_train_download.copy()
df_train.shape

(199989, 16)

In [25]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199989 entries, 0 to 199988
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   unit_sales        199989 non-null  float64
 1   train_date        199989 non-null  object 
 2   store_nbr         199989 non-null  int64  
 3   item_nbr          199989 non-null  int64  
 4   onpromotion       0 non-null       object 
 5   dcoilwtico        121814 non-null  float64
 6   holiday_type      43111 non-null   object 
 7   locale            43111 non-null   object 
 8   transferred       43111 non-null   float64
 9   city              199989 non-null  object 
 10  store_type        199989 non-null  object 
 11  store_cluster     199989 non-null  int64  
 12  items_family      199989 non-null  object 
 13  item_class        199989 non-null  int64  
 14  perishable_items  199989 non-null  int64  
 15  transactions      199989 non-null  int64  
dtypes: float64(3), int64

-------------------------------------------------------------------------------

## Data initial cleaning and EDA

### df_train['date']

In [26]:
df_train.rename(columns={'train_date': 'date'}, inplace=True)

In [27]:
df_train['date'] = df_train['date'].str.slice(0, 10)  # trims time part
df_train['date'] = pd.to_datetime(df_train['date'], errors='coerce')  # convert to datetime
df_train.set_index('date', inplace=True)
df_train.sort_index(inplace=True)

In [28]:
# Remove null dates
df_train = df_train[df_train.index.notna()]

# Optional: keep index as datetime but normalize
df_train.index = pd.to_datetime(df_train.index).normalize()

# Sort index
df_train.sort_index(inplace=True)

df_train.index = df_train.index.date

### df_train['unit_sales']

I leave df_train['unit_sales'] as smallest as possible float, because we speak of 0.5 kg of cheese, etc. Not all items are in int numbers

In [29]:
df_train['unit_sales'] = pd.to_numeric(df_train['unit_sales'], downcast='float')

In [30]:
df_train['unit_sales'].nunique()

8000

In [31]:
df_train['unit_sales'].value_counts()

unit_sales
1.000     28834
2.000     23720
3.000     19366
4.000     15658
5.000     12922
          ...  
15.207        1
10.324        1
5.728         1
4.053         1
20.465        1
Name: count, Length: 8000, dtype: int64

### df_train['store_nbr'] - should be as object, as it's store_id rather than math-info

In [32]:
df_train['store_nbr'] = df_train['store_nbr'].astype('object')

In [33]:
df_train['store_nbr'].nunique()

46

In [34]:
df_train['store_nbr'].value_counts() # the count number means - how many times this exact store is appeared in my data-sample

store_nbr
44    6227
45    6142
3     6001
8     5849
6     5637
46    5567
2     5433
37    5257
4     5248
7     5174
5     5137
24    5102
38    5021
9     4991
47    4864
11    4686
39    4684
1     4594
23    4592
27    4581
50    4534
48    4522
49    4517
25    4481
18    4397
41    4372
51    4260
34    4104
28    4079
31    4022
26    3837
17    3687
16    3571
15    3548
12    3526
33    3507
14    3412
40    3341
10    3307
13    3250
19    3175
43    3116
30    2874
35    2777
32    2589
54    2397
Name: count, dtype: int64

### df_train['item_nbr'] - absolutely the same as df_train['store_nbr'], it should be as object

In [35]:
df_train['item_nbr'] = df_train['item_nbr'].astype('object')
df_train['item_nbr'].nunique()

1601

In [36]:
df_train['item_nbr'].value_counts() 

item_nbr
564533     225
759893     225
222879     225
559870     225
314384     225
          ... 
1084365      2
946277       1
586423       1
464112       1
96995        1
Name: count, Length: 1601, dtype: int64

### df_train['onpromotion'] - should as type boolean

In [37]:
df_train['onpromotion'] = df_train['onpromotion'].astype('bool')
df_train['onpromotion'].value_counts()

onpromotion
False    199989
Name: count, dtype: int64

### df_train['dcoilwtico'] - for clarity, I'll rename it into "oil_price"

In [38]:
df_train.rename(columns={'dcoilwtico': 'oil_price'}, inplace=True)

In [39]:
# Round to two decimal places
df_train['oil_price'] = df_train['oil_price'].round(2)

# Downcast to the smallest float type (float16 if possible)
df_train['oil_price'] = pd.to_numeric(df_train['oil_price'], downcast='float')

### df_train['holiday_type']

In [40]:
df_train['holiday_type'].nunique()

2

In [41]:
df_train['holiday_type'] = df_train['holiday_type'].astype('category')
df_train['holiday_type'].value_counts()

holiday_type
Work Day    42533
Holiday       578
Name: count, dtype: int64

### df_train['locale']

In [42]:
df_train['locale'].nunique()

1

In [43]:
df_train['locale'] = df_train['locale'].astype('category')
df_train['locale'].value_counts()

locale
National    43111
Name: count, dtype: int64

### df_train['transferred']

In [44]:
df_train['transferred'].nunique()

1

In [45]:
df_train['transferred'] = df_train['transferred'].astype('bool')
df_train['transferred'].value_counts()

transferred
True     156878
False     43111
Name: count, dtype: int64

### df_train['city']

In [46]:
df_train['city'].nunique()

19

In [47]:
df_train['city'] = df_train['city'].astype('category')
df_train['city'].value_counts()

city
Quito            86157
Guayaquil        26845
Cuenca            9941
Ambato            9126
Santo Domingo     8708
Machala           7713
Latacunga         6776
Loja              5021
Cayambe           4686
Daule             4581
Salinas           4481
Babahoyo          4022
Ibarra            3548
Quevedo           3507
Riobamba          3412
Guaranda          3175
Esmeraldas        3116
Playas            2777
El Carmen         2397
Name: count, dtype: int64

### df_train['store_type']

In [48]:
df_train['store_type'].nunique()

5

In [49]:
df_train['store_type'] = df_train['store_type'].astype('category')
df_train['store_type'].value_counts()

store_type
D    80316
C    44961
A    40633
B    26884
E     7195
Name: count, dtype: int64

df_train['store_type'] column:

Encodes different store formats or sizes (A, B, C, D, E, etc.)
The exact meaning isn’t public, but likely represents market segments
(e.g., A = large urban store, D = small local market, etc.)

### df_train['store_cluster']

In [50]:
df_train['store_cluster'].nunique()

17

In [51]:
df_train['store_cluster'] = df_train['store_cluster'].astype('category')
df_train['store_cluster'].value_counts()

store_cluster
3     21056
14    19487
6     18465
8     17024
15    16806
13    15664
10    15054
4     14530
1     14164
11    10659
9      9840
5      6227
2      5257
16     4397
17     4260
12     3687
7      3412
Name: count, dtype: int64

### df_train['items_family']

In [52]:
df_train['items_family'].nunique()

21

In [53]:
df_train['items_family'] = df_train['items_family'].astype('category')
df_train['items_family'].value_counts()

items_family
GROCERY I           85978
CLEANING            38662
BEVERAGES           18298
DAIRY               10988
DELI                 9459
BREAD/BAKERY         8842
PERSONAL CARE        6400
MEATS                6021
EGGS                 3775
FROZEN FOODS         2845
LIQUOR,WINE,BEER     2006
PREPARED FOODS       1721
POULTRY              1507
LINGERIE              825
GROCERY II            712
AUTOMOTIVE            671
SEAFOOD               533
BEAUTY                356
HARDWARE              181
LAWN AND GARDEN       171
HOME APPLIANCES        38
Name: count, dtype: int64

### df_train['item_class']

In [54]:
df_train['item_class'].nunique()

176

In [55]:
df_train['item_class'] = df_train['item_class'].astype('object')
df_train['item_class'].value_counts()

item_class
1072    6971
1040    6395
1016    6010
1122    5841
3020    5416
        ... 
1338      24
1380      18
1033      14
2242      12
1093       1
Name: count, Length: 176, dtype: int64

### df_train['perishable_items']

In [56]:
df_train['perishable_items'].nunique()

2

In [57]:
df_train['perishable_items'] = df_train['perishable_items'].astype('bool')
df_train['perishable_items'].value_counts()

perishable_items
False    157143
True      42846
Name: count, dtype: int64

### df_train['transactions']

In [58]:
df_train['transactions'] = pd.to_numeric(df_train['transactions'], downcast='integer')

In [59]:
df_train.sample(5)

Unnamed: 0,unit_sales,store_nbr,item_nbr,onpromotion,oil_price,holiday_type,locale,transferred,city,store_type,store_cluster,items_family,item_class,perishable_items,transactions
2013-01-06,35.0,31,257847,False,,,,True,Babahoyo,B,10,BEVERAGES,1120,False,1468
2013-01-03,1.0,27,743497,False,92.970001,,,True,Daule,D,1,FROZEN FOODS,2222,False,928
2013-01-05,3.0,25,368260,False,,Work Day,National,False,Salinas,D,1,GROCERY I,1028,False,1355
2013-01-02,25.0,45,426155,False,93.139999,,,True,Quito,A,11,PREPARED FOODS,2962,True,4208
2013-01-06,5.0,40,368140,False,,,,True,Machala,C,3,GROCERY I,1040,False,1023


In [60]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199989 entries, 2013-01-01 to 2013-01-06
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   unit_sales        199989 non-null  float32 
 1   store_nbr         199989 non-null  object  
 2   item_nbr          199989 non-null  object  
 3   onpromotion       199989 non-null  bool    
 4   oil_price         121814 non-null  float32 
 5   holiday_type      43111 non-null   category
 6   locale            43111 non-null   category
 7   transferred       199989 non-null  bool    
 8   city              199989 non-null  category
 9   store_type        199989 non-null  category
 10  store_cluster     199989 non-null  category
 11  items_family      199989 non-null  category
 12  item_class        199989 non-null  object  
 13  perishable_items  199989 non-null  bool    
 14  transactions      199989 non-null  int16   
dtypes: bool(3), category(6), float32(2), int16(

In [61]:
gc.collect()
df_train.shape

(199989, 15)