In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [4]:
file_path = '/Users/harshitgupta/Desktop/vs /VS-Data-Den/data_cleaning/filled_model_dataset.csv'

In [5]:
df = pd.read_csv(file_path)


In [6]:
df['TOTAL_SALES'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TOTAL_SALES'].fillna(0, inplace=True)


In [7]:
df['YEARWEEK'] = pd.to_datetime(df['YEARWEEK'])
df['week_of_year'] = df['YEARWEEK'].dt.isocalendar().week

In [8]:
df['YEARWEEK'] = (df['YEARWEEK'] - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')



In [9]:
df.head()

Unnamed: 0,PTNR_REGIONAL_MASTER_CD,YEARWEEK,TOTAL_SALES,flag,week_of_year
0,0.0,1575763200,28994.45,active,49
1,0.0,1576368000,127699.87,active,50
2,0.0,1576972800,102609.0,active,51
3,0.0,1577577600,0.0,inactive,52
4,0.0,1578182400,0.0,inactive,1


In [10]:
# features = df[['PTNR_REGIONAL_MASTER_CD', 'YEARWEEK', 'week_of_year', 'flag']]
# target = df['TOTAL_SALES']

In [11]:
import matplotlib.pyplot as plt

In [12]:
df['flag'] = df['flag'].map({'inactive': 0, 'active': 1})


In [13]:
grouped = df.groupby('PTNR_REGIONAL_MASTER_CD')

In [None]:
for master_cd, group in grouped:
    features = group[['YEARWEEK', 'week_of_year', 'flag']]
    target = group['TOTAL_SALES']
    

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, shuffle=False)

    train_data = lgb.Dataset(X_train, label=y_train)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }

    lgb_model = lgb.train(params, train_data)
    
 
    y_pred = lgb_model.predict(X_test)
    

    X_test['YEARWEEK'] = pd.to_datetime(X_test['YEARWEEK'], unit='s')
    
    
    result_df = pd.DataFrame({
        'YEARWEEK': X_test['YEARWEEK'],
        'Actual': y_test,
        'Predicted': y_pred
    })
    

    plt.figure(figsize=(12, 6))
    plt.plot(result_df['YEARWEEK'], result_df['Actual'], label='Actual')
    plt.plot(result_df['YEARWEEK'], result_df['Predicted'], label='Predicted')
    plt.xlabel('YEARWEEK')
    plt.ylabel('TOTAL_SALES')
    plt.title(f'Actual vs. Predicted Sales for PTNR_REGIONAL_MASTER_CD: {master_cd}')
    plt.legend()
    plt.show()

SOME CHANGES


In [24]:
grouped = df.groupby('PTNR_REGIONAL_MASTER_CD')


In [None]:


# for master_cd, group in grouped:
#     features = group[['YEARWEEK', 'week_of_year', 'flag']]
#     target = group['TOTAL_SALES']
    
#     # Split the data into train and test sets
#     X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)
    
#     # Create the LightGBM dataset
#     train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['week_of_year', 'flag'])
    
#     # Define the parameters for the LightGBM model
#     params = {
#         'boosting_type': 'gbdt',
#         'objective': 'regression',
#         'metric': 'rmse'
#     }
    
#     # Train the LightGBM model
#     lgb_model = lgb.train(params, train_data)
    
#     # Make predictions on the test set
#     y_pred = lgb_model.predict(X_test)
    
#     # Convert 'YEARWEEK' back to datetime format
#     X_test['YEARWEEK'] = pd.to_datetime(X_test['YEARWEEK'], unit='s')
    
#     # Create a DataFrame for plotting
#     result_df = pd.DataFrame({
#         'YEARWEEK': X_test['YEARWEEK'],
#         'Actual': y_test,
#         'Predicted': y_pred
#     })
    
#     # Plot the actual vs predicted values
#     plt.figure(figsize=(12, 6))
#     plt.plot(result_df['YEARWEEK'], result_df['Actual'], label='Actual')
#     plt.plot(result_df['YEARWEEK'], result_df['Predicted'], label='Predicted')
#     plt.xlabel('YEARWEEK')
#     plt.ylabel('TOTAL_SALES')
#     plt.title(f'Actual vs. Predicted Sales for PTNR_REGIONAL_MASTER_CD: {master_cd}')
#     plt.legend()
#     plt.show()