In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

## Importation of the datasets

In [2]:
df = pd.read_csv('train-data.csv', sep=';')
df.drop_duplicates(inplace=True)

GSCPI_df = pd.read_csv('GSCPI_data.csv')
GSCPI_df[['Year','Month']] = GSCPI_df['Year-Month'].str.split('-',expand=True)
GSCPI_df['Month'] = GSCPI_df['Month'].astype(int)
GSCPI_df['Year'] = GSCPI_df['Year'].astype(int)
GSCPI_df.drop(columns=['Year-Month'], inplace=True)

LPI_df = pd.read_csv('LPIextend.csv')

worldbank_economic_data_df = pd.read_csv('worldbank_economic_data.csv')

worldbank_inflation_data_df = pd.read_csv('worldbank_inflation_data.csv')
worldbank_inflation_data_df[['Year','Month']] = worldbank_inflation_data_df['Year-Month'].str.split('-',expand=True)
worldbank_inflation_data_df['Year'] = worldbank_inflation_data_df['Year'].astype(int)
worldbank_inflation_data_df['Month'] = worldbank_inflation_data_df['Month'].astype(int) 
worldbank_inflation_data_df.drop(columns=['Year-Month'], inplace=True)

## Cleaning 

In [3]:
df['Reference proxy'] = df['Reference proxy'].apply(lambda x: x.replace('reference-', ''))
df['Product  Line proxy'] = df['Product  Line proxy'].apply(lambda x: x.replace('Product Line-', ''))
df['Division proxy'] = df['Division proxy'].apply(lambda x: x.replace('Division-', ''))
df['Customer Persona proxy'] = df['Customer Persona proxy'].apply(lambda x: x.replace('Customer Segmentation-', ''))
df['Strategic Product Family proxy'] = df['Strategic Product Family proxy'].apply(lambda x: x.replace('Strategic Product Family-', ''))

country_mapping = {'DE': 'Germany', 'CN': 'China', 'GB': 'United Kingdom', 'AU': 'Australia', 'ES': 'Spain', 'NL': 'Netherlands', 'US': 'United States', 'DK': 'Denmark', 'BE': 'Belgium', 'FR': 'France', 'IT': 'Italy', 'PL': 'Poland', 'SE': 'Sweden', 'TW': 'Taiwan', 'JP': 'Japan', 'HK': 'Hong Kong', 'KR': 'South Korea', 'PH': 'Philippines', 'MY': 'Malaysia', 'SG': 'Singapore', 'TH': 'Thailand', 'Id': 'Indonesia', 'FJ': 'Fiji', 'VN': 'Vietnam', 'BN': 'Brunei', 'NZ': 'New Zealand', 'MM': 'Myanmar', 'KH': 'Cambodia', 'MN': 'Mongolia', 'LA': 'Laos', 'PG': 'Papua New Guinea'}
df['Country'] = df['Country'].map(country_mapping)

Deploying the date

In [4]:
df[['Quarter', 'Year']] = df.Date.str.split(expand=True)
df['Quarter'] = df['Quarter'].map({'jan-apr': 1, 'may-jul': 2, 'may-aug': 2, 'sep-dec': 3})
df.drop('Date', axis=1, inplace=True)

df_melted = pd.melt(df[['Month 1', 'Month 2', 'Month 3', 'Month 4']], var_name='Month', value_name='Sales', ignore_index=False)
df_melted['Month'] = df_melted['Month'].str.replace('Month ', '').astype(int)
df = df.merge(df_melted, left_index=True, right_index=True)
df['Year'] = df['Year'].astype(int)
df['Month'] = df['Month'] + 4*(df['Quarter']-1)
df['Month_cumulated'] = (df['Month']-8) + (df['Year']-2020)*12
df['Month_cumulated'] = df['Month_cumulated'].map({33: 36, 34: 33, 35: 34, 36: 35}).fillna(df['Month_cumulated']).astype(int)
df.drop(['Month 1', 'Month 2', 'Month 3', 'Month 4', 'Quarter'], axis=1, inplace=True)
df = df[df['Month_cumulated']!=36]
df['Sales'] = df['Sales'].apply(lambda x: int(str(x).replace(' ', '')))
df.reset_index(drop=True, inplace=True)



## Merge of all the datasets

In [5]:
df = df.merge(LPI_df, on='Country')
df = df.merge(worldbank_economic_data_df, how='left', on=['Country', 'Year'])
df = df.merge(worldbank_inflation_data_df, how='left', on=['Country', 'Year', 'Month'])
df = df.merge(GSCPI_df, how='left', on=['Year','Month'])


In [6]:
df = df.reindex(columns=['index', 'Year','Month','Month_cumulated','id_product', 'Region', 'Country','Site',
'Operations','Zone','Cluster','Reference proxy','Product  Line proxy','Division proxy',
'Customer Persona proxy','Strategic Product Family proxy','Product Life cycel status','Sales',
'Unnamed: 0','ID','population (2023)','area','landAreaKm','unMember','netChange','growthRate',
'worldPercentage','density','densityMi','rank','LPI Grouped Rank','Customs Score','Customs Grouped Rank',
'Infrastructure Score','Infrastructure Grouped Rank','International Shipments Score',
'International Shipments Grouped Rank','Logistics Competence and Quality Score',
'Logistics Competence and Quality Grouped Rank','Timeliness Score','Timeliness Grouped Rank',
'Tracking and Tracing Score','Tracking and Tracing Grouped Rank','Agriculture, forestry, and fishing, value added (annual % growth)',
'Exports of goods and services (annual % growth)','Final consumption expenditure (annual % growth)',
'GDP (current US$)','Gross capital formation (annual % growth)',
'Imports of goods and services (annual % growth)','Industry (including construction), value added (annual % growth)',
'Manufacturing, value added (annual % growth)','Services, value added (annual % growth)','Energy Price Index','Headline Consumer Price Index',
'GSCPI','Sales'])
df.sort_values(by=['Year', 'Month','Month_cumulated','Region', 'Country','Site','Operations','Zone'])

Unnamed: 0,index,Year,Month,Month_cumulated,id_product,Region,Country,Site,Operations,Zone,...,GDP (current US$),Gross capital formation (annual % growth),Imports of goods and services (annual % growth),"Industry (including construction), value added (annual % growth)","Manufacturing, value added (annual % growth)","Services, value added (annual % growth)",Energy Price Index,Headline Consumer Price Index,GSCPI,Sales
515126,242213,2020,9,1,242213,CHINA,China,AU_DC_ALD,China Operations,China & HK,...,1.468774e+13,4.31,,2.46,,1.95,99.21,128.368035,0.61569,0
497004,238102,2020,9,1,238102,CHINA,China,CN_DC_Beijing,China Operations,China & HK,...,1.468774e+13,4.31,,2.46,,1.95,99.21,128.368035,0.61569,0
498721,237046,2020,9,1,237046,CHINA,China,CN_DC_Beijing,China Operations,China & HK,...,1.468774e+13,4.31,,2.46,,1.95,99.21,128.368035,0.61569,0
498890,223034,2020,9,1,223034,CHINA,China,CN_DC_Beijing,China Operations,China & HK,...,1.468774e+13,4.31,,2.46,,1.95,99.21,128.368035,0.61569,0
500293,230168,2020,9,1,230168,CHINA,China,CN_DC_Beijing,China Operations,China & HK,...,1.468774e+13,4.31,,2.46,,1.95,99.21,128.368035,0.61569,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4428952,2034708,2023,8,35,75852,NAM,United States,US_MF_Missouri,North America Operations,US,...,,,,,,,,,,0
4431984,2033353,2023,8,35,74497,NAM,United States,US_MF_Missouri,North America Operations,US,...,,,,,,,,,,0
4434731,2034604,2023,8,35,75748,NAM,United States,US_MF_Missouri,North America Operations,US,...,,,,,,,,,,0
4435528,2034793,2023,8,35,75937,NAM,United States,US_MF_Missouri,North America Operations,US,...,,,,,,,,,,0


In [7]:
df.head()

Unnamed: 0,index,Year,Month,Month_cumulated,id_product,Region,Country,Site,Operations,Zone,...,GDP (current US$),Gross capital formation (annual % growth),Imports of goods and services (annual % growth),"Industry (including construction), value added (annual % growth)","Manufacturing, value added (annual % growth)","Services, value added (annual % growth)",Energy Price Index,Headline Consumer Price Index,GSCPI,Sales
0,645874,2021,5,9,156160,EUROPE,Germany,NL_DC_Venray,Europe Operations,DACH,...,4259935000000.0,3.5,9.0,3.61,5.07,2.27,107.106,108.17,2.981772,0
1,645874,2021,6,10,156160,EUROPE,Germany,NL_DC_Venray,Europe Operations,DACH,...,4259935000000.0,3.5,9.0,3.61,5.07,2.27,108.058,108.486,2.694348,0
2,645874,2021,7,11,156160,EUROPE,Germany,NL_DC_Venray,Europe Operations,DACH,...,4259935000000.0,3.5,9.0,3.61,5.07,2.27,109.501,109.013,2.922301,0
3,645874,2021,8,12,156160,EUROPE,Germany,NL_DC_Venray,Europe Operations,DACH,...,4259935000000.0,3.5,9.0,3.61,5.07,2.27,110.149,109.118,3.232759,0
4,1119813,2022,1,17,140385,EUROPE,Germany,DE_FO_BNDch,Europe Operations,DACH,...,4072192000000.0,2.3,5.99,-0.47,0.2,2.81,122.041,110.911,3.567542,0


In [12]:
# Sélectionner toutes les colonnes de type 'object'
string_columns = df.select_dtypes(include=['object']).columns

# Appliquer LabelEncoder à chaque colonne de type chaîne
for col in string_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6856015 entries, 0 to 6856014
Data columns (total 56 columns):
 #   Column                                                             Dtype  
---  ------                                                             -----  
 0   index                                                              int64  
 1   Year                                                               int64  
 2   Month                                                              int64  
 3   Month_cumulated                                                    int64  
 4   id_product                                                         int64  
 5   Region                                                             int64  
 6   Country                                                            int64  
 7   Site                                                               int64  
 8   Operations                                                         int64  
 9   Zo