# **Predicting the Location of the Product**

## Import Libraries & Dataset

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

import warnings
# NOTE(review): with no category argument this suppresses every warning for the rest of the
# notebook, which also hides any deprecation / FutureWarning messages the libraries above emit.
warnings.filterwarnings("ignore")

In [2]:
# Training rows: take the label column out of the frame so `train` holds features only.
train = pd.read_csv('../input/data-analyzer-tz21/train.csv', encoding='latin1')
target = train.pop('Primary Site Code')

# Test rows: set the ids aside for the submission file and keep the remaining columns.
test = pd.read_csv('../input/data-analyzer-tz21/test.csv', encoding='latin1')
submission_columns = test.pop('id')

# Stack train on top of test; the following cells transform this combined frame.
data = pd.concat([train, test], axis=0)

In [3]:
# Last five rows of the combined frame.
data.tail(n=5)

In [4]:
# Print column dtypes and non-null counts of the combined train + test frame.
data.info()

## EDA & Feature Engineering

In [5]:
# Heatmap of the boolean null mask of `data` (one row per record, one column per feature).
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(data.isna(), ax=ax)

In [6]:
# Frequency of each raw 'Source System Code' value in the combined frame.
data['Source System Code'].value_counts()

In [7]:
# Keep only the text before the first underscore of each source-system code.
# The vectorised .str accessor leaves missing values as NaN, whereas calling x.split on a
# float NaN inside .apply raises AttributeError.
data['Source System Code_0'] = data['Source System Code'].str.split('_', n=1).str[0]

In [8]:
# Frequency of each value in the derived 'Source System Code_0' column.
data['Source System Code_0'].value_counts()

In [9]:
# Histogram of the 'Source System Code_0' values, drawn on an explicit 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Source System Code_0'], color='b', ax=ax)

In [10]:
# 50 most frequent 'Minor Code' values. value_counts() already returns counts in descending
# order, so no extra sort is needed before taking the head.
data['Minor Code'].value_counts().head(50)

In [11]:
# Give missing site codes the explicit string label 'NaN' (rebinding instead of mutating in place).
target = target.replace(np.nan, 'NaN')

In [12]:
# Keep only the last '-'-separated token of 'SDW MPN Key'; per the original note, the leading
# part is related to the product number.
# .str.rsplit leaves missing values as NaN, whereas x.split inside .apply raises on a float NaN.
data['SDW MPN Key_last'] = data['SDW MPN Key'].str.rsplit('-', n=1).str[-1]
data = data.drop(columns='SDW MPN Key')

In [13]:
## Label encoding
# Target: `le` is kept so predictions can be mapped back to the original labels later.
le = LabelEncoder()
target = le.fit_transform(target)

# One encoder per categorical feature, each fitted on the combined train + test values.
le_1, le_2, le_3 = LabelEncoder(), LabelEncoder(), LabelEncoder()

data['SDW MPN Key_last_labels'] = le_1.fit_transform(data['SDW MPN Key_last'])
data['Source System Code_0'] = le_2.fit_transform(data['Source System Code_0'])
data['Minor_Code_labels'] = le_3.fit_transform(data['Minor Code'])

# The raw string columns are no longer needed once their integer codes exist.
data = data.drop(columns=['SDW MPN Key_last', 'Minor Code'])

In [14]:
# Distribution of the target variable.
# NOTE(review): assuming the label-encoding cell has run, `target` holds integer class codes
# here, so the smoothed density is over arbitrary class ids - confirm a KDE (rather than a
# count plot) is what is wanted.
fig, ax = plt.subplots()
sns.kdeplot(target, ax=ax)

In [15]:
# Histogram of the integer-encoded minor codes.
fig, ax = plt.subplots()
sns.histplot(data['Minor_Code_labels'], ax=ax)

In [16]:
# Number of missing product descriptions.
data['Product Description'].isna().sum()

In [17]:
# Getting only the digits from the Product Number column (all digit runs concatenated).
# Stripping non-digits with the .str accessor keeps missing values as NaN instead of raising.
product_digits = data['Product Number'].str.replace(r'\D+', '', regex=True)

# Entries without any digit become missing, then take the most frequent digit string.
# Everything is assigned back explicitly: calling replace/fillna with inplace=True on
# data['Numbers'] acts on a temporary under pandas Copy-on-Write and would leave '' in place,
# which makes the integer conversion below fail.
product_digits = product_digits.mask(product_digits == '')
product_digits = product_digits.fillna(product_digits.mode()[0])

data['Numbers'] = product_digits.astype('int')

In [18]:
# Drop both free-text product columns in a single call.
data = data.drop(columns=['Product Number', 'Product Description'])

In [19]:
## One-hot encode the raw source-system code (drop_first leaves out the first category),
## put the indicator columns in front of the existing ones and remove the original column.
dum = pd.get_dummies(data['Source System Code'], drop_first=True)
data = pd.concat([dum, data], axis=1).drop(columns='Source System Code')

In [20]:
# First five rows of the engineered feature frame.
data.head(n=5)

In [21]:
# Print column dtypes and non-null counts of the frame after feature engineering.
data.info()

## Splitting the Data

In [22]:
# The combined frame was built as train rows followed by test rows, so split it back by position.
n_train = len(train)
train_x = data.iloc[:n_train]
test_x = data.iloc[n_train:]

# Hold out 20% of the labelled rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    train_x, target, test_size=0.2, random_state=0
)

## Building Model and Evaluation

In [23]:
# Gradient-boosted tree classifier trained on GPU 0 (gpu_id is an integer device ordinal).
# NOTE(review): tree_method='gpu_hist' / gpu_id are deprecated from XGBoost 2.0 in favour of
# tree_method='hist', device='cuda' - switch once the installed version is confirmed.
xgb = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)

# Hold-out accuracy; scikit-learn metrics take (y_true, y_pred) in that order.
val_pred = xgb.predict(X_val)
accuracy_score(y_val, val_pred)

In [24]:
# Predict encoded classes for the test rows and map them back to the original labels.
pred = xgb.predict(test_x)
sub = pd.DataFrame({
    'id': submission_columns,
    'Primary Site Code': le.inverse_transform(pred),
})

In [25]:
# Preview the test predictions mapped back to the original target labels.
le.inverse_transform(pred)

In [26]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub.to_csv('final_preds.csv',index=False)