# The Great Indian Data Scientist Hiring Challenge

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## Loading 'Train.csv' file

In [2]:
df = pd.read_csv('Train.csv')
df.head()

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,15001,VENDOR-1676,GL-6100410,83.24,Artworking/Typesetting Production Jun 2009 Cha...,CLASS-1963
1,15002,VENDOR-1883,GL-2182000,51.18,Auto Leasing Corporate Services Corning Inc /N...,CLASS-1250
2,15004,VENDOR-1999,GL-6050100,79.02,Store Management Lease/Rent Deltona Corp Real ...,CLASS-1274
3,15005,VENDOR-1771,GL-6101400,48.5,Store Construction General Requirements Coloni...,CLASS-1522
4,15006,VENDOR-1331,GL-2182000,63.35,Jul 2015 Aydin Corp Contingent Labor/Temp Labo...,CLASS-1376


## Gathering Information and Cleaning Data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5566 entries, 0 to 5565
Data columns (total 6 columns):
Inv_Id              5566 non-null int64
Vendor_Code         5566 non-null object
GL_Code             5566 non-null object
Inv_Amt             5566 non-null float64
Item_Description    5566 non-null object
Product_Category    5566 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 261.0+ KB


In [4]:
df.describe()

Unnamed: 0,Inv_Id,Inv_Amt
count,5566.0,5566.0
mean,19010.7871,49.980151
std,2306.607485,28.90303
min,15001.0,0.01
25%,17006.25,24.9575
50%,19017.5,49.645
75%,20994.75,75.17
max,23012.0,99.99


In [5]:
df.shape

(5566, 6)

In [6]:
df.isnull().sum()

Inv_Id              0
Vendor_Code         0
GL_Code             0
Inv_Amt             0
Item_Description    0
Product_Category    0
dtype: int64

In [7]:
sum(df.duplicated())

0

In [8]:
for i in range(10):
    print(df['Item_Description'].iloc[i])

Artworking/Typesetting Production Jun 2009 Champion Parts Inc SMAP Prototype and Comp Production/Packaging Design
Auto Leasing Corporate Services Corning Inc /Ny 2013-Mar  Auto Leasing and Maintenance Other Corporate Services
Store Management Lease/Rent Deltona Corp Real Estate Base Rent Jul2018
Store Construction General Requirements Colonial Trust Iii General Contractor Final Site Clean Up 2005-Dec 
Jul 2015 Aydin Corp Contingent Labor/Temp Labor Contingent Labor/Temp Labor Corporate Services Human Resources
Final Site Clean Up 2018Mar Store Construction Dravo Corp General Contractor General Requirements
Travel and Entertainment Miscellaneous Company Car (Field Only) Texas New Mexico Power Co Ground Transportation Miscellaneous Company Car (Field Only) 2011-Mar 
General Contractor General Requirements Final Site Clean Up American Pad & Paper Co Apr2014 Store Construction
Aquila Distributors Inc                                 /Bd Prototype and Comp Production/Packaging Design Jul 200

## Cleaning Item_Description for Analysis and Prediction

In [9]:
import re
import nltk
from nltk.corpus import stopwords

In [10]:
import string
stop_words = set(stopwords.words("english"))
def remove_punctuation(text):
    text=text.lower()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    text=[i for i in text if i not in string.punctuation]
    no_punc=''.join(text)
    return no_punc

In [11]:
df['Item_Description']=df['Item_Description'].apply(remove_punctuation)

In [12]:
for i in range(10):
    print ((df['Item_Description'].iloc[i]))

artworkingtypesetting production jun 2009 champion parts inc smap prototype comp productionpackaging design
auto leasing corporate services corning inc ny 2013mar auto leasing maintenance corporate services
store management leaserent deltona corp real estate base rent jul2018
store construction general requirements colonial trust iii general contractor final site clean 2005dec
jul 2015 aydin corp contingent labortemp labor contingent labortemp labor corporate services human resources
final site clean 2018mar store construction dravo corp general contractor general requirements
travel entertainment miscellaneous company car field only texas new mexico power co ground transportation miscellaneous company car field only 2011mar
general contractor general requirements final site clean american pad  paper co apr2014 store construction
aquila distributors inc bd prototype comp productionpackaging design jul 2007 artworkingtypesetting production smap
base rent store management chicago rivet  

### 'Item_Description' - Feature Data  and  'Product_Category' - Label Data

# Machine Learning

In [13]:
from sklearn.model_selection import train_test_split

X = df.Item_Description
y = df.Product_Category
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42) 

## Three Machine Learning models are used for training and testing on the dataset. 
#### Logistic Regression, Random Forest and Decision Tree

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer,TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier




### Cross-Validation using Pipeline of CountVectorizer and TfidfTransformer

## Logistic Regression

In [16]:
lr =Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(multi_class='auto', solver='lbfgs')),
               ])




In [17]:
lr.fit(X_train, y_train)
lr.score(X_test, y_test)



0.9910179640718563

## Random Forest

In [18]:
rf =Pipeline([('vect', CountVectorizer()),
             ('tfidf', TfidfTransformer()),
             ('clf', RandomForestClassifier(n_estimators=100)),
             ])


In [19]:
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9940119760479041

## Decision Tree 

In [20]:
dt =Pipeline([('vect', CountVectorizer()),
             ('tfidf', TfidfTransformer()),
             ('clf', DecisionTreeClassifier())
             ])


In [21]:
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

0.9958083832335329

## Finding Accuracy for Prediction

In [22]:
from sklearn import metrics

### Linear Regression Accuracy

In [23]:
y_pred = lr.predict(X_test)
print('accuracy %s' % metrics.accuracy_score(y_pred, y_test))

accuracy 0.9910179640718563


### Random Forest Accuracy

In [24]:
y_pred = rf.predict(X_test)
print('accuracy %s' % metrics.accuracy_score(y_pred, y_test))

accuracy 0.9940119760479041


### Decision Tree Accuracy

In [26]:
y_pred = dt.predict(X_test)
print('accuracy %s' % metrics.accuracy_score(y_pred, y_test))

accuracy 0.9958083832335329


## After Cross-Validation Score, Accuracy of Decision Tree is Highest so we use Decision Tree for the Prediction of Product_Category

## Loading Test.csv and Cleaning

In [27]:
dftest = pd.read_csv('Test.csv')
dftest['Item_Description']=dftest['Item_Description'].apply(remove_punctuation)

In [28]:
dftest['Product_Category']=pd.Series(dt.predict(dftest['Item_Description']))


In [29]:
dftest.head(20)

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,15003,VENDOR-2513,GL-6050310,56.13,travel entertainment miscellaneous company car...,CLASS-1758
1,15008,VENDOR-1044,GL-6101400,96.56,final site clean store construction advanced m...,CLASS-1522
2,15013,VENDOR-1254,GL-6101400,55.93,arabian american development co final site cle...,CLASS-1522
3,15019,VENDOR-1331,GL-2182000,32.62,corporate services contingent labortemp labor ...,CLASS-1376
4,15020,VENDOR-2513,GL-6050310,25.81,fortune national corp miscellaneous company ca...,CLASS-1758
5,15022,VENDOR-2513,GL-6050310,22.71,dec2007 fortune national corp miscellaneous co...,CLASS-1758
6,15024,VENDOR-1883,GL-2182000,47.38,auto leasing corporate services corning inc ny...,CLASS-1250
7,15026,VENDOR-2543,GL-6020600,26.08,taxes taxes mar 2014 frischs restaurants inc n...,CLASS-2146
8,15027,VENDOR-1944,GL-2182000,42.76,daly john j auto fleet repair maintenance jun ...,CLASS-1249
9,15028,VENDOR-2032,GL-6100500,70.47,smap media buy traditional cgg holding us inc...,CLASS-1850


In [30]:
df_final = dftest[['Inv_Id', 'Product_Category']]
df_final.head()

Unnamed: 0,Inv_Id,Product_Category
0,15003,CLASS-1758
1,15008,CLASS-1522
2,15013,CLASS-1522
3,15019,CLASS-1376
4,15020,CLASS-1758


## Storing the Final Prediction in 'Invoice_Product_Prediction.csv' 

In [31]:
df_final.to_csv('Invoice_Product_Prediction.csv', index = False)