#### Importing required libraries

In [1]:
import pandas as pd
import numpy as np

#### Reading data from csv

In [2]:
df = pd.read_csv('UNSPSCdataset.csv',encoding='mac_roman',low_memory=False)

In [3]:
output = 'UNSPSC_Final'
y = df[output]
features = ['MaterialDescription']
X = df[features]
X.head()

Unnamed: 0,MaterialDescription
0,AIRTEL BILLS 22aug TO 23 AUG 2012
1,AIRTEL MOBILE BILLS 23nov O 22 dec 12
2,aluminum fabrication work
3,aluminum fabrication work
4,civil & plumbing work @ BMT


#### Downloading stopwords from nltk

In [None]:
import nltk
nltk.download()  # Download text data sets, including stop words

In [6]:
import re

from nltk.corpus import stopwords # Import the stop word list

def description_to_words(review_text):
    
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))


In [12]:
clean_review = description_to_words(df['MaterialDescription'][3] )
print(clean_review)

aluminum fabrication work


In [42]:
# Get the number of reviews based on the dataframe column size
num_description = df['MaterialDescription'].size
print("Cleaning and parsing the training set UNSPSC description...\n")
clean_description = []
for i in range(0, num_description):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print("Description %d of %d\n" % ( i+1, num_description))                                                                  
    clean_description.append( description_to_words(df['MaterialDescription'][i] ))

Cleaning and parsing the training set UNSPSC description...

Description 1000 of 45001

Description 2000 of 45001

Description 3000 of 45001

Description 4000 of 45001

Description 5000 of 45001

Description 6000 of 45001

Description 7000 of 45001

Description 8000 of 45001

Description 9000 of 45001

Description 10000 of 45001

Description 11000 of 45001

Description 12000 of 45001

Description 13000 of 45001

Description 14000 of 45001

Description 15000 of 45001

Description 16000 of 45001

Description 17000 of 45001

Description 18000 of 45001

Description 19000 of 45001

Description 20000 of 45001

Description 21000 of 45001

Description 22000 of 45001

Description 23000 of 45001

Description 24000 of 45001

Description 25000 of 45001

Description 26000 of 45001

Description 27000 of 45001

Description 28000 of 45001

Description 29000 of 45001

Description 30000 of 45001

Description 31000 of 45001

Description 32000 of 45001

Description 33000 of 45001

Description 34000 of 450

#### Printing the list containing the useful words extracted from the training dataset

In [24]:
print(clean_description[:50])

['airtel bills aug aug', 'airtel mobile bills nov dec', 'aluminum fabrication work', 'aluminum fabrication work', 'civil plumbing work bmt', 'electrical wk elr lab mhb', 'glass door fr gastro main door', 'hosp tack fixing st flr medicine', 'magazine week magic pot', 'magazine week magic pot', 'magazine week magic pot dec', 'magazine week magic pot dec', 'marketing referral fee', 'marketing referral fee', 'nuclear medicine ac repair', 'providing fixing corner guards', 'providing fixing corner guards', 'providing laying cable forcctv camera', 'service charges maint bio medical', 'service chrg fr house keeping staff', 'service chrg fr house keeping staff', 'service chrg fr house keeping staff', 'service chrg fr house keeping staff', 'service chrg fr house keeping staff', 'service chrg fr house keeping staff', 'service chrg fr house keeping staff', 'service chrg fr house keeping staff', 'service chrg fr house keeping staff', 'service chrg fr month may feb', 'service chrg fr month may feb',

#### Creating bag of words from the useful word extraction 

In [26]:
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_description)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [27]:
print(train_data_features.shape)

(45001, 5000)


In [41]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print(vocab)

['aa', 'aaa', 'aag', 'aage', 'aagsk', 'ab', 'aba', 'abbott', 'abc', 'abdominal', 'abgel', 'abl', 'ablation', 'abraxane', 'absorbent', 'absorbing', 'abutment', 'abzorb', 'ac', 'academic', 'acamprol', 'acc', 'accent', 'accentra', 'accents', 'accept', 'access', 'accessories', 'accessory', 'accident', 'acclaim', 'acco', 'accohide', 'accommodates', 'account', 'accounting', 'accs', 'acct', 'accu', 'accucheck', 'accura', 'accuzon', 'ace', 'aceclo', 'acetone', 'acid', 'acidic', 'acidose', 'aciloc', 'acivir', 'acm', 'acme', 'acne', 'acnt', 'acp', 'acra', 'acrylic', 'act', 'actemra', 'actigut', 'actinocin', 'action', 'active', 'activity', 'acto', 'acton', 'actuator', 'acular', 'acuvert', 'ad', 'adalene', 'adams', 'adapter', 'adapters', 'adaptor', 'add', 'adding', 'additional', 'additions', 'addr', 'address', 'addressing', 'adesam', 'adf', 'adferol', 'adh', 'adhesive', 'adj', 'adjustable', 'adler', 'admenta', 'admin', 'adminission', 'admission', 'admont', 'adrenaline', 'adrovit', 'adult', 'adv', 

#### Checking the histogram of words

In [36]:
each = np.sum(train_data_features, axis=0)
print(each)
'''
for tag,count in zip(vocab, each):
    print(tag, count)
'''

[30 13 29 ...,  4 13  6]


'\nfor tag,count in zip(vocab, each):\n    print(tag, count)\n'

#### Random Forest for training the model

In [43]:
import time

print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
start = time.time()
# This may take a few minutes to run
forest = forest.fit( train_data_features, df['UNSPSC_Final'] )
end = time.time() 

total = end - start

print("Total time taken to train the model is ", total)

Training the random forest...
Total time taken to train the model is  1140.4547901153564


In [1]:
test = pd.read_csv('UNSPSCtestDataSet.csv',encoding='mac_roman',low_memory=False)
# Verify that there are 25,000 rows and 2 columns
print(test.shape)
# Create an empty list and append the clean reviews one by one
num_desc = len(test["MaterialDescription"])
clean_test_desc = [] 

print("Cleaning and parsing the test set movie reviews...\n")
for i in range(0,num_desc):
    if( (i+1) % 1000 == 0 ):
        print("desc %d of %d\n" % (i+1, num_desc))
    clean_desc = description_to_words(test["MaterialDescription"][i])
    clean_test_desc.append(clean_desc)

NameError: name 'pd' is not defined

In [None]:

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_desc)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

In [None]:
# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"unspscRFcode":result} )

# Use pandas to write the comma-separated output file
output.to_csv( ".csv", index=False)