#### Importing required libraries

In [1]:
import pandas as pd
import numpy as np

#### Reading data from csv

In [2]:
df = pd.read_csv('UNSPSCdataset.csv',encoding='mac_roman',low_memory=False)

In [3]:
output = 'UNSPSC_Final'
y = df[output]
features = ['MaterialDescription']
X = df[features]
X.head()

Unnamed: 0,MaterialDescription
0,AIRTEL BILLS 22aug TO 23 AUG 2012
1,AIRTEL MOBILE BILLS 23nov O 22 dec 12
2,aluminum fabrication work
3,aluminum fabrication work
4,civil & plumbing work @ BMT


#### Downloading stopwords from nltk

In [4]:
import nltk
nltk.download()  # Download text data sets, including stop words

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> stopwords
    Downloading package stopwords to /home/carnd/nltk_data...
      Unzipping corpora/stopwords.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [4]:
import re

from nltk.corpus import stopwords # Import the stop word list

def description_to_words(review_text):
    
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))


In [5]:
clean_review = description_to_words(df['MaterialDescription'][3] )
print(clean_review)

aluminum fabrication work


In [6]:
# Get the number of reviews based on the dataframe column size
num_description = df['MaterialDescription'].size
print("Cleaning and parsing the training set UNSPSC description...\n")
clean_description = []
for i in range(0, num_description):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print("Description %d of %d\n" % ( i+1, num_description))                                                                  
    clean_description.append( description_to_words(df['MaterialDescription'][i] ))

Cleaning and parsing the training set UNSPSC description...

Description 1000 of 45001

Description 2000 of 45001

Description 3000 of 45001

Description 4000 of 45001

Description 5000 of 45001

Description 6000 of 45001

Description 7000 of 45001

Description 8000 of 45001

Description 9000 of 45001

Description 10000 of 45001

Description 11000 of 45001

Description 12000 of 45001

Description 13000 of 45001

Description 14000 of 45001

Description 15000 of 45001

Description 16000 of 45001

Description 17000 of 45001

Description 18000 of 45001

Description 19000 of 45001

Description 20000 of 45001

Description 21000 of 45001

Description 22000 of 45001

Description 23000 of 45001

Description 24000 of 45001

Description 25000 of 45001

Description 26000 of 45001

Description 27000 of 45001

Description 28000 of 45001

Description 29000 of 45001

Description 30000 of 45001

Description 31000 of 45001

Description 32000 of 45001

Description 33000 of 45001

Description 34000 of 450

#### Printing the list containing the useful words extracted from the training dataset

In [7]:
print(clean_description[:10])

['airtel bills aug aug', 'airtel mobile bills nov dec', 'aluminum fabrication work', 'aluminum fabrication work', 'civil plumbing work bmt', 'electrical wk elr lab mhb', 'glass door fr gastro main door', 'hosp tack fixing st flr medicine', 'magazine week magic pot', 'magazine week magic pot']


#### Creating bag of words from the useful word extraction 

In [8]:
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 8000) 
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_description)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()
print(train_data_features.shape)

Creating the bag of words...

(45001, 8000)


#### Random Forest for training the model

In [9]:
import time

print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 500) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
start = time.time()
# This may take a few minutes to run
forest = forest.fit( train_data_features, df['UNSPSC_Final'] )
end = time.time() 

total = end - start

print("Total time taken to train the model is ", total)

Training the random forest...
Total time taken to train the model is  1851.8335661888123


In [10]:

test = pd.read_csv('UNSPSCtestDataSet.csv',encoding='mac_roman',low_memory=False)
# Verify that there are 25,000 rows and 2 columns
print(test.shape)
# Create an empty list and append the clean reviews one by one
num_desc = len(test["MaterialDescription"])
clean_test_desc = [] 

print("Cleaning and parsing the test set movie reviews...\n")
for i in range(0,num_desc):
    if( (i+1) % 1000 == 0 ):
        print("desc %d of %d\n" % (i+1, num_desc))
    clean_desc = description_to_words(test["MaterialDescription"][i])
    clean_test_desc.append(clean_desc)

(49657, 20)
Cleaning and parsing the test set movie reviews...

desc 1000 of 49657

desc 2000 of 49657

desc 3000 of 49657

desc 4000 of 49657

desc 5000 of 49657

desc 6000 of 49657

desc 7000 of 49657

desc 8000 of 49657

desc 9000 of 49657

desc 10000 of 49657

desc 11000 of 49657

desc 12000 of 49657

desc 13000 of 49657

desc 14000 of 49657

desc 15000 of 49657

desc 16000 of 49657

desc 17000 of 49657

desc 18000 of 49657

desc 19000 of 49657

desc 20000 of 49657

desc 21000 of 49657

desc 22000 of 49657

desc 23000 of 49657

desc 24000 of 49657

desc 25000 of 49657

desc 26000 of 49657

desc 27000 of 49657

desc 28000 of 49657

desc 29000 of 49657

desc 30000 of 49657

desc 31000 of 49657

desc 32000 of 49657

desc 33000 of 49657

desc 34000 of 49657

desc 35000 of 49657

desc 36000 of 49657

desc 37000 of 49657

desc 38000 of 49657

desc 39000 of 49657

desc 40000 of 49657

desc 41000 of 49657

desc 42000 of 49657

desc 43000 of 49657

desc 44000 of 49657

desc 45000 of 49657



In [11]:

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_desc)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

In [12]:
# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"unspscRFcode":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "finalunspscRF8000-500.csv", index=False)

In [14]:
import numpy as np
fromcsv = pd.read_csv("finalunspscRF8000-500.csv")
from sklearn.metrics import accuracy_score
accuracy_score(test['UNSPSC_Final'],fromcsv['unspscRFcode'])

0.57307126890468618