#### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('UNSPSCdataset.csv',encoding='mac_roman',low_memory=False)
output = 'UNSPSC_Final'
y = df[output]
features = ['MaterialDescription']
X = df[features]
X.head()

Unnamed: 0,MaterialDescription
0,AIRTEL BILLS 22aug TO 23 AUG 2012
1,AIRTEL MOBILE BILLS 23nov O 22 dec 12
2,aluminum fabrication work
3,aluminum fabrication work
4,civil & plumbing work @ BMT


#### Downloading stopwords from nltk

In [4]:
import nltk
nltk.download()  # Download text data sets, including stop words

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> stopwords
    Downloading package stopwords to /home/carnd/nltk_data...
      Unzipping corpora/stopwords.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [2]:
import re

from nltk.corpus import stopwords # Import the stop word list

def description_to_words(review_text):
    
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))

# Get the number of reviews based on the dataframe column size
num_description = df['MaterialDescription'].size
print("Cleaning and parsing the training set UNSPSC description...\n")
clean_description = []
for i in range(0, num_description):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%20000 == 0 ):
        print("Description %d of %d\n" % ( i+1, num_description))                                                                  
    clean_description.append( description_to_words(df['MaterialDescription'][i] ))


Cleaning and parsing the training set UNSPSC description...

Description 20000 of 45001

Description 40000 of 45001



#### Creating bag of words from the useful word extraction 

In [3]:
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 15000) 
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_description)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()
print(train_data_features.shape)

Creating the bag of words...

(45001, 10338)


#### XGBOOTING

In [4]:
new_df = pd.DataFrame(train_data_features)
print(type(new_df))

<class 'pandas.core.frame.DataFrame'>


In [5]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
"""
y = df[output]
features = ['MaterialDescription']
X = df[features]

"""



"\ny = df[output]\nfeatures = ['MaterialDescription']\nX = df[features]\n\n"

In [6]:
def modelfit(alg, new_df, y,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(new_df.values, label=y.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(new_df, y,eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(new_df)
    dtrain_predprob = alg.predict_proba(new_df)[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))
    
    """
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    """

In [None]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective='multi:softmax',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, new_df, y)

### Touch below after fixing above issues

In [4]:

test = pd.read_csv('UNSPSCtestDataSet.csv',encoding='mac_roman',low_memory=False)
# Verify that there are 25,000 rows and 2 columns
print(test.shape)
# Create an empty list and append the clean reviews one by one
num_desc = len(test["MaterialDescription"])
clean_test_desc = [] 

print("Cleaning and parsing the test set movie reviews...\n")
for i in range(0,num_desc):
    if( (i+1) % 20000 == 0 ):
        print("desc %d of %d\n" % (i+1, num_desc))
    clean_desc = description_to_words(test["MaterialDescription"][i])
    clean_test_desc.append(clean_desc)
    
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_desc)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)
# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"unspscRFcode":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "finalunspscRF8000-500.csv", index=False)

import numpy as np
fromcsv = pd.read_csv("finalunspscRF8000-500.csv")
from sklearn.metrics import accuracy_score
accuracy_score(test['UNSPSC_Final'],fromcsv['unspscRFcode'])

(49657, 2)
Cleaning and parsing the test set movie reviews...

desc 20000 of 49657

desc 40000 of 49657

