In [1]:
from sklearn.datasets import fetch_20newsgroups 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.ensemble import RandomForestClassifier
from nltk.stem.snowball import SnowballStemmer 
from textblob import TextBlob
import pandas as pd
import pickle


In [2]:
# Read the self-built dataset once and keep the raw frame under its own name.
dataset = pd.read_csv('Dataset.csv')
# Y: the labels live in the first column.
train_target = dataset.iloc[:, 0]
# X: the second column, unwrapped into a plain numpy array.
train_data = dataset.iloc[:, 1].values


In [3]:
# Categories the dataset was built on: Construction Work, Agriculture,
# IT Services, Clothing and Financial Services.

# English Snowball stemmer, shared by every stemming step in this notebook.
snowball = SnowballStemmer('english')

# Turn each training document into a space-separated string of stemmed words,
# closed off by a lone "." token.
training_data_stm = [
    " ".join([snowball.stem(token) for token in TextBlob(document).words] + ["."])
    for document in train_data
]

In [4]:
# Fit a bag-of-words vocabulary on the stemmed training documents and encode
# them as a document-term count matrix.
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data_stm)

# Persist the fitted vectorizer itself. The previous code pickled `train_tc`
# (the training matrix) into this file, which cannot encode new text; the
# fitted vectorizer carries the vocabulary needed for that.
# The context manager makes sure the file is flushed and closed.
with open('save_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

print("\nDimensions of training data:", train_tc.shape)


Dimensions of training data: (60, 1320)


In [5]:
# Fit a Term Frequency - Inverse Document Frequency (tf-idf) transformer on the
# training counts and re-weight them.
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

# Persist the fitted transformer itself. The previous code pickled
# `train_tfidf` (the transformed training matrix) into this file, which cannot
# be used to re-weight new count vectors.
# The context manager makes sure the file is flushed and closed.
with open('save_tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [6]:
# Hand-written sentences used to try out the classifier.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# results cell at the end of the notebook iterates over `input`.
input = [
    'There is a house made of steel rods and cement across the road.',
    'I am wearing a beautiful dress made of cotton.',
    'Information technology is at its peak and I need consultancy for the same.',
    'The soil is dry and crops are cultivated and fertilizers are sprayed.'
]

# Stem the test sentences the same way as the training text: stemmed words
# joined by single spaces, closed off by a lone "." token.
test_data_stm = [
    " ".join([snowball.stem(token) for token in TextBlob(sentence).words] + ["."])
    for sentence in input
]

In [7]:
# Train a Multinomial Naive Bayes classifier on the tf-idf training features.
classifier = MultinomialNB().fit(train_tfidf, train_target)

# Persist the trained model; the context manager makes sure the file is
# flushed and closed instead of leaving the handle to the garbage collector.
with open("save_output.pkl", "wb") as model_file:
    pickle.dump(classifier, model_file)

# Alternative (disabled): RandomForestClassifier(max_depth=20, n_estimators=15)
# fitted on the same train_tfidf / train_target.

In [8]:
# Encode the stemmed test sentences with the count vectorizer that was fitted
# on the training documents (transform only: no new vectorizer is built or
# fitted here, so the columns match the training matrix).
input_tc = count_vectorizer.transform(test_data_stm) 

In [9]:
# Re-weight the test counts with the tf-idf transformer that was fitted on the
# training counts (transform only: no new transformer is created here).
input_tfidf = tfidf.transform(input_tc)

In [10]:
# Predict a category label for each test sentence from its tf-idf features.
predictions = classifier.predict(input_tfidf) 

In [11]:
# Print each test sentence next to its predicted category.
# Zipping the sentences with the predictions keeps the two in lock-step
# without a manual counter, and prints the sentence itself rather than the
# 1-tuple that zip() over a single sequence would produce.
for sentence, category in zip(input, predictions):
    print('\nInput:', sentence, '\nPredicted category:', category)


Input: ('There is a house made of steel rods and cement across the road.',) 
Predicted category: Construction Work

Input: ('I am wearing a beautiful dress made of cotton.',) 
Predicted category: Clothing

Input: ('Information technology is at its peak and I need consultancy for the same.',) 
Predicted category: IT service

Input: ('The soil is dry and crops are cultivated and fertilizers are sprayed.',) 
Predicted category: Agriculture
