In [None]:
from sklearn.datasets import fetch_20newsgroups 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.ensemble import RandomForestClassifier
from nltk.stem.snowball import SnowballStemmer 
from textblob import TextBlob
import pandas as pd
import pickle


In [None]:
# Read the self-built dataset from disk into a DataFrame
dataset_frame = pd.read_csv('Dataset.csv')
# First column (position 0) carries the Y values; kept as a pandas Series
train_target = dataset_frame.iloc[:, 0]
# Second column (position 1) carries the X values; exposed as a NumPy array
train_data = dataset_frame.iloc[:, 1].values


In [None]:
# Categories the dataset was built on: Construction Work, Agriculture,
# IT Services, Clothing and Financial Services

# Snowball stemmer for English; `.stem(word)` returns the stem of one word
# (the stemmer object itself is not callable).
snowball = SnowballStemmer('english')

# Build the stemmed corpus: one string per row of `train_data`, made of the
# stemmed tokens joined by single spaces so that CountVectorizer can split
# them back into separate terms. `train_data` itself is left untouched, which
# keeps this cell idempotent on re-run.
# NOTE(review): TextBlob expects `str` input — a missing (NaN) text cell in
# Dataset.csv would raise here; confirm the column has no gaps.
training_data = [
    ' '.join(snowball.stem(word) for word in TextBlob(text).words)
    for text in train_data
]

In [None]:
# Build a count vectorizer, learn the vocabulary from the training documents
# and extract the per-document word counts
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data)
# Persist the *fitted vectorizer* (not the count matrix): its learned
# vocabulary is what is needed to vectorize new text later on. The context
# manager guarantees the file is flushed and closed.
with open('save_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)
print("\nDimensions of training data:", train_tc.shape)

In [None]:
# Create the Term Frequency - Inverse Document Frequency (tf-idf) transformer
# and fit it on the training word counts
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)
# Persist the *fitted transformer* (not the tf-idf matrix): its learned IDF
# weights are what is needed to re-weight new count vectors later on. The
# context manager guarantees the file is flushed and closed.
with open('save_tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [None]:
# Hand-written sample sentences, kept as a list of plain strings, that serve
# as the test input for the trained pipeline
input_data = [
    'There is a house made of steel rods and cement across the road.',
    'I am wearing a beautiful dress made of cotton.',
    'Information technology is at its peak and I need consultancy for the same.',
    'The soil is dry and crops are cultivated and fertilizers are sprayed.',
]

In [None]:
# Train a Multinomial Naive Bayes classifier on the tf-idf features of the
# training documents, using the labels read from the dataset
classifier = MultinomialNB().fit(train_tfidf, train_target)
# Persist the fitted classifier; the context manager guarantees the file is
# flushed and closed
with open("save_output.pkl", "wb") as model_file:
    pickle.dump(classifier, model_file)
# Alternative model (disabled) — fit on the same features and labels:
#classifier = RandomForestClassifier(max_depth=20, n_estimators=15)
#classifier.fit(train_tfidf, train_target)

In [None]:
# Stem the test sentences with the Snowball stemmer (as the training text is
# stemmed) so that inflected words such as "cultivated" map onto the stemmed
# vocabulary, then count words with the already-fitted vectorizer.
# `transform` (not `fit_transform`) keeps the training vocabulary unchanged.
input_stemmed = [
    ' '.join(snowball.stem(word) for word in TextBlob(sentence).words)
    for sentence in input_data
]
input_tc = count_vectorizer.transform(input_stemmed)

In [None]:
# Re-weight the test word counts with the tf-idf transformer created for the
# training data; `transform` is called (no re-fit), so the transformer's
# existing state is applied to the test counts
input_tfidf = tfidf.transform(input_tc)

In [None]:
# Predict the categories of the test sentences from their tf-idf features
# using the trained classifier
predictions = classifier.predict(input_tfidf) 

In [None]:
# Report each input sentence next to its predicted category.
# Zipping the sentences with the predictions pairs them up directly, so the
# sentence is printed as plain text (a bare `zip(input_data)` would wrap each
# sentence in a 1-tuple) and no manual index counter is needed.
for sentence, category in zip(input_data, predictions):
    print('\nInput:', sentence, '\nPredicted category:', category)