#### Loading Dataset

In [2]:
from clean_data_helper import motech

In [3]:
df = motech()

0    Hacking with Python The Ultimate Beginner’s Guide
1          machine learning for the small and the many
2    Applied Machine Learning, Feature encoding and...
3    Hands-On Machine Learning with Scikit-Learn an...
4    Hands-On Machine Learning with Scikit-Learn an...
Name: BOOK, dtype: object


#### Importing Necessary Libraries

In [5]:
# Silence Warning
import warnings
warnings.filterwarnings('ignore')

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC

#### Feature Exploration

In [21]:
# Checking Missing Values
df.isnull().sum()

CATEGORY       0
BOOK           0
category_id    0
dtype: int64

In [22]:
# Checking the data dtypes
df.dtypes

CATEGORY       object
BOOK           object
category_id     int64
dtype: object

In [23]:
# Check the shape of dataset
df.shape

(198, 3)

In [24]:
# Checking unique BOOKs
df.CATEGORY.unique()

array(['Programming', 'Artificial Intelligence', 'Deep learning',
       'Computer Science', 'deep learning', 'natural language processing',
       'machine learning', 'Python OpenCV', 'Web development',
       'Assembly Language', 'Data science', 'Machine learning'],
      dtype=object)

#### Feature Processing And Dataset Split

In [28]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.BOOK).toarray()
labels = df.category_id

In [34]:
x_train, x_test, y_train, y_test = train_test_split(df['BOOK'], df['CATEGORY'], random_state=0, train_size=0.8, test_size=0.2)

In [35]:
count_vectorizer = CountVectorizer()
x_train_occurences = count_vectorizer.fit_transform(x_train)
tfidf_transformer = TfidfTransformer()
x_tfidf = tfidf_transformer.fit_transform(x_train_occurences)
model = LinearSVC().fit(x_tfidf, y_train)

In [43]:
from sklearn.model_selection import cross_val_score
v = cross_val_score(model, x_tfidf, y_train, cv=10)
for i in range(10):
    print("Accuracy of SVM is : {0:2%}".format(v[i,]))
    
print("")
print("Mean Accuracy of SVM is ", v.mean())

Accuracy of SVM is : 52.173913%
Accuracy of SVM is : 52.631579%
Accuracy of SVM is : 77.777778%
Accuracy of SVM is : 64.705882%
Accuracy of SVM is : 64.705882%
Accuracy of SVM is : 80.000000%
Accuracy of SVM is : 69.230769%
Accuracy of SVM is : 75.000000%
Accuracy of SVM is : 75.000000%
Accuracy of SVM is : 83.333333%

Mean Accuracy of SVM is  0.6945591370386094


#### Making Prediction of books based on short phrase

In [49]:
# Making Prediction - The Heart of ML
data = 'In classification, the algorithms used is logistic regression, decision tree and naive bayes'
prediction = model.predict(count_vectorizer.transform([data])[0])
print(prediction[0])

machine learning


In [50]:
data = 'Before creating HTML page and doing your Javascript code on the front end make sure you set up your database. I highly recommend you to use MySQl'
prediction = model.predict(count_vectorizer.transform([data])[0])
print(prediction[0])

Web development


In [51]:
data = 'During the face detection and recognition there must be feature extraction of person to be recognized. Most common method used in face recognition in Hear Cascade Classifier'
prediction = model.predict(count_vectorizer.transform([data])[0])
print(prediction[0])

Python OpenCV


#### Dumping and loading the model and vectorizer in disk

In [53]:
import pickle
# Save the model
vec_file = 'vectorizer.pickle'
pickle.dump(count_vectorizer, open(vec_file, 'wb'))

In [56]:
# Save the model
mod_file = 'classification.model'
pickle.dump(model, open(mod_file, 'wb'))

In [66]:
# Loading Vectorizer from Disk
loaded_vectorizer = pickle.load(open('vectorizer.pickle', 'rb'))

# Loading Model from Disk
loaded_model = pickle.load(open('classification.model', 'rb'))

#### Making prediction from real PDF Books

In [76]:
import PyPDF2

# Open PDF file
object = PyPDF2.PdfFileReader('opencv_tutorial.pdf')

# Get number of pages
NumPages = object.getNumPages()
print('Number of pages are ',NumPages)

Text = ''

if(NumPages<100):
    print('PDF Book Must Contain At least 100 Pages In Order To Upload In Rep')
else:
    print('This PDF Qualifies, Waiting To Be Uploaded')
    # Extracting Text of 20 First pages
    for i in range(0,20):
        PageObj = object.getPage(i)
        Text += PageObj.extractText()
        
#print(Text)        

Number of pages are  28
PDF Book Must Contain At least 100 Pages In Order To Upload In Rep


In [75]:
# Making prediction using Model and vectorizer loaded from Disk
prediction = loaded_model.predict(loaded_vectorizer.transform([Text])[0])
print(prediction[0])

Artificial Intelligence


In [77]:
# You're machine learning engineer already....... What next next woooooah

# get prepared to deploy our model on Django, Python web framework for perfectionist 