# Machine Learning Engineer Nanodegree

In [None]:
# Print stuff inline
%matplotlib inline

# Imports
import re
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import cross_validation
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from string import digits
from bs4 import BeautifulSoup
import warnings

In [None]:
# Load the data and drop the columns we don't care about.
# Only a missing/unreadable CSV is reported as "download the dataset"; any
# other failure (for example a column name that no longer exists) is left to
# raise so it is not hidden behind a misleading message.
# print(...) with a single argument behaves the same on Python 2 and 3.
try:
    data = pd.read_csv("Amazon_Unlocked_Mobile.csv")
except IOError:
    print("You need to download the dataset from https://www.kaggle.com/PromptCloudHQ/amazon-reviews-unlocked-mobile-phones")
else:
    # Non-inplace drop: re-running this cell always starts from the fresh CSV.
    data = data.drop(['Product Name', 'Brand Name', 'Price', 'Review Votes'], axis=1)
    print("{} Amazon mobile phone reviews with {} features each.".format(*data.shape))

In [None]:
def clean_text(string):
    """Reduce a raw review to lowercase letters separated by spaces.

    The text is first passed through BeautifulSoup (lxml parser) to strip
    markup, then every character outside a-z / A-Z is replaced by a space,
    and finally the result is lowercased.

    Args:
        string: raw review text, possibly containing HTML markup.

    Returns:
        The cleaned, lowercased text.
    """
    # Silence BeautifulSoup's warnings only for the duration of the parse.
    # catch_warnings() restores the previous filters on exit, whereas
    # warnings.resetwarnings() would discard every filter set elsewhere.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        review_text = BeautifulSoup(string, "lxml").get_text()
    # Clean the parsed text rather than the raw input, so tag names and
    # attributes do not end up as tokens.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    return letters_only.lower()

In [None]:
# Keep only the rows that actually carry review text, then pull out the
# labels (star ratings) and the cleaned text for those same rows.
has_review = data['Reviews'].notnull()
data = data[has_review]
ratings = data['Rating']
reviews = [clean_text(raw_review) for raw_review in data['Reviews']]

In [None]:
# Turn the cleaned review text into a TF-IDF matrix, ignoring English stop
# words. Note that `reviews` is rebound here from the list of strings to the
# vectorized matrix, so re-run the cleaning cell before re-running this one.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)
reviews = vectorizer.fit_transform(reviews)

In [None]:
# Peek at the learned vocabulary. Only the size and a short sample are shown:
# the full list has one entry per distinct token and would flood the output.
feature_names = vectorizer.get_feature_names()
print("{} features in the vocabulary".format(len(feature_names)))
feature_names[:25]

In [None]:
# Hold out 30% of the reviews for evaluation. The split is seeded so that the
# scores and confusion matrix below are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    reviews, ratings, test_size=0.3, random_state=42)
print(X_train.shape)

In [None]:
# Model 1: multinomial Naive Bayes with a very small smoothing term (alpha).
clf1 = MultinomialNB(alpha=0.0001)
clf1.fit(X_train, y_train)

In [None]:
# Model 2: gradient-boosted trees with default hyper-parameters. Seeded so
# that repeated runs on the same split build the same ensemble.
clf2 = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

In [None]:
# Model 3: logistic regression with the library's default settings.
clf3 = linear_model.LogisticRegression()
clf3.fit(X_train, y_train)

In [None]:
# Report held-out accuracy for every model trained above, labelled so the
# numbers can be compared at a glance (previously only clf1 was scored).
for model_name, model in [("MultinomialNB", clf1),
                          ("GradientBoosting", clf2),
                          ("LogisticRegression", clf3)]:
    print("{}: {:.4f}".format(model_name, model.score(X_test, y_test)))

In [None]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, labels=None):
    """Draw a confusion matrix as a colour-mapped image on the current figure.

    Args:
        cm: 2-D confusion matrix to display; rows are drawn along the
            'True label' axis and columns along the 'Predicted label' axis.
        title: text placed above the plot.
        cmap: matplotlib colormap used for the cells.
        labels: optional sequence of class labels for the tick marks, in the
            same order as the rows/columns of `cm`. When omitted, the sorted
            distinct values of the notebook-level `ratings` are used.

    Returns:
        None. The plot is drawn on the current pyplot figure; the caller is
        responsible for creating the figure and calling plt.show().
    """
    if labels is None:
        # A bare set has no guaranteed iteration order; sort it so the ticks
        # are in ascending label order. Computed once rather than per axis.
        labels = sorted(set(ratings))
    else:
        labels = list(labels)
    tick_marks = np.arange(len(labels))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run tight_layout last so the axis labels are part of the layout pass.
    plt.tight_layout()

In [None]:
# Confusion matrix for the Naive Bayes model on the held-out set.
# The class order is pinned to the sorted distinct ratings so the matrix always
# has one row/column per rating, even if a rating is absent from the test
# split, and so it lines up with the tick labels drawn by the plot helper.
class_labels = sorted(set(ratings))
predictions = clf1.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=class_labels)
# Affects how numpy arrays are printed from here on (two decimal places).
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cm)
plt.show()