This code takes in a dataset of cloth reviews from women to train a Support Vector Machine Classifier. The model then takes in new reviews and assigns them score from 1-5 indicating good or bad reviews

In [53]:
import pandas as pd
from sklearn import svm, naive_bayes, linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import word_tokenize

We clean the data for any irregularities, nan values, and we also perform some extra operations. We remove unnecessary columns from the data, drop the duplicate values, and also remove any nan values.


In [39]:
# importing csv data into pandas dataframe
data = pd.read_csv('data/womenReviews.csv')
df = pd.DataFrame(data)  # now the df variable contains our data in a dataframe
# print(df) 

In [40]:
# removing unnecessary columns
cols2use = ['title', 'rating']  # for simplicity we are taking only the title, and rating columns from the dataset
# if columns is not in cols2use, then we drop the columns
df2use = df.drop([x for x in df.columns if x not in cols2use], axis=1)
# print(df2use)

In [41]:
# removing the duplicate values if any
df2use.drop_duplicates(inplace=True)
#dropping nan values from the dataset
df2use.dropna(inplace=True)

We download necessary corpus from nltk. We remove the unnecessary common wods, and also reduce the words to their root forms.

In [21]:
# downloading the stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/astra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/astra/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/astra/nltk_data...


True

In [42]:
# setting up stopwords from english using nltk's corpus
# stopwords are common words that we filter out because they dont
# carry much meaning on their own eg: a, the, in etc
stop_words = set(stopwords.words('english'))
# lemmatizer is a tool that reduces the words to their root form.
# eg: loving to love, hated to hate etc
lemmatizer = WordNetLemmatizer()

This function takes in the reviews, and performs operations like converting texts to lowercase, removing numbers from reviews, removing punctuations, removing common words, and converting the words to their root form.

In [43]:
def preprocess_text(text):
    text = text.lower()  # converting all the reviews to lowercase
    text = re.sub(r'\d+', '', text)  # removing numbers from reviews
    text = re.sub(r'[^\w\s]', '', text)  # removing the punctuations from the reviews
    # now we create tokens i.e extract words from the reviews
    tokenized_text = word_tokenize(text)  # Tokenization
    # then we take each word in the review, and convert it into its root form, and remove common words
    text = [lemmatizer.lemmatize(word) for word in tokenized_text if word not in stop_words] 
    return ' '.join(text)

In [44]:
# now we process all the reviews in our dataset using the 
# preprocessing functions
df2use['title'] = df2use['title'].apply(preprocess_text)

In [45]:
# we now convert the text into vectors that can be understood by the machine learning models
vectorizer = TfidfVectorizer()
# fit learns the vocab from the data, and transform uses the learned
# vocab to encode the data
X = vectorizer.fit_transform(df2use['title']).toarray()

In [49]:
# y is already integer so it doesnt need to be transformed
y = df2use['rating'].astype(int)

In [50]:
# then we split the data into training and testing datasetr
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
# we are using svc from support vector machine as our classifier
# classifier or model making
model = svm.SVC()
model.fit(X_train, y_train)  # fitting the data into the classifier

We check the accuracy of the trained model.

In [57]:
# now we make predictions for the test data
prediction = model.predict(X_test)
# calculating the accuracy of the prediction of the test data
accuracy = metrics.accuracy_score(y_test,prediction)
print(accuracy)

0.5595837529372273


After we are satisfied with our accuracy, we save the model. For now, we are using pickle to save the model.

In [None]:
import pickle
# # save the model to disk
filename = 'model/clothReviewModel.pkl'
pickle.dump(model, open(filename, 'wb'))

Loading and using the saved model to classify further reviews

In [None]:
# loading the saved model
modelname = 'model/clothReviewModel.pkl'
model = pickle.load(open(modelname, 'rb'))

In [58]:
# creating sample reviews for testing
singleReview = ['The cloth was a great fit. I looked very good, and i felt comfortable.']
reviewSamples = ['it did not fit me at all. Absolutely hated it.', 'it was satisfactory.']

In [59]:
# transforming sample texts into vectors
single_vector = vectorizer.transform(singleReview)
single = single_vector.apply()
reviewSampleVector = vectorizer.transform(reviewSamples)

In [60]:
# checking the predictions of the samples
prediction = model.predict(single_vector)
print(prediction)
prediction = model.predict(reviewSampleVector)
print(prediction)

ValueError: cannot use sparse input in 'SVC' trained on dense data

In [None]:
def preprocess_text(text):
    # we convert all the text to lowercase to avoid duplicacy due to cases
    text = text.lower()  
    # we then remove any numbers from the text
    text = re.sub(r'\d+', '', text)  
    # also, we remove punctuations from the text as they donot carry much meaning for our purpose
    text = re.sub(r'[^\w\s]', '', text)  # Removing punctuation
    # now, we split the text into individual words
    tokenized_text = word_tokenize(text)  # Tokenization
    # then, we convert each word to root word, and remove common words
    text = [lemmatizer.lemmatize(word) for word in tokenized_text if word not in stop_words]
    return ' '.join(text)

True


In [None]:
# we are now applying the pre-processing function to each of the reviews
# in our dataset
df2use['title'] = df2use['title'].apply(preprocess_text)

In [None]:
# vectorizer converts the text into vectors that can be used by the 
# ml models, 
vectorizer = TfidfVectorizer()
# here we are working in 2 steps
# 1. fit learns the words in our dataset
# 2. transform uses the learned words to encode the review into the vectors
X = vectorizer.fit_transform(df['reviews']).toarray() #converting to array

In [None]:
# standard scalar is used to normalize the data to have a mean of 0 and deviation of 1
scaler = StandardScaler()
# fit learns the meand ad sd of ratings, and transform centers and scale the rating data
y = scaler.fit_transform(df['rating'].values.reshape(-1, 1))
# we reshape as fit_transform requires 2-d array

Now, we split the data into training and testing sets, vectorize them and then fit them into the model. Its normally a good approach to create 2 separate datasets for training and testing rather than just splitting them.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# we are using svc from support vector machine as our classifier
# classifier or model making
model = svm.SVC()
model.fit(X_train, y_train)  # fitting the data into the classifier

We check the accuracy of the trained model.

In [None]:
# now we make predictions for the test data
prediction = model.predict(y_test)
# calculating the accuracy of the prediction of the test data
accuracy = metrics.accuracy_score(y_test,prediction)
print(accuracy)

0.60844768196962


After we are satisfied with our accuracy, we save the model. For now, we are using pickle to save the model.

In [None]:
import pickle
# # save the model to disk
filename = 'model/clothReviewModel.pkl'
pickle.dump(model, open(filename, 'wb'))

Loading and using the saved model to classify further reviews

In [None]:
# loading the saved model
modelname = 'model/clothReviewModel.pkl'
model = pickle.load(open(modelname, 'rb'))

In [None]:
# creating sample reviews for testing
singleReview = ['The cloth was a great fit. I looked very good, and i felt comfortable.']
reviewSamples = ['it did not fit me at all. Absolutely hated it.', 'it was satisfactory.']

In [None]:
# transforming sample texts into vectors
single_vector = vectorizer.transform(singleReview)
reviewSampleVector = vectorizer.transform(reviewSamples)

In [None]:
# checking the predictions of the samples
prediction = model.predict(single_vector)
print(prediction)
prediction = model.predict(reviewSampleVector)
print(prediction)

[5]
[5 5]


In [None]:
# importing csv data into pandas dataframe
data = pd.read_csv('data/womenReviews.csv')
df = pd.DataFrame(data)  # now the df variable contains our data in a dataframe
# print(df) 