This code takes in a dataset of women's clothing reviews and uses it to train a Support Vector Machine classifier. The model then takes in new reviews and assigns each of them a score from 1 to 5, indicating how good or bad the review is.

In [26]:
import re
import nltk
import pandas as pd
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import svm, naive_bayes, linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import word_tokenize

We clean the data before using it: we keep only the columns we need, drop duplicate rows, and remove any rows that contain NaN values.


In [27]:
# Read the raw reviews CSV from the project's data folder.
data = pd.read_csv(filepath_or_buffer='data/womenReviews.csv')
# Wrap the loaded table in a DataFrame named `df`, which the following cells use.
# (read_csv already returns a DataFrame, so this only re-wraps the same data.)
df = pd.DataFrame(data=data)

In [28]:
# Only the review title and its rating are kept, to keep the model simple.
cols2use = ['title', 'rating']
# Every other column present in the dataset is collected here...
unwanted_columns = [column for column in df.columns if column not in cols2use]
# ...and dropped, giving a new frame (df itself is left untouched).
df2use = df.drop(columns=unwanted_columns)

In [29]:
# Remove repeated rows first, then any row that still has a missing value.
# Written as one chain that rebinds df2use instead of two in-place calls.
df2use = df2use.drop_duplicates().dropna()

We download the necessary corpora from NLTK. We then set up the tools used to remove unnecessary common words (stop words) and to reduce words to their root forms.

In [30]:
# Downloading the NLTK resources this notebook relies on:
#   'stopwords' - word lists read via nltk.corpus.stopwords
#   'punkt'     - tokenizer data used by word_tokenize
#   'wordnet'   - dictionary data used by WordNetLemmatizer
# Each call logs its progress; the cell displays the return value of the last call.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/astra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/astra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/astra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
# The lemmatizer maps a word onto its dictionary form (lemma).
# When called without a part-of-speech tag it treats words as nouns,
# e.g. "dresses" -> "dress".
lemmatizer = WordNetLemmatizer()
# English stop words from NLTK's corpus: very common words (a, the, in, ...)
# that carry little meaning on their own and are filtered out of the reviews.
# Stored as a set so membership checks are fast.
stop_words = set(stopwords.words('english'))

This function takes in a review and cleans it by converting the text to lowercase, removing numbers, removing punctuation, removing common words, and converting the remaining words to their root form.

In [32]:
def preprocess_text(text):
    """Clean a single review string so it can be vectorized.

    The steps run in this order: lowercase the text, delete digits, delete
    punctuation (underscores included), tokenize, drop tokens found in
    ``stop_words``, lemmatize the remaining tokens, and join them back
    together with single spaces.

    Args:
        text: the raw review text as a ``str``.

    Returns:
        A space-separated string of the cleaned tokens; an empty string when
        no token survives the filtering.

    NOTE(review): punctuation is stripped before the stop-word check, so a
    contraction such as "don't" reaches the filter as "dont". If
    ``stop_words`` only holds the apostrophe form, such tokens are kept --
    confirm whether that is intended.
    """
    # Lowercase first so that "Great" and "great" become the same token.
    lowered = text.lower()
    # Drop every run of digits.
    without_digits = re.sub(r'\d+', '', lowered)
    # `\w` also matches "_", so `[^\w\s]` alone would leave underscores in
    # the text; the extra `|_` alternative removes them as punctuation too.
    stripped = re.sub(r'[^\w\s]|_', '', without_digits)
    # Split the cleaned text into individual word tokens.
    tokens = word_tokenize(stripped)
    # Keep only non-stop-word tokens and lemmatize each one. No part-of-speech
    # tag is passed to lemmatize().
    kept_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(kept_tokens)

In [33]:
# Run preprocess_text over every review title...
cleaned_titles = df2use['title'].apply(preprocess_text)
# ...and store the cleaned text back in the same column.
df2use['title'] = cleaned_titles

In [34]:
# TF-IDF turns each cleaned title into a numeric vector the ML models can consume.
vectorizer = TfidfVectorizer()
# fit_transform does two things in one call:
#   1. fit       - learns the vocabulary (and word weights) from our titles
#   2. transform - encodes every title using what was learned
tfidf_matrix = vectorizer.fit_transform(df2use['title'])
# The result is sparse; convert it to a regular (dense) array.
X = tfidf_matrix.toarray()

In [35]:
# Target labels: the rating column cast to int. If the column already holds
# integers the values are unchanged.
y = df2use.loc[:, 'rating'].astype(int)

We split the vectorized reviews and their ratings into training and test sets, holding out 20% of the data for testing.

In [36]:
# Split features and labels into training and testing datasets:
# 20% of the rows are held out for testing, and the fixed random_state
# makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=21)
X_train, X_test, y_train, y_test = split_parts

[nltk_data] Downloading package stopwords to /home/astra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/astra/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/astra/nltk_data...


True

In [58]:
# Hyper-parameter grid handed to the grid search.
# Every entry lists exactly one value, so the grid holds a single candidate.
param_grid = dict(
    n_estimators=[200],
    max_depth=[None],
    min_samples_split=[10],
    min_samples_leaf=[2],
    max_features=['sqrt'],
    bootstrap=[False],
    class_weight=[None],
)


In [59]:
# Base random-forest classifier; the fixed seed keeps its randomness repeatable.
rf = RandomForestClassifier(random_state=42)
# Grid search over param_grid around that base model:
# 2-fold cross-validation, verbose progress logging, and all CPU cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=2,
    verbose=2,
    n_jobs=-1,
)

In [60]:
# Run the grid search on the training split only; the test split stays unseen.
grid_search.fit(X=X_train, y=y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[CV] END bootstrap=False, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  52.8s
[CV] END bootstrap=False, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  53.9s


In [61]:
# Show the parameter combination the grid search selected.
best_params = grid_search.best_params_
print('Best parameters:', best_params)

Best parameters: {'bootstrap': False, 'class_weight': None, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}


In [62]:
# Take the best estimator found by GridSearchCV and use it as `model`.
# NOTE(review): the SVC cell further down assigns `model` again, so when the
# notebook runs top to bottom this value is replaced before evaluation.
best_rf_model = grid_search.best_estimator_
model = best_rf_model

In [13]:
# Build a support vector classifier (SVC with its default settings) as our model.
# NOTE(review): this rebinds `model`, replacing whatever an earlier cell stored
# under that name (the grid-search estimator, when cells run top to bottom) --
# confirm which classifier is meant to be evaluated and saved.
model = svm.SVC()
# Train the classifier on the training split.
model.fit(X=X_train, y=y_train)

We check the accuracy of the trained model.

In [63]:
# Predict a rating for every review in the held-out test data.
prediction = model.predict(X_test)
# Accuracy = fraction of test reviews whose predicted rating equals the true one.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=prediction)
print(accuracy)

0.5518630412890232


After we are satisfied with our accuracy, we save the model together with the fitted vectorizer. For now, we are using joblib (which pickles the objects) to save them.

In [15]:
import os

import joblib

# joblib.dump does not create missing parent folders, so make sure the
# output directory exists; exist_ok=True makes this a no-op on re-runs.
os.makedirs('model', exist_ok=True)

# Save the trained model
joblib.dump(model, 'model/model.pkl')

# Save the fitted vectorizer as well: new reviews must be encoded with the
# same vocabulary before the model can score them.
joblib.dump(vectorizer, 'model/vectorizer.pkl')

['vectorizer.pkl']