MLflow for Flipkart Sentiment Analysis

In [None]:
import warnings

# Install a blanket "ignore" filter so warnings raised by later cells are not printed.
warnings.simplefilter('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the review CSV into a DataFrame and display it
DATA_PATH = "/content/data.csv"
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1
...,...,...,...,...,...,...,...,...
8513,,,,,,,,5
8514,,,,,,,,2
8515,,,,,,,,4
8516,,,,,,,,1


In [None]:
# Keep only fully populated rows: a review with any missing field is discarded
df = df.dropna()

In [None]:
# Duplicates
# The "Unnamed: 0" column looks like a per-row counter carried over from the CSV
# (0, 1, 2, ... in the preview above). Comparing it would make every row unique,
# so it is excluded and only the actual review fields are compared.
content_columns = df.columns.difference(['Unnamed: 0']).tolist()
df.duplicated(subset=content_columns).sum()

0

In [None]:
# Renumber the rows 0..n-1 after the null rows were removed; the old index is discarded
df = df.reset_index(drop=True)

Function to classify reviews as positive or negative based on ratings

In [None]:
def classify_review(rating):
    """Map a numeric rating to a sentiment label.

    Returns 'Positive' when ``rating`` is 3.0 or higher, otherwise 'Negative'.
    """
    return 'Positive' if rating >= 3.0 else 'Negative'

In [None]:
# Derive the target label for every review from its rating
df['Sentiment'] = df['Ratings'].map(classify_review)

In [None]:
# Features are the raw review texts; the target is the derived sentiment label
x, y = df['Review text'], df['Sentiment']

In [None]:
# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=1,
)
x_train, x_test, y_train, y_test = split_parts
print(x_train.shape, x_test.shape)

(6410,) (1603,)


In [None]:
# Pre processing on Train and Test Data
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Text normalisers: a Porter stemmer and a WordNet lemmatizer
Stemming, Lemmatization = PorterStemmer(), WordNetLemmatizer()

Steps involved in data preprocessing:

1. Removing special characters and unwanted numerals
2. Normalize the case(lower)
3. Word Tokenization
4. Removing stop words
5. Stemming or Lemmatization


In [None]:
import nltk

# Fetch the NLTK resources used below: the stop-word list and WordNet
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Make sure the stop-word corpus is available before it is read
nltk.download('stopwords')

# Start from the NLTK English stop words and ADD review-specific noise tokens.
# preprocess() lower-cases the text before filtering, so every custom entry is
# lower-cased here as well; a capitalised entry such as 'Readmore' could never
# match a token and would silently have no effect.
custom_stop_words = {'Hii', 'it', 'Product', 'Shuttle', 'hii', 'flipkart', 'flipkartread', 'product', 'productread', 'read', 'goodread', 'shuttle', 'Readmore'}
stop_words = set(stopwords.words('english'))
stop_words.update(word.lower() for word in custom_stop_words)


def preprocess(data):
    """Clean one review string and return it as a space-joined string of tokens.

    Steps, in order: replace every non-letter character with a space,
    lower-case, split on whitespace, drop tokens found in ``stop_words``,
    lemmatize the remaining tokens with ``Lemmatization``.
    """
    # Anything outside a-z / A-Z becomes a space, then the text is lower-cased
    letters_only = re.sub("[^a-zA-Z]", " ", data).lower()

    # Whitespace tokenisation followed by stop-word filtering
    kept_tokens = (token for token in letters_only.split() if token not in stop_words)

    # Lemmatize what is left and stitch the tokens back together
    return " ".join(Lemmatization.lemmatize(token) for token in kept_tokens)

# Store a cleaned copy of every review next to the original 'Review text' column
df['Cleaned Review Text'] = df['Review text'].map(preprocess)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Clean every training review with preprocess()
x_train = x_train.map(preprocess)

In [None]:
# Clean every test review with the same preprocess() function
x_test = x_test.map(preprocess)


Bag of Words/Count Vectorizer

In [None]:
# import feature extraction methods from sklearn
from sklearn.feature_extraction.text import CountVectorizer

# instantiate a vectorizer
vect = CountVectorizer(preprocessor=preprocess)

# use it to extract features from training data
%time x_train_dtm = vect.fit_transform(x_train)

print(x_train_dtm.shape)

CPU times: user 134 ms, sys: 0 ns, total: 134 ms
Wall time: 141 ms
(6410, 2602)


In [None]:
# Import TF-IDF vectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate a TF-IDF vectorizer
tfidf_vect = TfidfVectorizer(preprocessor=preprocess)

# Use it to extract features from training data
%time x_train_tfidf = tfidf_vect.fit_transform(x_train)

print(x_train_tfidf.shape)

CPU times: user 121 ms, sys: 986 µs, total: 122 ms
Wall time: 122 ms
(6410, 2602)


Saving the final Data file

In [None]:
import pandas as pd

# Destination for the processed reviews, relative to the working directory
file_path = 'cleaned_data.csv'

# Write df (the DataFrame built above) to CSV without the row index
df.to_csv(file_path, index=False)

print(f"Data has been successfully saved to {file_path}")

Data has been successfully saved to cleaned_data.csv


Pipeline code

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
import joblib
from joblib import Memory

In [None]:
# joblib cache handed to every Pipeline below via its `memory` argument;
# cached data is written under the '.cache' directory
cachedir = '.cache'
memory = Memory(cachedir, verbose=0)

# Seed for the estimators that use randomness, so repeated runs of the notebook
# give the same models (same value as the train/test split seed).
RANDOM_STATE = 1


def _build_pipeline(classifier):
    """Return a cached Pipeline feeding count + TF-IDF features into ``classifier``.

    A fresh CountVectorizer and TfidfVectorizer are created on every call so that
    no vectorizer instance is shared between two pipelines. Step names are fixed
    ('vectorization', 'count_vectorizer', 'tfidf_vectorizer', 'classifier')
    because the parameter grids address the steps by these names.
    """
    return Pipeline([
        ('vectorization', FeatureUnion([
            ('count_vectorizer', CountVectorizer()),
            ('tfidf_vectorizer', TfidfVectorizer())
        ])),
        ('classifier', classifier)
    ], memory=memory)


# Define pipelines for selected models with both CountVectorizer and TfidfVectorizer
pipelines = {
    'naive_bayes': _build_pipeline(MultinomialNB()),
    'decision_tree': _build_pipeline(DecisionTreeClassifier(random_state=RANDOM_STATE)),
    'logistic_regression': _build_pipeline(LogisticRegression()),
    'svm': _build_pipeline(SVC()),
    'random_forest': _build_pipeline(RandomForestClassifier(random_state=RANDOM_STATE)),
    'knn': _build_pipeline(KNeighborsClassifier())
}

# Vocabulary-size options searched for both vectorizers in every pipeline
_vectorizer_grid = {
    'vectorization__count_vectorizer__max_features': [1000, 2000, 5000],
    'vectorization__tfidf_vectorizer__max_features': [1000, 2000, 5000],
}

# Define parameter grid for each algorithm: the shared vectorizer options plus
# the classifier-specific hyperparameters
param_grids = {
    'naive_bayes': {
        **_vectorizer_grid,
        'classifier__alpha': [1, 10]
    },
    'decision_tree': {
        **_vectorizer_grid,
        'classifier__max_depth': [None, 5, 10]
    },
    'logistic_regression': {
        **_vectorizer_grid,
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l2']
    },
    'svm': {
        **_vectorizer_grid,
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    'random_forest': {
        **_vectorizer_grid,
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10]
    },
    'knn': {
        **_vectorizer_grid,
        'classifier__n_neighbors': list(range(3, 21, 2)),  # odd values 3..19
        # p=1 is Manhattan distance, p=2 is Euclidean distance.
        # p=3 is not searched: the vectorizers emit sparse matrices and
        # scikit-learn's neighbours search rejects general Minkowski distance on
        # sparse input, so those candidates would only produce failed fits.
        # NOTE(review): confirm against the installed scikit-learn version.
        'classifier__p': [1, 2]
    }
}
# # Perform GridSearchCV for each algorithm (disabled draft; the search is run by the MLflow-logged loop later in the notebook)
# best_models = {}

# for algo in pipelines.keys():
#     print("*" * 10, algo, "*" * 10)
#     grid_search = GridSearchCV(estimator=pipelines[algo],
#                                param_grid=param_grids[algo],
#                                cv=5,
#                                scoring='f1',
#                                return_train_score=True,
#                                verbose=1)
#     grid_search.fit(x_train, y_train)
#     best_models[algo] = grid_search.best_estimator_
#     y_pred = grid_search.best_estimator_.predict(x_test)
#     f1 = f1_score(y_test, y_pred, pos_label='Positive', average='weighted')
#     print('F1 Score on Test Data:', f1)


In [None]:
# Show the path of the Python interpreter this kernel is running on
import sys
sys.executable

'/usr/bin/python3'

In [None]:
!pip install mlflow


Collecting mlflow
  Downloading mlflow-2.11.3-py3-none-any.whl (19.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.7/19.7 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=3.1.9 (from mlflow)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Collecting packaging<24 (from mlflow)
  Downloading packaging-23.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.0.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# mlflow must be imported before its first use; on a fresh kernel this cell runs
# before any other cell has imported it.
import mlflow

# Send runs to an MLflow tracking server on this machine. A server must already
# be listening on port 5000 (for example one started with `mlflow ui`);
# otherwise later MLflow calls fail with a connection error.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

Auto Logging Experiment Run using MLFlow

Step 1 - Import MLFlow and set the experiment name

In [None]:
import mlflow

# Make this the active experiment; the runs started below are recorded under it
mlflow.set_experiment(experiment_name="Flipkart_Sentiment_Prediction")



MlflowException: API request to http://127.0.0.1:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Flipkart_Sentiment_Prediction (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7e76c11432b0>: Failed to establish a new connection: [Errno 111] Connection refused'))

Step 2 - Start the auto logger

In [None]:
# Initialize the auto logger
# max_tuning_runs=None will make sure that all the runs are recorded.
# By default top 5 runs will be recorded for each experiment
mlflow.sklearn.autolog(max_tuning_runs=None)

Step 3 - Start the experiment run

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Build the search object in this cell so it does not depend on a `grid_search`
# left over from a later cell. The recorded output (18 candidates) matches the
# naive_bayes grid, so that pipeline is used for this first logged run; the
# settings mirror the full model loop below.
grid_search = GridSearchCV(estimator=pipelines['naive_bayes'],
                           param_grid=param_grids['naive_bayes'],
                           cv=5,
                           scoring='accuracy',
                           return_train_score=True,
                           verbose=1)

# Fit inside an MLflow run so the autologger attaches the search to it
with mlflow.start_run() as run:
    grid_search.fit(x_train, y_train)



Fitting 5 folds for each of 18 candidates, totalling 90 fits




In [None]:
best_models = {}

# Run the Pipeline
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo],
                               param_grid=param_grids[algo],
                               cv=5,
                               scoring='accuracy',
                               return_train_score=True,
                               verbose=1
                              )

    mlflow.sklearn.autolog(max_tuning_runs=None)

    with mlflow.start_run() as run:
        %time grid_search.fit(x_train, y_train)

    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(x_test, y_test))

    best_models[algo] = grid_search.best_estimator_
    print()



********** naive_bayes **********
Fitting 5 folds for each of 18 candidates, totalling 90 fits




CPU times: user 19.8 s, sys: 599 ms, total: 20.4 s
Wall time: 25.8 s
Train Score:  0.9232449297971919
Test Score:  0.9238927011852776

********** decision_tree **********
Fitting 5 folds for each of 27 candidates, totalling 135 fits




CPU times: user 46.9 s, sys: 565 ms, total: 47.5 s
Wall time: 52.9 s
Train Score:  0.9145085803432138
Test Score:  0.9114160948222083

********** logistic_regression **********
Fitting 5 folds for each of 27 candidates, totalling 135 fits




CPU times: user 38.4 s, sys: 713 ms, total: 39.1 s
Wall time: 44.4 s
Train Score:  0.9205928237129484
Test Score:  0.916406737367436

********** svm **********
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: user 8min 45s, sys: 5.23 s, total: 8min 50s
Wall time: 8min 58s
Train Score:  0.9177847113884555
Test Score:  0.9139114160948222

********** random_forest **********




Fitting 5 folds for each of 81 candidates, totalling 405 fits
CPU times: user 12min 13s, sys: 3.1 s, total: 12min 16s
Wall time: 12min 36s
Train Score:  0.919188767550702
Test Score:  0.9145352464129757

********** knn **********




Fitting 5 folds for each of 243 candidates, totalling 1215 fits
CPU times: user 1h 3min 32s, sys: 7.76 s, total: 1h 3min 39s
Wall time: 39min 56s
Train Score:  0.9099843993759752
Test Score:  0.9126637554585153

