In [1]:
import pandas as pd 
import numpy as np

In [2]:
data = 'data.csv'
df = pd.read_csv(data)

In [3]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [4]:
text = df.dropna(subset = ['Review text'])
text = text['Review text']

In [5]:
import re 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def text_process(text):
    text = str(text)
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '',text)
    clean_text = clean_text.lower()
    tokens = word_tokenize(clean_text)    
    stop_words = set(stopwords.words('english'))
    clean_text = [token for token in tokens if token not in stop_words]
    # clean_text = [lemmatizer.lemmatize(text) for text in clean_text]  
    return " ".join(clean_text)

In [6]:
df['clean_text'] = df['Review text'].apply(text_process)
text = df['clean_text']

In [7]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
def sentiment_score(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(text)
    compound_score = sentiment_scores['compound']
    sentiment = 'Positive' if compound_score > 0 else 'Negative'
    return sentiment 

In [8]:
%time df['Sentiment'] = text.apply(lambda x: pd.Series(sentiment_score(x)))

CPU times: user 21.6 s, sys: 61.3 ms, total: 21.6 s
Wall time: 21.7 s


In [9]:
df['Sentiment'].value_counts()

Sentiment
Negative    4377
Positive    4141
Name: count, dtype: int64

In [10]:
X = text
y = df['Sentiment']
y = y.dropna()

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.25, random_state = 42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6388,), (2130,), (6388,), (2130,))

In [12]:
import mlflow 
mlflow.set_experiment("feedback_sentiment_prediction")
import warnings 
warnings.filterwarnings('ignore')

2024/04/12 19:35:31 INFO mlflow.tracking.fluent: Experiment with name 'feedback_sentiment_prediction' does not exist. Creating a new experiment.


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, classification_report
import joblib 
from joblib import Memory
import os 

In [14]:
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

pipelines = {
    'logistic_regression': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', LogisticRegression())
    ], memory=memory),
    'decision_tree': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ], memory=memory),
    'random-forest': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', RandomForestClassifier())
    ], memory = memory),
    'SVC' : Pipeline([
        ('vectorization',CountVectorizer()),
        ('classifier', SVC())
    ], memory = memory),
    'naive_bayes': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', MultinomialNB())
    ], memory=memory),
}

param_grids = {    
    'logistic_regression': [
        {
            'classifier__C': [0.1, 1, 10], 
            'classifier__solver': ['saga'],
        }
    ],
    'decision_tree': [
        {
            'classifier__max_depth': [None, 5, 10]
        }
    ],
    'random-forest': [
        {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [None, 10, 20],
            'classifier__max_features': ['auto']
        }
    ],
    'SVC':[
        {
            'classifier__C': [0.1, 1, 10],
            'classifier__kernel': ['linear', 'rbf'],
            'classifier__degree': [2, 3, 4]
        }
    ],
    
    'naive_bayes': [
        {
            'classifier__alpha': [1, 10]
        }
    ]
}

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

scorer = make_scorer(f1_score, pos_label='Positive')
best_models = {}

for p in pipelines.keys():
    print("-"*5, p, "-"*5)
    grid_search = GridSearchCV(estimator=pipelines[p], 
                               param_grid=param_grids[p], 
                               cv=5, 
                               scoring= scorer, 
                               return_train_score=True,
                               verbose=1
                              )
    mlflow.sklearn.autolog(max_tuning_runs = None)
    with mlflow.start_run() as run:
        %time grid_search.fit(X_train, y_train)
    
    best_models[p] = grid_search.best_estimator_
    
    print('Score on Test Data: ', grid_search.score(X_test, y_test))

----- logistic_regression -----




Fitting 5 folds for each of 3 candidates, totalling 15 fits




CPU times: user 3.67 s, sys: 849 ms, total: 4.52 s
Wall time: 4.59 s
Score on Test Data:  0.9760994263862334
----- decision_tree -----
Fitting 5 folds for each of 3 candidates, totalling 15 fits




CPU times: user 2.97 s, sys: 404 ms, total: 3.37 s
Wall time: 3.58 s
Score on Test Data:  0.9773584905660377
----- random-forest -----
Fitting 5 folds for each of 9 candidates, totalling 45 fits




CPU times: user 36.3 s, sys: 2.59 s, total: 38.9 s
Wall time: 38 s
Score on Test Data:  0.9720246562351826
----- SVC -----
Fitting 5 folds for each of 18 candidates, totalling 90 fits




CPU times: user 53.8 s, sys: 2.17 s, total: 55.9 s
Wall time: 55 s
Score on Test Data:  0.9834358731661144
----- naive_bayes -----
Fitting 5 folds for each of 2 candidates, totalling 10 fits
CPU times: user 1.41 s, sys: 1.63 s, total: 3.04 s
Wall time: 2.98 s
Score on Test Data:  0.9295774647887324


In [16]:
for name, model in best_models.items():
    print('-'*10, name, '-' * 10)
    print(model)
    print('-'*100)

---------- logistic_regression ----------
Pipeline(memory=Memory(location=.cache/joblib),
         steps=[('vectorization', CountVectorizer()),
                ('classifier', LogisticRegression(C=10, solver='saga'))])
----------------------------------------------------------------------------------------------------
---------- decision_tree ----------
Pipeline(memory=Memory(location=.cache/joblib),
         steps=[('vectorization', CountVectorizer()),
                ('classifier', DecisionTreeClassifier())])
----------------------------------------------------------------------------------------------------
---------- random-forest ----------
Pipeline(memory=Memory(location=.cache/joblib),
         steps=[('vectorization', CountVectorizer()),
                ('classifier',
                 RandomForestClassifier(max_features='auto',
                                        n_estimators=200))])
--------------------------------------------------------------------------------------------

In [17]:
for name, model in best_models.items():
    print("*"*10, name, "*"*10)
    joblib.dump(model, f'best_models/{name}.pkl')
    model = joblib.load(f'best_models/{name}.pkl')
    %time y_test_pred = model.predict(X_test)
    print("F1 Test Score ", f1_score(y_test, y_test_pred, pos_label='Positive'))
    print("Model Size:", os.path.getsize(f'best_models/{name}.pkl'), "Bytes")

********** logistic_regression **********
CPU times: user 3.97 ms, sys: 35 µs, total: 4 ms
Wall time: 4 ms
F1 Test Score  0.9760994263862334
Model Size: 73433 Bytes
********** decision_tree **********
CPU times: user 4.28 ms, sys: 56 µs, total: 4.34 ms
Wall time: 4.34 ms
F1 Test Score  0.9773584905660377
Model Size: 73544 Bytes
********** random-forest **********
CPU times: user 119 ms, sys: 290 µs, total: 119 ms
Wall time: 119 ms
F1 Test Score  0.9720246562351826
Model Size: 25988960 Bytes
********** SVC **********
CPU times: user 37.9 ms, sys: 60 µs, total: 38 ms
Wall time: 38 ms
F1 Test Score  0.9834358731661144
Model Size: 127525 Bytes
********** naive_bayes **********
CPU times: user 3.8 ms, sys: 9 µs, total: 3.81 ms
Wall time: 3.81 ms
F1 Test Score  0.9295774647887324
Model Size: 152630 Bytes


In [None]:
model = joblib.load('best_models/SVC.pkl')
new_data = [input('enter text')]
model.predict(new_data)

In [None]:
model = joblib.load('best_models/SVC.pkl')
new_data = [input('enter text')]
model.predict(new_data)

In [None]:
df.head()

In [None]:
df.to_csv('badminton_data.csv', index=False)