# **Setting up the environment**

In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the Flipkart product-review dataset from the Kaggle input directory.
csv_path = "/kaggle/input/flipkart-product-customer-reviews-dataset/Dataset-SA.csv"
data = pd.read_csv(csv_path)
# Second DataFrame handle built from the loaded data.
df = pd.DataFrame(data)

# **Exploratory Data Analysis**

In [None]:
# Print column names, non-null counts and dtypes of the raw dataset.
data.info()

In [None]:
# Summary statistics as produced by pandas' default `describe`.
data.describe()

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

In [None]:
# Bar chart of the number of reviews per value of the Sentiment column.
ax = sn.countplot(x='Sentiment',data=data)


In [None]:
#from sklearn.model_selection import train_test_split
#train_set , test_set = train_test_split(data , test_size=0.2,random_state=42)

In [1]:
# Drop the 'neutral' reviews so only the remaining sentiment classes are kept.
# The index is reset to 0..n-1 because the removed rows otherwise leave gaps,
# and any later index-aligned assignment from `data` into a freshly built
# (RangeIndex) DataFrame would pick up NaNs / shifted labels.
data = data[data.Sentiment != 'neutral'].reset_index(drop=True)
df = pd.DataFrame(data)

NameError: name 'data' is not defined

In [None]:
# Re-plot the Sentiment distribution for the current contents of `data`.
ax = sn.countplot(x='Sentiment',data=data)


In [None]:
# Exact number of reviews per Sentiment value.
data['Sentiment'].value_counts()

In [None]:
# Fill missing values in each column with that column's most frequent value
# (first row of `mode()`), then preview the result.
most_frequent = df.mode().iloc[0]
df = df.fillna(most_frequent)
df.head()


In [None]:
# From here on `data` refers to the same DataFrame object as `df`.
data = df
# Print the column summary (names, non-null counts, dtypes) again.
data.info()

# **Feature Transformations**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a bag-of-words vocabulary from the review text (no vocabulary limit).
count_vectorizer = CountVectorizer()
feature_vector = count_vectorizer.fit(data.Review)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary terms as a NumPy array.
features = feature_vector.get_feature_names_out()
print("Total number of features: ",len(features))

In [None]:
# Encode every review as a row of token counts over the fitted vocabulary.
data_features = count_vectorizer.transform(data.Review)

In [None]:
# (rows, columns) of the document-term matrix.
data_features.shape

In [None]:
# Number of stored entries in the sparse document-term matrix.
data_features.getnnz()

In [None]:
# Percentage of cells in the document-term matrix that hold a stored value.
n_rows, n_cols = data_features.shape
density_pct = data_features.getnnz() * 100 / (n_rows * n_cols)
print("Density of the matrix: ", density_pct)


In [None]:
# Dense DataFrame view of the document-term matrix, one column per vocabulary term.
data_df = pd.DataFrame(data_features.todense(), columns=features)

In [None]:
# Show the first row of the dataset.
data[0:1]

In [None]:
# First row of the feature frame, restricted to column positions 150-156.
data_df.iloc[:1,150:157]

In [None]:
# Total occurrences of each vocabulary term across all reviews. Summing the
# sparse matrix directly avoids materialising the full dense array; the
# (1, n_features) matrix it returns is flattened to a 1-D array.
features_counts = np.asarray(data_features.sum(axis=0)).ravel()
features_counts_df = pd.DataFrame(dict(features = features , counts = features_counts))

In [None]:
# Histogram of per-term totals, limited to totals between 0 and 5000.
# NOTE(review): the y-axis is labelled 'Density' but `hist` is called without
# `density=True`, so the bars are raw term counts - confirm the intended label.
hist_fig, hist_ax = plt.subplots(figsize=(12, 5))
hist_ax.hist(features_counts_df.counts, bins=50, range=(0, 5000))
hist_ax.set_xlabel('Frequency of words')
hist_ax.set_ylabel('Density')

In [None]:
# Number of vocabulary terms whose total count is exactly 1.
len(features_counts_df[features_counts_df.counts == 1])


In [None]:
# Refit with the vocabulary capped at 10,000 terms (`max_features`).
count_vectorizer = CountVectorizer(max_features=10000)
feature_vector = count_vectorizer.fit(data.Review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
features = feature_vector.get_feature_names_out()
data_features = count_vectorizer.transform(data.Review)
# Sum the sparse matrix directly instead of densifying it first; flatten the
# (1, n_features) result to a 1-D array of per-term totals.
features_counts = np.asarray(data_features.sum(axis=0)).ravel()
feature_counts = pd.DataFrame(dict(features = features , counts = features_counts))

In [None]:
# The 15 terms with the highest total counts.
feature_counts.sort_values('counts',ascending = False)[0:15]

In [None]:

from nltk.corpus import stopwords
import nltk
# Download NLTK's 'stopwords' resource.
nltk.download('stopwords')

# NLTK's English stop-word list, used by the vectorizer below.
english_stop_words = stopwords.words('english')

In [None]:
# First five entries of the Review column.
data['Review'][0:5]

In [None]:
# Refit with NLTK's English stop words excluded and the vocabulary capped at 10,000 terms.
count_vectorizer = CountVectorizer(stop_words=english_stop_words , max_features = 10000)
feature_vector = count_vectorizer.fit(data.Review)
data_features = count_vectorizer.transform(data.Review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
features = feature_vector.get_feature_names_out()
# Sum the sparse matrix directly instead of densifying it first; flatten the
# (1, n_features) result to a 1-D array of per-term totals.
features_counts = np.asarray(data_features.sum(axis=0)).ravel()
feature_counts = pd.DataFrame(dict(features = features , counts = features_counts))


In [None]:
# The 15 terms with the highest total counts.
feature_counts.sort_values("counts",ascending = False)[0:15]

In [None]:
from nltk.stem.snowball import PorterStemmer
stemmer = PorterStemmer()
# CountVectorizer's default analyzer (its own preprocessing + tokenisation).
analyzer = CountVectorizer().build_analyzer()
# Set form of the NLTK stop-word list so membership tests are O(1).
_stop_word_set = frozenset(english_stop_words)
def stemmed_words(doc):
  """Return the Porter stems of the non-stop-word tokens in *doc*.

  The document is tokenised with CountVectorizer's default analyzer. Tokens
  found in NLTK's English stop-word list are dropped *before* stemming,
  because that list contains unstemmed words. The previous version built a
  `non_stop_words` list without any filter, so stop words were never removed.

  Intended to be passed as the `analyzer` callable of a vectorizer.
  """
  return [stemmer.stem(token) for token in analyzer(doc) if token not in _stop_word_set]

In [None]:
# Refit using the custom stemming analyzer, vocabulary capped at 10,000 terms.
count_vectorizer = CountVectorizer(analyzer = stemmed_words , max_features = 10000)
feature_vector = count_vectorizer.fit(data.Review)
data_features = count_vectorizer.transform(data.Review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
features = feature_vector.get_feature_names_out()
# Sum the sparse matrix directly instead of densifying it first; flatten the
# (1, n_features) result to a 1-D array of per-term totals.
features_counts = np.asarray(data_features.sum(axis=0)).ravel()
feature_counts = pd.DataFrame(dict(features = features , counts = features_counts))


In [None]:
# The 15 stems with the highest total counts.
feature_counts.sort_values("counts",ascending = False)[0:15]

In [None]:
# Dense DataFrame of the stemmed count features, one column per vocabulary term.
data_df = pd.DataFrame(data_features.todense())
data_df.columns = features
# Attach the labels positionally. `data_df` has a fresh 0..n-1 index, whereas
# `data` may carry gaps in its index after row filtering; assigning the Series
# itself would align on index labels and yield NaN / mismatched sentiments.
data_df['Sentiment'] = data.Sentiment.to_numpy()

# **Model training and analysis**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_features,
    data.Sentiment,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes on the count features.
nb_clf1 = BernoulliNB()
# BernoulliNB accepts scipy sparse matrices, so the training matrix is passed
# as-is; converting it with .toarray() would allocate a dense
# (n_samples x n_features) array for no benefit.
nb_clf1.fit(X_train,Y_train)

In [None]:
# Predict on the held-out rows; the sparse matrix is passed directly
# (BernoulliNB supports sparse input, so no dense copy is needed).
test_predicted = nb_clf1.predict(X_test)

In [None]:
from sklearn import metrics

# Per-class precision, recall and F1 on the held-out rows.
report_text = metrics.classification_report(Y_test, test_predicted)
print(report_text)

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
cm = metrics.confusion_matrix(y_true=Y_test, y_pred=test_predicted)
sn.heatmap(data=cm, annot=True, fmt='.2f')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features using the custom stemming analyzer, vocabulary capped at 10,000 terms.
tfidf_vectorizer = TfidfVectorizer(analyzer = stemmed_words , max_features = 10000)
feature_vector = tfidf_vectorizer.fit(data.Review)
data_features = tfidf_vectorizer.transform(data.Review)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary terms as a NumPy array.
features = feature_vector.get_feature_names_out()


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Overall accuracy (as a percentage) of the most recent predictions held in
# `test_predicted` against `Y_test`.
accuracy_pct = accuracy_score(Y_test, test_predicted) * 100
print('Accuracy of the model: {0}%'.format(accuracy_pct))

In [None]:

from sklearn.naive_bayes import GaussianNB

# Split the current `data_features` again: 20% held out, same fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_features,
    data.Sentiment,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Gaussian naive Bayes on the current training features.
nb_clf2 = GaussianNB()
# The sparse training matrix is converted to a dense array before fitting.
# NOTE(review): GaussianNB is documented as taking array-like (dense) input,
# which is presumably why the conversion is here - mind the memory cost.
nb_clf2.fit(X_train.toarray(),Y_train)

In [None]:
# Predict on the held-out rows, converted to a dense array first.
test_predicted = nb_clf2.predict(X_test.toarray())

In [None]:
from sklearn import metrics

# Per-class precision, recall and F1 for the GaussianNB predictions.
report_text = metrics.classification_report(Y_test, test_predicted)
print(report_text)

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
cm = metrics.confusion_matrix(y_true=Y_test, y_pred=test_predicted)
sn.heatmap(data=cm, annot=True, fmt='.2f')

In [None]:
# Overall accuracy (as a percentage) of the latest predictions.
accuracy_pct = accuracy_score(Y_test, test_predicted) * 100
print('Accuracy of the model: {0}%'.format(accuracy_pct))

In [None]:
from nltk.stem import PorterStemmer
import re
stemmer = PorterStemmer()
# Compiled once: matches any token containing at least one ASCII letter.
_HAS_ASCII_LETTER = re.compile(r'[a-zA-Z]')
def get_stemmed_tokens(doc):
  """Tokenize *doc* with NLTK and return the Porter stems of its word tokens.

  Tokens that contain no ASCII letter (pure punctuation, digits, symbols)
  are discarded before stemming. Intended for use as a vectorizer's
  `tokenizer` callable.

  NOTE(review): `nltk.word_tokenize` relies on NLTK tokenizer data that this
  notebook does not download itself - confirm it is available in the
  runtime environment.
  """
  return [
      stemmer.stem(token)
      for token in nltk.word_tokenize(doc)
      if _HAS_ASCII_LETTER.search(token)
  ]

In [None]:
# TF-IDF over unigrams and bigrams, tokenised and stemmed by get_stemmed_tokens,
# with scikit-learn's built-in English stop-word list and a 10,000-term cap.
# NOTE(review): the stop-word list is compared against the *stemmed* tokens the
# custom tokenizer emits, so some stop words may slip through - confirm this
# is acceptable.
tfidf_vectorizer = TfidfVectorizer(stop_words ='english' , max_features = 10000 , tokenizer=get_stemmed_tokens , ngram_range=(1,2))
feature_vector = tfidf_vectorizer.fit(data.Review)
data_features = tfidf_vectorizer.transform(data.Review)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary terms as a NumPy array.
features = feature_vector.get_feature_names_out()

In [None]:
# Split the current feature matrix: 20% held out, fixed seed for repeatability.
X_train , X_test , Y_train , Y_test = train_test_split(data_features,data.Sentiment,test_size=0.2,random_state=42)
nb_clf3 = BernoulliNB()
# BernoulliNB accepts scipy sparse matrices, so the matrices are passed as-is
# instead of being expanded with .toarray() into large dense arrays.
nb_clf3.fit(X_train,Y_train)
test_predicted = nb_clf3.predict(X_test)
# Per-class precision, recall and F1 on the held-out rows.
print(metrics.classification_report(Y_test,test_predicted))

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
cm = metrics.confusion_matrix(y_true=Y_test, y_pred=test_predicted)
sn.heatmap(data=cm, annot=True, fmt='.2f')

In [None]:
# Overall accuracy (as a percentage) of the latest predictions.
accuracy_pct = accuracy_score(Y_test, test_predicted) * 100
print('Accuracy of the model: {0}%'.format(accuracy_pct))

# **Predictions**

In [None]:
# Vectorise one hand-written sentence with the fitted TF-IDF vectorizer and
# classify it with the third model.
sample_reviews = ['I am a good boy']
x = tfidf_vectorizer.transform(sample_reviews)
nb_clf3.predict(x)