In [3]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any of the visible cells — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Location of the tab-separated Alexa reviews file. The original absolute path
# stays as the default so the existing setup keeps working; set the
# ALEXA_REVIEWS_PATH environment variable to read the file from elsewhere.
DATA_PATH = os.environ.get(
    "ALEXA_REVIEWS_PATH",
    "E:\\code\\SENETIMENT_ANALYSIS_AMAZON\\1\\LDA_Customer_Interests_Modelling\\amazon_alexa.tsv",
)

data = pd.read_csv(DATA_PATH, sep='\t')

data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Per-column summary of the loaded frame (dtypes and non-null counts).
data.info()

In [None]:
 data.isnull().sum()  # number of missing values in each column

In [None]:
# Bar chart of how many reviews gave each star rating, with the count written
# inside the top of every bar.
sns.set(rc={'figure.figsize':(8,8)})

ax = sns.countplot(x="rating", data=data)

for p in ax.patches:
    height = p.get_height()
    # Anchor the label at the bar's horizontal centre (left edge + half the
    # width) rather than a fixed offset, so ha='center' really centres it.
    center_x = p.get_x() + p.get_width() / 2
    # ':.0f' prints the count without a trailing '.0'. The leading newline
    # together with va='top' pushes the text down into the bar.
    ax.annotate(f'\n{height:.0f}', (center_x, height), ha='center', va='top', color='white', size=10)

In [None]:
# Number of reviews per value of the `feedback` column (used later as the label).
sns.countplot(x="feedback", data=data)

In [None]:
# How many distinct product variations appear in the data, followed by their names.
variation_col = data.variation
print(f'Number of variations: {variation_col.nunique()}')
print(variation_col.unique())

Lowercase the text and remove punctuation, links, HTML tags, and square-bracketed text

In [None]:
def clean_text(text):
    """Normalize a raw review into lowercase text without markup or punctuation.

    Steps, in order: lowercase; drop square-bracketed spans; drop http(s)/www
    links; drop HTML-like tags; drop every character that is neither a word
    character nor whitespace; turn newlines into spaces.

    Parameters
    ----------
    text : object
        The review. Non-string values are converted with ``str``. ``None`` and
        float NaN (a missing review) are treated as an empty review.

    Returns
    -------
    str
        The cleaned text; an empty string for missing input.
    """
    # Missing values would otherwise be stringified into the tokens "none"/"nan".
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # Raw strings keep the regex escapes (\[ \S \w ...) away from Python's own
    # string-escape processing.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # A newline separates words, so replace it with a space; deleting it would
    # glue the last word of one line to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    return text

# Clean every review in place; clean_text is passed directly, no wrapper lambda needed.
data['verified_reviews'] = data['verified_reviews'].apply(clean_text)

Removing stopwords

In [None]:
# Fetch the NLTK 'stopwords' corpus that `stopwords.words` reads from.
import nltk
nltk.download('stopwords')

In [None]:
# English stop words from NLTK (needs the 'stopwords' corpus to be downloaded).
stop = stopwords.words('english')
# Membership is tested once per word of every review, so look words up in a set
# instead of scanning the list each time; the filtered text is the same.
stop_set = set(stop)
# NOTE(review): punctuation was already stripped from the reviews by clean_text,
# so stop words containing an apostrophe (presumably e.g. "don't") can no longer
# match tokens such as "dont" — confirm whether that is intended.
data['verified_reviews'] = data['verified_reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))

Lemmatize words

In [None]:
# Fetch the NLTK resources ('wordnet' and 'omw-1.4') for the WordNet lemmatizer used below.
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
def lemmatize_words(text):
    """Return `text` with each whitespace-separated word replaced by its WordNet lemma.

    Words are lemmatized with the lemmatizer's default settings and re-joined
    with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Lemmatize every review, overwriting the column with the result.
data['verified_reviews'] = data['verified_reviews'].apply(lemmatize_words)

In [None]:
# NOTE(review): 'wordnet' is already downloaded in the cell just before
# `lemmatize_words` is defined; this cell looks redundant — confirm and remove.
import nltk
nltk.download('wordnet')


In [None]:
# Preview the frame after cleaning, stop-word removal and lemmatization.
data.head()

Split the data into training and test sets

In [None]:
X = data['verified_reviews']  # preprocessed review text (model input)
y = data['feedback']  # label to predict; 1 is reported as a positive review later on

In [None]:
# Hold out 33% of the reviews for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=53,
)

# Bag-of-words counts. The vocabulary is learned from the training reviews only
# and then reused unchanged to encode the test reviews.
train_docs = X_train.values
test_docs = X_test.values
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(train_docs)
count_test = count_vectorizer.transform(test_docs)

#### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. random_state is pinned (same seed as the
# train/test split) so that re-running the notebook rebuilds the same forest
# and reports the same metrics.
clf=RandomForestClassifier(n_estimators=100, random_state=53)

# Fit on the bag-of-words matrix of the training reviews.
clf.fit(count_train,y_train)

# Predicted labels for the held-out test reviews.
y_pred=clf.predict(count_test)

In [None]:
# Evaluation metrics for the classifier fitted above.
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# Accuracy: share of test reviews whose label was predicted correctly.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


# F1 score (binary average, positive label 1 by default).
print(f1_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

# ROC AUC ranks examples by a continuous score; feeding it the hard 0/1
# predictions collapses the curve to a single threshold. Use the predicted
# probability of the positive class instead.
# Assumes the model was trained on both classes, so column 1 of predict_proba
# is the larger label (1).
y_score = clf.predict_proba(count_test)[:, 1]
print('ROCAUC score:',roc_auc_score(y_test,y_score))

In [None]:
# Confusion-matrix heatmap of the current predictions on the test set.
from sklearn.metrics import accuracy_score, confusion_matrix

# Take the labels from both vectors so the tick labels always match the matrix
# size, even if the model never predicts one of the classes.
names = np.union1d(y_test, y_pred)
cm_lgr1 = confusion_matrix(y_test, y_pred, labels=names)
sns.heatmap(cm_lgr1, square=True, annot=True, cbar=False,xticklabels=names, yticklabels=names, cmap="YlGnBu" ,fmt='g')
# confusion_matrix puts the true labels on the rows and the predictions on the
# columns; the heatmap draws rows along y and columns along x.
plt.xlabel('Predicted')
plt.ylabel('Truth')

#### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings. random_state is pinned (same seed as the
# train/test split) because features are shuffled when candidate splits are
# evaluated, so ties could otherwise be broken differently on each run.
clf = DecisionTreeClassifier(random_state=53)

# Fit on the bag-of-words matrix of the training reviews.
clf = clf.fit(count_train,y_train)

# Predicted labels for the held-out test reviews.
y_pred = clf.predict(count_test)

In [None]:
# Accuracy: share of test reviews whose label was predicted correctly.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


# F1 score (binary average, positive label 1 by default).
print(f1_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

# ROC AUC ranks examples by a continuous score; feeding it the hard 0/1
# predictions collapses the curve to a single threshold. Use the predicted
# probability of the positive class instead.
# Assumes the model was trained on both classes, so column 1 of predict_proba
# is the larger label (1).
y_score = clf.predict_proba(count_test)[:, 1]
print('ROCAUC score:',roc_auc_score(y_test,y_score))

#### Predict new customer review

In [1]:
# Example review to classify.
# NOTE(review): the text looks already cleaned, stop-word-filtered and lemmatized;
# a raw review would presumably need the same preprocessing as the training data — confirm.
new_review='sound quality le dot microphone dont pick well model echo dot picture quality good'

In [2]:
# Encode the review with the fitted vectorizer and classify it.
# Requires `clf` and `count_vectorizer` from the training cells above; on a fresh
# kernel this raises NameError. `clf` is whichever model was fitted last
# (the decision tree when the notebook is run top to bottom).
y_pred=clf.predict(count_vectorizer.transform([new_review]))

NameError: name 'clf' is not defined

In [None]:
# Report the predicted label in words: 1 means positive, anything else negative.
is_positive = y_pred[0] == 1
print("A Positive Review" if is_positive else "A Negative Review")
    