## IMPORTING REQUIRED LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

## LOADING THE DATASET

In [None]:
# Read the raw review export, discard its first column, then preview the
# first five rows as plain Python lists.
data = pd.read_csv("reviews1.csv")
data = data.drop(columns=data.columns[:1])
data.iloc[:5].values.tolist()

[['https://play-lh.googleusercontent.com/-2Zyc1Oz-mys/AAAAAAAAAAI/AAAAAAAAAAA/AMZuucmNttE5TrkCrUuRraul5uIVUVSsTg/photo.jpg',
  'Unable to use it as it won\'t work with my Google calendar. Tech support is horrible - "Tali" keeps asking for a screenshot of the native calendar she thinks came with the phone. It\'s a Motorola Edge that is all stock Android, thus Google calendar. They don\'t get it, and refuse to escalate my request for help to a higher level. EDIT: Now your team has a copy of the chat from Motorola explaining that there is no "native" calendar app other than Google. Since you know my case in this review area, your tech dept must be tiny. I really wanted to like this app, but it\'s very buggy with the Google calendar app it\'s advertised to work so well with.',
  1,
  13,
  '5.7.0.20',
  "As our team explained profusely, we sync data from the device's native calendar, and not directly from Google. We have worked with many Motorola users in the past, and had no issue with re

In [None]:
# Report how many fully duplicated rows exist before they are removed.
print("Duplicated values: " , data.duplicated().sum())

# Build the cleaned frame in one expression instead of mutating in place, so
# the cell produces the same result when it is re-run.
data = data.drop_duplicates().dropna(axis=0)

# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
data.info()

Duplicated values:  1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7042 entries, 0 to 16116
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   userImage             7042 non-null   object
 1   content               7042 non-null   object
 2   score                 7042 non-null   int64 
 3   thumbsUpCount         7042 non-null   int64 
 4   reviewCreatedVersion  7042 non-null   object
 5   replyContent          7042 non-null   object
 6   sortOrder             7042 non-null   object
dtypes: int64(2), object(5)
memory usage: 440.1+ KB
None


## CLEANING THE DATASET

In [None]:
# Disabled shell step kept for reference (manual unzip of the wordnet corpus):
#!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer,PorterStemmer

# Fetch the NLTK resources requested by name below (network access on first run).
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# Shared helpers read by clean_dataset(): English stop-word set and lemmatizer.
stop_words = set(stopwords.words('english'))
# NOTE(review): `st` is only referenced from commented-out code in clean_dataset().
st = PorterStemmer()
lem = WordNetLemmatizer()

def is_alpha(word):
    """Return True only if every '-'-separated piece of ``word`` is alphabetic.

    An empty piece (empty string, leading/trailing hyphen, doubled hyphen)
    fails the check, so such words return False.
    """
    return all(piece.isalpha() for piece in word.split('-'))

def clean_dataset(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps: strip URLs, literal backslash-n sequences, hashtags and
    @-mentions; lower-case; tokenise; keep alphabetic (optionally hyphenated)
    tokens; drop English stop words; lemmatise; drop stop words again.

    Stop words are removed *before* lemmatising as well as after. The
    lemmatizer is called without a POS hint, and with the previous order
    stop words such as "was"/"does" were reduced to "wa"/"doe", which no
    longer matched the stop-word set and leaked into the cleaned text.

    Parameters
    ----------
    text : str
        Raw review text. Non-string input (e.g. NaN) yields "".

    Returns
    -------
    str
        Cleaned text with surrounding whitespace stripped.
    """
    if not isinstance(text, str):
        return ""

    text = re.sub(r'http\S+', '', text)  # links
    text = re.sub(r'\\n', ' ', text)     # literal backslash + "n", not real newlines
    text = re.sub(r"\s*#\S+", "", text)  # hashtags
    text = re.sub(r"\s*@\S+", "", text)  # @-mentions
    text = text.lower()

    words = [word for word in word_tokenize(text) if is_alpha(word)]
    # First pass: remove stop words while they are still in their surface form.
    words = [word for word in words if word not in stop_words]
    words = [lem.lemmatize(word) for word in words]
    # Second pass: remove anything that only became a stop word once lemmatised.
    words = [word for word in words if word not in stop_words]

    return " ".join(words).strip()

# Compute the cleaned text first, then place it as the second-to-last column.
cleaned_content = data['content'].apply(clean_dataset)

# DataFrame.insert raises if the column already exists, which made this cell
# fail on a second run; drop any previous copy so the cell is re-runnable.
if "cleaned_content" in data.columns:
    data = data.drop(columns="cleaned_content")

data.insert(len(data.columns) - 1, "cleaned_content", cleaned_content)
data.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,userImage,content,score,thumbsUpCount,reviewCreatedVersion,replyContent,cleaned_content,sortOrder
0,https://play-lh.googleusercontent.com/-2Zyc1Oz...,Unable to use it as it won't work with my Goog...,1,13,5.7.0.20,"As our team explained profusely, we sync data ...",unable use wo work google calendar tech suppor...,most_relevant
1,https://play-lh.googleusercontent.com/a-/AOh14...,I downloaded this because of the cross platfor...,1,81,5.7.0.20,The Premium ad only shows up when first openin...,downloaded cross platform syncing purchase app...,most_relevant
2,https://play-lh.googleusercontent.com/a-/AOh14...,I am starting to hate this app! Every time the...,1,67,5.7.0.10,"Hi, please note that these issues usually are ...",starting hate app every time update start exac...,most_relevant
3,https://play-lh.googleusercontent.com/-97UNn5D...,"Liked Any.do at first, enough that I even sign...",1,102,5.7.0.10,Please note that Any.do integrates directly wi...,liked first enough even signed premium could s...,most_relevant
4,https://play-lh.googleusercontent.com/a-/AOh14...,They made me sign up for premium just to have ...,1,2,5.7.0.20,"We never require users to upgrade, as 90% of t...",made sign premium recurring alert even hourly ...,most_relevant


## POS TAGGING

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import pandas as pd


In [None]:
# Fetch the tokenizer and POS-tagger resources requested by name below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# NOTE(review): this replaces the in-memory `data` with a spreadsheet read from
# an absolute path; no cell visible in this notebook writes cleangoogle.xlsx, so
# a fresh-kernel run depends on that file already existing — confirm provenance.
data = pd.read_excel("/content/cleangoogle.xlsx")
data

Unnamed: 0,userImage,content,score,thumbsUpCount,reviewCreatedVersion,replyContent,cleaned_content,sortOrder
0,https://play-lh.googleusercontent.com/-2Zyc1Oz...,Unable to use it as it won't work with my Goog...,1,13,5.7.0.20,"As our team explained profusely, we sync data ...",unable use wo work google calendar tech suppor...,most_relevant
1,https://play-lh.googleusercontent.com/a-/AOh14...,I downloaded this because of the cross platfor...,1,81,5.7.0.20,The Premium ad only shows up when first openin...,downloaded cross platform syncing purchase app...,most_relevant
2,https://play-lh.googleusercontent.com/a-/AOh14...,I am starting to hate this app! Every time the...,1,67,5.7.0.10,"Hi, please note that these issues usually are ...",starting hate app every time update start exac...,most_relevant
3,https://play-lh.googleusercontent.com/-97UNn5D...,"Liked Any.do at first, enough that I even sign...",1,102,5.7.0.10,Please note that Any.do integrates directly wi...,liked first enough even signed premium could s...,most_relevant
4,https://play-lh.googleusercontent.com/a-/AOh14...,They made me sign up for premium just to have ...,1,2,5.7.0.20,"We never require users to upgrade, as 90% of t...",made sign premium recurring alert even hourly ...,most_relevant
...,...,...,...,...,...,...,...,...
7037,https://play-lh.googleusercontent.com/-VWQ9ItI...,Why when I put repeat it does not show on the ...,4,1,5.2,"Hi, would you please contact us at planner.a@a...",put repeat doe show next day ca copy task diff...,newest
7038,https://play-lh.googleusercontent.com/a-/AOh14...,Works good,4,0,5.1,"Hi, if you have any ideas about improving the ...",work good,newest
7039,https://play-lh.googleusercontent.com/a-/AOh14...,Its very effective tool for daily tasks and ev...,4,13,5.1,"Hi, if you have any ideas about improving the ...",effective tool daily task event plus long term...,newest
7040,https://play-lh.googleusercontent.com/-epJSlRi...,Next best thing to using F. Covey when I was w...,4,0,5.0.5,"Hi, if you have any ideas about improving the ...",next best thing using covey wa working,newest


In [None]:
# Holds one list of (token, tag) pairs per row of data['cleaned_content'].
tagged_data = []


In [None]:
# Build the list from scratch on every run. Appending to a list created in a
# different cell meant that re-running this cell doubled its length, which then
# no longer matched the number of rows in `data`.
# Missing values (NaN) get an empty list as a placeholder so positions stay
# aligned with the rows of `data`.
tagged_data = [
    pos_tag(word_tokenize(sentence)) if pd.notna(sentence) else []
    for sentence in data['cleaned_content']
]


In [None]:
# Attach the tag lists as a new column; tagged_data must contain exactly one
# entry per row of `data` for this assignment to succeed.
data['pos_tags'] = tagged_data
data

## TFIDF 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object and,
# with pandas Copy-on-Write, leaves `data` unchanged (pandas 2.x warns about it).
# Missing texts become empty strings so the vectorizer receives only str values.
data['cleaned_content'] = data['cleaned_content'].fillna('')

In [None]:
# Create a TF-IDF vectorizer with the library's default settings
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary from 'cleaned_content' and transform the same column
# into TF-IDF vectors (one row per review)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_content'])


In [None]:
# Get the feature names (vocabulary terms) from the fitted vectorizer; they are
# used as column labels for the TF-IDF table below
feature_names = tfidf_vectorizer.get_feature_names_out()


In [None]:
# Create a DataFrame to display the TF-IDF values.
# NOTE(review): .toarray() materialises the whole matrix as a dense array
# (rows x vocabulary size); this can be memory-heavy for a large vocabulary.
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)

In [None]:
# Plain-text dump of the TF-IDF table (pandas truncates rows/columns in the output).
print(tfidf_df)

       aa  aaah  aak  aap   ab  abandon  abandoned  abc  aber  abhorrent  ...  \
0     0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   
1     0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   
2     0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   
3     0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   
4     0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   
...   ...   ...  ...  ...  ...      ...        ...  ...   ...        ...  ...   
7037  0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   
7038  0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   
7039  0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   
7040  0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   
7041  0.0   0.0  0.0  0.0  0.0      0.0        0.0  0.0   0.0        0.0  ...   

      ولی  پولیه  چطوری   چ

In [2]:
import pandas as pd
# NOTE(review): `data` is replaced again, this time from an absolute path; no
# cell visible in this notebook writes tfidf.xlsx. The recorded output shows a
# `userName` column and no `score` column, unlike the frame used earlier —
# confirm which export this file comes from.
data = pd.read_excel("/content/tfidf.xlsx")
data

Unnamed: 0,userName,content,thumbsUpCount,reviewCreatedVersion,replyContent,cleaned_content,pos_tags
0,H G,Unable to use it as it won't work with my Goog...,13,5.7.0.20,"As our team explained profusely, we sync data ...",unable use wo work google calendar tech suppor...,"[('unable', 'JJ'), ('use', 'NN'), ('wo', 'MD')..."
1,Michelle Armstrong,I downloaded this because of the cross platfor...,81,5.7.0.20,The Premium ad only shows up when first openin...,downloaded cross platform syncing purchase app...,"[('downloaded', 'VBN'), ('cross', 'NN'), ('pla..."
2,Yiğitcan Sümbelli,I am starting to hate this app! Every time the...,67,5.7.0.10,"Hi, please note that these issues usually are ...",starting hate app every time update start exac...,"[('starting', 'VBG'), ('hate', 'NN'), ('app', ..."
3,Jon Pember,"Liked Any.do at first, enough that I even sign...",102,5.7.0.10,Please note that Any.do integrates directly wi...,liked first enough even signed premium could s...,"[('liked', 'VBN'), ('first', 'JJ'), ('enough',..."
4,Harry Wexler Jr. MBA,They made me sign up for premium just to have ...,2,5.7.0.20,"We never require users to upgrade, as 90% of t...",made sign premium recurring alert even hourly ...,"[('made', 'VBN'), ('sign', 'NN'), ('premium', ..."
...,...,...,...,...,...,...,...
16381,Arifa Kanaan,Love this app so organized,0,5.0.2,0,love app organized,"[('love', 'VB'), ('app', 'NN'), ('organized', ..."
16382,Cindi Peeff,Great place to keep notes,0,5.0.2,0,great place keep note,"[('great', 'JJ'), ('place', 'NN'), ('keep', 'V..."
16383,Rashaad Jones,This app helps me compartmentalize my hectic l...,0,5.0.2,0,app help compartmentalize hectic life thank gr...,"[('app', 'NN'), ('help', 'NN'), ('compartmenta..."
16384,Amanda Seckman,Love it!,0,5.0.2,0,love,"[('love', 'NN')]"


## TEST-TRAIN SPLIT 

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; random_state fixes the shuffle.
# NOTE(review): `X` and `y` are not defined by any cell visible in this
# notebook, so this fails on a fresh kernel. The recorded shapes in the next
# cell, (105, 4) and (45, 4), also do not match the review/TF-IDF data built
# above — confirm where X and y are supposed to come from.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)


In [11]:
# Show (rows, columns) of the training and test feature sets.
X_train.shape, X_test.shape

((105, 4), (45, 4))

## RANDOM FOREST

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Instantiate the classifier. The number of trees is left at the library
# default; the seed is fixed so results are repeatable.
rfc = RandomForestClassifier(random_state=0)

# Fit the model on the training split.
rfc.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = rfc.predict(X_test)

# Report the number of trees the model actually uses. The message previously
# hard-coded "10", which is wrong whenever the library default differs.
print('Model accuracy score with {0} decision-trees : {1:0.4f}'.format(rfc.n_estimators, accuracy_score(y_test, y_pred)))

Model accuracy score with 10 decision-trees : 0.9778


## KNN


In [15]:
from sklearn.metrics import classification_report

# NOTE(review): this reports on whatever `y_pred` currently holds. Read top to
# bottom, that is the random-forest prediction from the previous section (the
# KNN model is only fitted further down), even though this cell sits under the
# KNN heading.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [18]:
# import KNeighborsClassifier from sklearn
from sklearn.neighbors import KNeighborsClassifier
# instantiate the model with k = 3 neighbours
knn = KNeighborsClassifier(n_neighbors=3)
# fit the model to the training set
knn.fit(X_train, y_train)

In [19]:
# Predict test-set labels with KNN; this overwrites the earlier `y_pred`.
y_pred = knn.predict(X_test)

y_pred

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])

In [20]:
# Per-sample predicted probability from column 0 of predict_proba, i.e. the
# first entry of knn.classes_ (presumably label 0, given the 0/1/2 predictions
# above — confirm via knn.classes_).

knn.predict_proba(X_test)[:,0]

array([0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1.])

In [21]:
# Per-sample predicted probability from column 1 of predict_proba, i.e. the
# second entry of knn.classes_ (presumably label 1, given the 0/1/2 predictions
# above — confirm via knn.classes_).

knn.predict_proba(X_test)[:,1]

array([0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 1.        , 1.        ,
       0.        , 1.        , 1.        , 1.        , 0.66666667,
       0.        , 0.66666667, 1.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 1.        , 0.        ,
       0.        , 1.        , 0.        , 0.33333333, 0.        ,
       1.        , 0.        , 0.        , 1.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [23]:
# Predict on the training data itself, used for the training-set accuracy below.
y_pred_train = knn.predict(X_train)

In [24]:
# Accuracy on the data the KNN model was fitted on (not a held-out estimate).
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))

Training-set accuracy score: 0.9619


## NAIVE BAYES


In [28]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

# Fit the scaler on the training split only, then apply it to both splits.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# rescales already-scaled data, and every model fitted below this point sees
# scaled features while the models fitted above saw the unscaled ones.
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

In [34]:
# train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB


# instantiate the model with default settings
gnb = GaussianNB()


# fit the model on the (already rescaled) training split
gnb.fit(X_train, y_train)

In [35]:
# Predict on the training data; this overwrites the KNN `y_pred_train`.
y_pred_train = gnb.predict(X_train)

y_pred_train

array([1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1,
       2, 0, 0, 2, 1, 0, 0, 2, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0,
       2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 2, 1, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1,
       2, 0, 1, 2, 2, 0, 1, 1, 2, 1, 0, 0, 0, 2, 1, 2, 0])

In [36]:
# Accuracy on the data the Naive Bayes model was fitted on. No test-set score
# for this model is computed in the cells visible here.
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))

Training-set accuracy score: 0.9429


## Logistic Regression


In [44]:
import sklearn
from sklearn.linear_model import LogisticRegression
# Fit a default logistic regression on the training split and predict the test
# split; `y_pred` is overwritten with this model's predictions.
logreg=LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)

In [48]:
# Test-set accuracy of the logistic-regression predictions; the value is shown
# as the cell's output rather than printed.
print("Accuracy score:")
sklearn.metrics.accuracy_score(y_test,y_pred)

Accuracy score:


0.9777777777777777

## Decision Tree


In [75]:
from sklearn.tree import DecisionTreeClassifier

In [76]:

# Decision tree using the Gini criterion, limited to depth 3, with a fixed seed.
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)


# fit the model
clf_gini.fit(X_train, y_train)

In [77]:
# Test-set predictions of the decision tree.
# NOTE(review): y_pred_gini is not scored in any cell visible here.
y_pred_gini = clf_gini.predict(X_test)

In [80]:
# Predictions on the training data, used for the training-set accuracy below.
y_pred_train_gini = clf_gini.predict(X_train)

y_pred_train_gini

array([1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1,
       2, 0, 0, 2, 1, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0,
       2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 2, 1, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1,
       2, 0, 1, 2, 2, 0, 1, 1, 2, 1, 0, 0, 0, 2, 1, 2, 0])

In [81]:
# Accuracy on the data the decision tree was fitted on (not a held-out estimate).
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train_gini)))

Training-set accuracy score: 0.9810
