## Import data from the CSV file

In [3]:
import pandas as pd
import os

# Load the review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(os.path.join('data','sentiment-analysis-dataset-google-play-app-reviews.csv'))
df = df[['content','score']] # select content and score
# dropna() returns a new frame instead of modifying in place, so the result has
# to be assigned back -- otherwise rows with a missing content/score are kept.
df = df.dropna()
# Derive the label from the numeric score: >= 4 -> positive, <= 2 -> negative, anything else -> neutral.
df['sentiment'] = df['score'].apply(lambda x: 'positive' if x >= 4 else 'negative' if x <= 2 else 'neutral')
df = df[['content','sentiment']]
df = df[df['sentiment'] != 'neutral']  # Exclude neutral reviews

print(df.shape)
print(df.head())

(10942, 2)
                                             content sentiment
0  I love this app, but I do have one major gripe...  negative
1  Trash. Yes, it has some nice nifty features bu...  negative
2  OMG the UI is awful, seriously you have popup ...  negative
3  I've been using the app for a while and since ...  negative
4  Unable to register with an email. Clicking"con...  negative


## Preprocessing

In [4]:
import contractions
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the tokenizer, lemmatizer and stop-word list below.
nltk.download(['punkt', 'wordnet', 'stopwords','punkt_tab'])

# English stop words held in a set so the membership test in clear_content is a hash lookup.
stop_words = {*stopwords.words('english')}
# Single shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

def clear_content(content):
    '''Normalize a piece of review text into a space-separated string of lemmas.

    The text goes through these steps, in order:
      1. contractions are expanded with ``contractions.fix``;
      2. the text is lower-cased;
      3. every character that is not an ASCII letter or whitespace is deleted;
      4. the result is tokenized with ``word_tokenize``;
      5. stop words and tokens of length 2 or less are discarded, and each
         remaining token is lemmatized.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        content: the raw text to clean (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces; an empty string when no
        token survives the filtering.
    '''
    expanded = contractions.fix(content)
    lowered = expanded.lower()
    # Characters are removed outright (not replaced by a space), so words that were
    # separated only by punctuation end up fused into one token.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        # keep only tokens longer than two characters that are not stop words
        if len(token) > 2 and token not in stop_words
    ]
    return ' '.join(lemmas)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Naive Bayes

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Clean every review with the preprocessing function defined in the previous cell.
df['content'] = df['content'].apply(clear_content)

# Convert text to numerical features (bag-of-words token counts)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['content'])

# Naive Bayes classifier
model = MultinomialNB()
model.fit(X, df['sentiment'])

# new sentence prediction
test_sentence = ["The app is generally good but the price is too high"]
# The vocabulary was fitted on cleaned text, so new input has to go through the same
# clear_content step; otherwise raw word forms may not match any known feature.
X_test = vectorizer.transform([clear_content(sentence) for sentence in test_sentence])
prediction = model.predict(X_test)
print(prediction)


['positive']
