In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition CSV files into DataFrames.
train_dataset = pd.read_csv("../input/shopee-sentiment-analysis/train.csv")
test_dataset = pd.read_csv("../input/shopee-sentiment-analysis/test.csv")
sample = pd.read_csv("../input/shopee-sentiment-analysis/sampleSubmission.csv")

# Report (rows, columns) for each frame, in the order test, train, sample.
shape_report = {
    "Dimension Shape for test": test_dataset,
    "Dimension Shape for train": train_dataset,
    "Dimension Shape for sample": sample,
}
for message, frame in shape_report.items():
    print(message, frame.shape)

In [None]:
# Text cleaning for the training reviews:
#   punctuation removal -> tokenisation -> lower-casing -> stop-word removal
#   -> stemming -> lemmatization.
# After this cell train_dataset["review"] holds a list of tokens per row.
from nltk.stem.porter import PorterStemmer # Stemming
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below. The stop-word list is needed for
# filtering; WordNetLemmatizer reads the WordNet corpus.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # NOTE(review): needed by some NLTK releases only - harmless otherwise

from nltk.corpus import stopwords

# Remove punctuation: every character that is neither a word character nor
# whitespace becomes a space, then runs of spaces are collapsed.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as literal
# text by default, which would silently leave the reviews unchanged.
train_dataset["review"] = (
    train_dataset["review"]
    .str.replace(r"[^\w\s]", " ", regex=True)
    .str.replace(r" +", " ", regex=True)
)

display(train_dataset)

# Stop words (kept as a list under the original name) plus a set for fast lookup
en_stops = list(stopwords.words('english'))
stop_lookup = frozenset(en_stops)

# Text normalisers
lemmatizer = WordNetLemmatizer() # Lemmatization
stemmer = PorterStemmer() # Stemming

# Split each review into a list of words
train_dataset["review"] = train_dataset["review"].str.split()

# Lower-case first, then drop stop words. The stop-word list is lower-case,
# so filtering before lower-casing would let "The", "This", ... through.
train_dataset["review"] = train_dataset["review"].apply(
    lambda words: [w for w in (word.lower() for word in words) if w not in stop_lookup]
)
display(train_dataset["review"])

# Apply stemming, then lemmatization, to every remaining token
train_dataset["review"] = train_dataset["review"].apply(
    lambda words: [lemmatizer.lemmatize(stemmer.stem(w)) for w in words]
)

display(train_dataset["review"])

In [None]:
# Hold out 30% of the labelled training data for evaluation (fixed seed).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_dataset["review"],
    train_dataset["rating"],
    test_size=0.3,
    random_state=41,
)

In [None]:
# TF-IDF features + three classifiers (Bernoulli Naive Bayes, RBF SVM, Random Forest)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# TfidfVectorizer expects one string per document, so token lists are joined
# with spaces. Entries that are already strings are kept as they are: without
# this check, re-running the cell would join the individual characters of each
# review ("a b c ...") and silently corrupt the training text.
X_train = [review if isinstance(review, str) else " ".join(review) for review in X_train]

# TF-IDF vectorizer, fitted on the training reviews only
cv = TfidfVectorizer()
df_xcv = cv.fit_transform(X_train)

# Models
naive_bayes = BernoulliNB()
# The fitted estimator is stored under the name `SVC` (shadowing the class)
# because the prediction cells call `SVC.predict`; the import at the top of
# this cell rebinds the class whenever the cell is run again.
SVC = SVC(kernel = 'rbf', C = 1000, random_state = 0)
clf = RandomForestClassifier(max_depth=20, random_state=0, criterion = 'entropy', n_estimators = 250)


# Fit every model on the TF-IDF matrix and the training ratings
naive_bayes.fit(df_xcv, y_train)
SVC.fit(df_xcv, y_train)
clf.fit(df_xcv, y_train)

In [None]:
# Join each held-out review's tokens back into one string for the vectorizer.
# Strings are passed through untouched so that re-running the cell does not
# join the characters of already-joined reviews.
X_test = [review if isinstance(review, str) else " ".join(review) for review in X_test]

# Reuse the vectorizer fitted on the training reviews (transform only, no refit)
df_x_test = cv.transform(X_test)

# Predictions of each fitted model on the held-out reviews
y_pred_naive = naive_bayes.predict(df_x_test)
y_pred_svm = SVC.predict(df_x_test)
y_pred_random = clf.predict(df_x_test)

In [None]:
# Evaluation on the held-out split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Confusion matrix (SVM predictions only)
print("Confusion Matrix as follows : \n")
print(confusion_matrix(y_test, y_pred_svm))

# One row per printed line: (message, metric function, predictions, extra kwargs).
# Precision and recall are averaged across classes weighted by support.
weighted = {"average": 'weighted'}
score_lines = [
    # Naive Bayes
    ("The Accuracy Score in Naive Bayes is : ", accuracy_score, y_pred_naive, {}),
    ("The Precision Score in Naive Bayes is : ", precision_score, y_pred_naive, weighted),
    ("The Recall Score in Naive Bayes is : ", recall_score, y_pred_naive, weighted),
    # SVM
    ("The Accuracy Score in SVM is : ", accuracy_score, y_pred_svm, {}),
    ("The Precision Score in SVM is : ", precision_score, y_pred_svm, weighted),
    ("The Recall Score in SVM is : ", recall_score, y_pred_svm, weighted),
    # Random Forest
    ("The Accuracy Score in Random Forest is : ", accuracy_score, y_pred_random, {}),
    ("The Precision Score in Random Forest is : ", precision_score, y_pred_random, weighted),
    ("The Recall Score in Random Forest is : ", recall_score, y_pred_random, weighted),
]

for message, metric, predictions, options in score_lines:
    print(message, metric(y_test, predictions, **options))

In [None]:
# Predict ratings for the competition test set and write the submission file.
#
# The TF-IDF vectorizer was fitted on reviews that had punctuation stripped,
# stop words removed and every token stemmed and lemmatized. The raw test
# reviews therefore get the same treatment here before being vectorized;
# feeding them in untouched would produce tokens the vocabulary never saw.
# The result is kept in a separate variable so test_dataset["review"] is not
# modified and the cell can be re-run safely.
stop_lookup = set(en_stops)
test_reviews_clean = (
    test_dataset["review"]
    .str.replace(r"[^\w\s]", " ", regex=True)  # punctuation -> space
    .str.split()                                # whitespace tokenisation
    .apply(
        # Lower-case before the stop-word lookup so capitalised stop words
        # are dropped as well, then stem and lemmatize what is left.
        lambda words: " ".join(
            lemmatizer.lemmatize(stemmer.stem(w))
            for w in (word.lower() for word in words)
            if w not in stop_lookup
        )
    )
)

# Target
test_dataset_count = cv.transform(test_reviews_clean)
test_dataset["rating"] = SVC.predict(test_dataset_count)

# Export to a csv file
test_dataset.loc[:, ['review_id', 'rating']].to_csv(
    'submission.csv', index=False, header=True
)

# Using Different Model Approach 
### Neural Network


In [None]:
# Turn the review strings into fixed-length integer sequences for the network.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Every sequence is padded (at the end) or cut to this many tokens
maxlen = 100

# Word index is learned from the training reviews only
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Text -> integer ids -> padded array, for both splits
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), padding='post', maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), padding='post', maxlen=maxlen)

# +1 because index 0 is reserved for padding
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# `to_categorical` is imported from its public location in keras.utils; the
# `np_utils` submodule it used to be reached through is not available in
# current Keras releases.
from keras.utils import to_categorical

# One-hot encode the integer ratings of both splits
dummy_y_train = to_categorical(y_train)
dummy_y_test = to_categorical(y_test)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Feed-forward text classifier:
#   token ids -> embedding -> flatten -> Dense(100) -> Dense(50) -> class probabilities
embedding_dim = 50

# The model is trained with a sparse categorical loss on the raw integer
# ratings, so each rating value is used directly as a class index. The output
# layer therefore needs one unit for every index from 0 up to the largest rating.
num_classes = int(np.max(y_train)) + 1

model = keras.Sequential()

# Fixed-length input (sequences are padded to `maxlen`), so Flatten has a known size
model.add(keras.Input(shape=(maxlen,)))

# Embedding layer: one `embedding_dim`-sized vector per token id
model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))

# Flatten the (maxlen, embedding_dim) matrix into one vector per review
# *before* the dense layers, so they see the whole review at once
model.add(layers.Flatten())

# Hidden layer with 100 units. Hidden layers use ReLU; softmax is reserved
# for the output layer, where a probability distribution is wanted.
model.add(layers.Dense(100,activation = 'relu'))

# Dropout for regularisation
model.add(layers.Dropout(0.25))

# Hidden layer with 50 units
model.add(layers.Dense(50,activation = 'relu'))

# Dropout for regularisation
model.add(layers.Dropout(0.25))

# Output layer: one probability per class
model.add(layers.Dense(num_classes,activation = 'softmax'))

model.summary()

In [None]:
# Training configuration
batch_size = 64

# Sparse categorical cross-entropy: labels are passed as integers, not one-hot
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

# Train on the padded training sequences, validating on the held-out split
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=200,
    batch_size=batch_size,
    verbose=1,
)