In [1]:
import pandas as pd
import numpy as np
from io import StringIO
import matplotlib.pyplot as plt
#Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


In [None]:
######################################################
############## DIALECT IDENTIFICATION ################
######################################################

In [15]:
dfs = []
datasets = []
labels = ["HongKong", "Philippines", "Singapore", "Canada", "India"]
for dialect in range(len(labels)):
    comb=None
    for i in range(300):
        with open("..\CleanCorpora\{0}\_{0}CorporaCombined{1}.txt".format(labels[dialect], i), "r", encoding='UTF-8') as file:
            text = file.read()
        df = pd.DataFrame({"Label": labels[dialect], "Document": [text]})
        dfs.append(df)
    comb = pd.concat(dfs, ignore_index=True)
    datasets.append(comb)
#Concatenate all 4 dialects
dfs = pd.concat(datasets, ignore_index=True)

0
1
2
3
4


In [32]:
#TEST THAT DIALECT IDENTIFICATION WORKS CORRECTLY
tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.8, ngram_range=[1,3])
dfs_x = dfs["Document"]
dfs_y = dfs["Label"]
x_train, x_test, y_train, y_test = train_test_split(dfs_x, dfs_y, test_size=0.2, random_state=4)
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)
dialect_clf = MultinomialNB()
#dialect_clf = SVC(kernel="sigmoid")
#dialect_clf = LinearSVC()
#dialect_clf = RandomForestClassifier()
dialect_clf.fit(x_train, y_train)
y_pred = dialect_clf.predict(x_test)

In [17]:
#Accuracy, Feature importance, and Confusion Matrix
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(acc * 100))
print("==========")

Accuracy: 99.78%


In [None]:
######################################################
################ SENTIMENT ANALYSIS ##################
######################################################

In [21]:
#Malay -> Indian -> Reference
sentiment_comb = []

In [22]:
#Read the Malay file
sentiment_comb = []
filepath = "../Corpora/Sentiment-Malay/data_cleaned/GoogleReview_data_cleaned.csv"
df = pd.read_csv(filepath, sep=",", engine="python", encoding="ISO-8859-1", nrows=3000)
#Drop unused columns
df=df.drop(["Author", "Restaurant", "Location"], axis=1)
#Remove rows that contain review of 3.0 as these contain inconsistent sentiment
df=df[df["Rating"]!=3.0]
print(type(df))
df.loc[df["Rating"] == 4.0, "Rating"] = "positive"
df.loc[df["Rating"] == 5.0, "Rating"] = "positive"
df.loc[df["Rating"] == 2.0, "Rating"] = "negative"
df.loc[df["Rating"] == 1.0, "Rating"] = "negative"
df = df.replace(to_replace=r'[^\w\s]', value='', regex=True)
print("Size: ",df.size, "\nHead:\n", df.head())
sentiment_comb.append(df)

<class 'pandas.core.frame.DataFrame'>
Size:  5336 
Head:
      Rating                                             Review
0  positive  Came here for the High Tea Great service espec...
1  negative  5 stars for the service even though some of th...
2  negative  Hi thank you for your service But i feel so so...
3  negative  I have the worse buffer dinner ever so far The...
4  positive  Thats are Known 5 Elmark  9H72   KDK  3 K14Y9 ...


In [23]:
#Read the Indian file
sentiment_comb = []
filepath = "../Corpora/Sentiment-Indian/amazon_vfl_reviews.csv"
df = pd.read_csv(filepath, sep=",", engine="python", encoding="ISO-8859-1", nrows=2000)
df=df.drop(["asin", "name", "date"], axis=1)
df=df.rename(columns={"rating":"Rating"})
df=df.rename(columns={"review":"Review"})
df=df[df["Rating"]!=3.0]
df.loc[df["Rating"] == 4.0, "Rating"] = "positive"
df.loc[df["Rating"] == 5.0, "Rating"] = "positive"
df.loc[df["Rating"] == 2.0, "Rating"] = "negative"
df.loc[df["Rating"] == 1.0, "Rating"] = "negative"
df = df.replace(to_replace=r'[^\w\s]', value='', regex=True)
print("Size: ",df.size, "\nHead:\n", df.head())
sentiment_comb.append(df)

Size:  3740 
Head:
      Rating                                             Review
0  negative  I bought this hair oil after viewing so many g...
1  positive  Used This Mama Earth Newly Launched Onion Oil ...
2  negative  So bad productMy hair falling increase too muc...
3  negative  Product just smells similar to navarathna hair...
4  positive  I have been trying different onion oil for my ...


In [24]:
#Read the Reference file
sentiment_comb = []
filepath = "../Corpora/Sentiment-Reference/test.txt.txt"
df = pd.DataFrame()
df["X"] = pd.read_csv(filepath, sep="\t", engine="python", encoding="ISO-8859-1", nrows=5000)
df[["Rating", "Review"]] = df["X"].str.split(" ", n=1, expand=True)
df.drop("X", axis=1, inplace=True)
df.loc[df["Rating"] == "__label__1", "Rating"] = "negative"
df.loc[df["Rating"] == "__label__2", "Rating"] = "positive"
df.head()
sentiment_comb.append(df)

In [None]:
y_test_comb = []
x_test_comb = []

In [66]:
#Train the model
#Malay
y_test_comb = []
x_test_comb = []
tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.8, ngram_range=[1,3])
dfs_x = df["Review"]
dfs_y = df["Rating"]
x_train, x_test, y_train, y_test = train_test_split(dfs_x, dfs_y, test_size=0.2, random_state=4)
y_test_comb.append(y_test)
x_test_comb.append(x_test)
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)
malay_clf = MultinomialNB()
#malay_clf = SVC(kernel="sigmoid")
#malay_clf = LinearSVC()
#malay_clf = RandomForestClassifier()
malay_clf.fit(x_train, y_train)
#y_pred = malaya_clf.predict(x_test)
#acc = accuracy_score(y_test, y_pred)
#print("Accuracy: {:.2f}%".format(acc * 100))
y_test_comb.append(y_test)

In [67]:
#Train the model
#Indian
y_test_comb = []
x_test_comb = []
tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.8, ngram_range=[1,3])
dfs_x = df["Review"]
dfs_y = df["Rating"]
x_train, x_test, y_train, y_test = train_test_split(dfs_x, dfs_y, test_size=0.2, random_state=4)
y_test_comb.append(y_test)
x_test_comb.append(x_test)
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)
indian_clf = MultinomialNB()
#malay_clf = SVC(kernel="sigmoid")
#malay_clf = LinearSVC()
#malay_clf = RandomForestClassifier()
indian_clf.fit(x_train, y_train)
#y_pred = malaya_clf.predict(x_test)
#acc = accuracy_score(y_test, y_pred)
#print("Accuracy: {:.2f}%".format(acc * 100))


MultinomialNB()

In [68]:
#Train the model
#Reference
y_test_comb = []
x_test_comb = []
tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.8, ngram_range=[1,3])
dfs_x = df["Review"]
dfs_y = df["Rating"]
x_train, x_test, y_train, y_test = train_test_split(dfs_x, dfs_y, test_size=0.2, random_state=4)
y_test_comb.append(y_test)
x_test_comb.append(x_test)
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)
ref_clf = MultinomialNB()
#malay_clf = SVC(kernel="sigmoid")
#malay_clf = LinearSVC()
#malay_clf = RandomForestClassifier()
ref_clf.fit(x_train, y_train)
#y_pred = malaya_clf.predict(x_test)
#acc = accuracy_score(y_test, y_pred)
#print("Accuracy: {:.2f}%".format(acc * 100))

MultinomialNB()

In [70]:
#print((x_test_comb).shape)
#tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.8, ngram_range=[1,3])
#x_test_comb[0] = tfidf.fit_transform(x_test_comb[0])
#y_test_comb[0] = tfidf.transform(y_test_comb[0])
#testpred = dialect_clf.predict(test_input)

IndexError: list index out of range