In [1]:
import pandas as pd
import numpy as np
from io import StringIO
import matplotlib.pyplot as plt
#Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

#Here we will extract both the Indian and Malay/Singaporean corpora. 
#We will extract the relevant columns and clean the data for preprocessing
#Then we join all the rows together to train each respective model
#Then we will do the reference model on some random corpus
#Pipeline:
#Get sentiment pair, pass through dialect identifier, then if malay/indian pass to their respective sentiment identifier
#if not then pass to reference sentiment identifier
#Also do a control passthrough where you just chuck the sentiment pair straight into the reference sentiment identifier
#We should see an improvement over the control if my theory is correct

In [83]:
#Read the Malay file
filepath = "../Corpora/Sentiment-Malay/data_cleaned/GoogleReview_data_cleaned.csv"
df = pd.read_csv(filepath, sep=",", engine="python", encoding="ISO-8859-1", nrows=3000)
#Drop unused columns
df=df.drop(["Author", "Restaurant", "Location"], axis=1)
#Remove rows that contain review of 3.0 as these contain inconsistent sentiment
df=df[df["Rating"]!=3.0]
print(type(df))
df.loc[df["Rating"] == 4.0, "Rating"] = "positive"
df.loc[df["Rating"] == 5.0, "Rating"] = "positive"
df.loc[df["Rating"] == 2.0, "Rating"] = "negative"
df.loc[df["Rating"] == 1.0, "Rating"] = "negative"
df = df.replace(to_replace=r'[^\w\s]', value='', regex=True)
print("Size: ",df.size, "\nHead:\n", df.head())

<class 'pandas.core.frame.DataFrame'>
Size:  5336 
Head:
      Rating                                             Review
0  positive  Came here for the High Tea Great service espec...
1  negative  5 stars for the service even though some of th...
2  negative  Hi thank you for your service But i feel so so...
3  negative  I have the worse buffer dinner ever so far The...
4  positive  Thats are Known 5 Elmark  9H72   KDK  3 K14Y9 ...


In [91]:
#Read the Indian file
filepath = "../Corpora/Sentiment-Indian/amazon_vfl_reviews.csv"
df = pd.read_csv(filepath, sep=",", engine="python", encoding="ISO-8859-1", nrows=2000)
df=df.drop(["asin", "name", "date"], axis=1)
df=df.rename(columns={"rating":"Rating"})
df=df.rename(columns={"review":"Review"})
df=df[df["Rating"]!=3.0]
df.loc[df["Rating"] == 4.0, "Rating"] = "positive"
df.loc[df["Rating"] == 5.0, "Rating"] = "positive"
df.loc[df["Rating"] == 2.0, "Rating"] = "negative"
df.loc[df["Rating"] == 1.0, "Rating"] = "negative"
df = df.replace(to_replace=r'[^\w\s]', value='', regex=True)
print("Size: ",df.size, "\nHead:\n", df.head())

Size:  3740 
Head:
      Rating                                             Review
0  negative  I bought this hair oil after viewing so many g...
1  positive  Used This Mama Earth Newly Launched Onion Oil ...
2  negative  So bad productMy hair falling increase too muc...
3  negative  Product just smells similar to navarathna hair...
4  positive  I have been trying different onion oil for my ...


In [131]:
#Read the Reference file

# Read the text file and split each line into label and feature
#with open(filepath, 'r', encoding="ISO-8859-1") as file:
    #data = [line.strip().split() for line in file]

# Create a DataFrame from the data, specifying the order of the columns
#df = pd.DataFrame(data, columns=['label', 'feature'])


filepath = "../Corpora/Sentiment-Reference/test.txt.txt"
df = pd.DataFrame()
df["X"] = pd.read_csv(filepath, sep="\t", engine="python", encoding="ISO-8859-1", nrows=5000)
df[["Rating", "Review"]] = df["X"].str.split(" ", n=1, expand=True)
df.drop("X", axis=1, inplace=True)
df.loc[df["Rating"] == "__label__1", "Rating"] = "negative"
df.loc[df["Rating"] == "__label__2", "Rating"] = "positive"
df.head()

Unnamed: 0,Rating,Review
0,positive,One of the best game music soundtracks - for a...
1,negative,Batteries died within a year ...: I bought thi...
2,positive,"works fine, but Maha Energy is better: Check o..."
3,positive,Great for the non-audiophile: Reviewed quite a...
4,negative,DVD Player crapped out after one year: I also ...


In [132]:
#Train the model
#Make Malay, Indian, and Reference Model
#Malay
tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.8, ngram_range=[1,3])
dfs_x = df["Review"]
dfs_y = df["Rating"]
x_train, x_test, y_train, y_test = train_test_split(dfs_x, dfs_y, test_size=0.2, random_state=4)
print(x_test)
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)
#clf = MultinomialNB()
#clf = SVC(kernel="sigmoid")
#clf = LinearSVC()
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(acc * 100))

2175    Decent: After having a lot of trouble finding ...
3156    I never thought Meryl Streep could give a "bad...
337     Great eye candy, but compares poorly to the fi...
444     Kingston Technology KVR133X64C3/ 256 PC133 256...
2334    Could Have Been A Fun Movie For The Kids: With...
                              ...                        
1862    Batman Scores A Win: Christian Bale is great a...
1028    Memory Lane: I saw the movie based on this boo...
4430    OUTSTANDING HORROR MOVIE- AN ORIGINAL: This ha...
3025    This book is thorough but poorly written.: It ...
1807    urethane coating, how about all urethane: It i...
Name: Review, Length: 1000, dtype: object
Accuracy: 82.30%


In [138]:
with open('../Corpora/Sentiment-Reference/test.txt.txt', 'r') as file:
    lines = file.readlines()

# Keep every other line
lines = lines[::2]

# Write the modified lines back to a new file
with open('test.txt.txt', 'w') as file:
    file.writelines(lines)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 300: character maps to <undefined>