## Import

In [1]:
import time
import sys
import os
import pandas as pd
import fasttext
import numpy as np
import csv
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lawrence\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Create DataFrame

In [2]:
cwd = os.getcwd()
print("cwd is :",os.getcwd())
data_file_path = os.path.join(cwd, "review_rating_sent.csv")
print(data_file_path)
review_sent_df = pd.read_csv(data_file_path,index_col=False)
# Clean up "Unnamed" columns
review_sent_df = review_sent_df.loc[:, ~review_sent_df.columns.str.contains('^Unnamed')]
review_sent_df.head()

cwd is : F:\482python\Project\Group6\src_folder
F:\482python\Project\Group6\src_folder\review_rating_sent.csv


Unnamed: 0,Review,Rating,Sentiment
0,"Kurt Russell's chameleon-like performance, cou...",10,1
1,It was extremely low budget(it some scenes it ...,8,1
2,James Cagney is best known for his tough chara...,8,1
3,"Following the brilliant ""Goyôkiba"" (aka. ""Hanz...",8,1
4,One of the last classics of the French New Wav...,10,1


## Data preprocess

In [3]:
def clean_text(text):
    # Remove HTML tags, replacing <br> and <br/> tags with spaces
    text = re.sub(r'<br\s*/?>', ' ', text)

    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert the text to lowercase
    text = text.lower()

    # Tokenize the text (split into words)
    words = word_tokenize(text)

    # Join the words back into a string and return
    return ' '.join(words)

In [4]:
# Start timing
start_time = time.time()

# Apply the cleaning function
review_sent_df['Clean_review'] = review_sent_df['Review'].apply(clean_text)

# Drop the original 'review' column
review_sent_df = review_sent_df.drop('Review', axis=1)

# End timing and print the time taken
end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")

# Display the cleaned data
review_sent_df.head()

Time taken: 37.290860652923584 seconds


Unnamed: 0,Rating,Sentiment,Clean_review
0,10,1,kurt russells chameleonlike performance couple...
1,8,1,it was extremely low budgetit some scenes it l...
2,8,1,james cagney is best known for his tough chara...
3,8,1,following the brilliant goykiba aka hanzo the ...
4,10,1,one of the last classics of the french new wav...


In [5]:
print(review_sent_df.columns)
print(review_sent_df.Sentiment)

Index(['Rating', 'Sentiment', 'Clean_review'], dtype='object')
0        1
1        1
2        1
3        1
4        1
        ..
49995    0
49996    0
49997    0
49998    0
49999    0
Name: Sentiment, Length: 50000, dtype: int64


In [6]:
# Assign 'positive' or 'negative' based on the label value  
def fill_label(label):
    if label == 1:
        return 'positive'
    elif label == 0:
        return 'negative'

# Apply the label function to sentiment labels
review_sent_df['Sentiment'] = review_sent_df['Sentiment'].apply(fill_label)

# Display the cleaned data
review_sent_df.head()

Unnamed: 0,Rating,Sentiment,Clean_review
0,10,positive,kurt russells chameleonlike performance couple...
1,8,positive,it was extremely low budgetit some scenes it l...
2,8,positive,james cagney is best known for his tough chara...
3,8,positive,following the brilliant goykiba aka hanzo the ...
4,10,positive,one of the last classics of the french new wav...


In [7]:
# Check the distribution of emotional labels
sentiment_counts = review_sent_df['Sentiment'].value_counts()
print(sentiment_counts)

Sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [8]:
# Save the cleaned dataset to a new CSV file
review_sent_df.to_csv("Rating_Cleaned.csv", index=False)

## Train the FastText Model

### create train data file

In [9]:
# Load the cleaned dataset
df = pd.read_csv('Rating_Cleaned.csv')

# Split the dataset into a training set and a test set
# Use the first 40,000 entries for training and the remaining for testing
train_df = df.iloc[:40000].copy()
test_df = df.iloc[40000:].copy()

# Prepare the training data
# Prepend '__label__' to the sentiment labels
train_df['Sentiment'] = train_df['Sentiment'].apply(lambda x: '__label__' + str(x))

# Add a space before each clean review (for FastText formatting)
train_df['Clean_review'] = train_df['Clean_review'].apply(lambda x: ' ' + str(x))

# Save the training data to a text file in a tab-separated format without header
train_df[['Sentiment', 'Clean_review']].to_csv('train.txt', sep='\t', header=None, index=None)

### training

In [10]:
classifier = fasttext.train_supervised(
    'train.txt',
    lr=0.2, dim=200, epoch=25, wordNgrams=2, loss='ova' #parameter
)
print("Model training completed.")
classifier.save_model('my_senti_model.bin')
print("The training data is saved.")

Model training completed.
The training data is saved.


In [11]:
# do a simple test
from fasttext import load_model
classifier = load_model('my_senti_model.bin')
labels, prob = classifier.predict("grim instead of amusing meanspirited instead of playful boring instead of interesting it wont give you the willies but it just may gross you out or send you to sleep and it will certainly make you wonder what were they thinking", k=2)
print(labels, prob)



('__label__negative', '__label__positive') [1.00001001e+00 1.00000034e-05]


### Evaluate model performance

In [12]:
# Make predictions on the test set and calculate evaluation metrics
test_df['predict_label'] = [classifier.predict(' ' + text)[0][0].replace('__label__', '') for text in test_df['Clean_review']]
true_labels = test_df['Sentiment']
predicted_labels = test_df['predict_label']
# Calculate and output performance metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, pos_label='positive')
recall = recall_score(true_labels, predicted_labels, pos_label='positive')
f1 = f1_score(true_labels, predicted_labels, pos_label='positive')
print(f'Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1 Score: {f1}')


Accuracy: 0.8721
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


## Modify the data and retrain the model

In [13]:
from sklearn.model_selection import train_test_split

# Split the data set into training and test sets to ensure an even distribution of categories
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Sentiment'])


In [14]:
# Prefix 'label' and save the training data
train_df['Sentiment'] = train_df['Sentiment'].apply(lambda x: '__label__' + x)
train_df[['Sentiment', 'Clean_review']].to_csv('train.txt', sep='\t', header=None, index=None)

classifier = fasttext.train_supervised('train.txt', lr=0.2, dim=200, epoch=25, wordNgrams=2, loss='ova')


### Evaluate model performance

In [15]:
# Prepare test data
test_df['Clean_review'] = test_df['Clean_review'].apply(lambda x: ' ' + str(x))
test_list = test_df['Clean_review'].tolist()

# Make predictions
label_result, score_result = classifier.predict(test_list, k=2)
test_df['predict_label'] = [labels[0].replace('__label__', '') for labels in label_result]

true_labels = test_df['Sentiment']
predicted_labels = [label.replace('__label__', '') for label in test_df['predict_label']]

# Calculate accuracy: Proportion of correctly predicted observations
accuracy = accuracy_score(true_labels, predicted_labels)

# Calculate precision: Ratio of correctly predicted positive observations to the total predicted positives
precision = precision_score(true_labels, predicted_labels, average='binary', pos_label='positive')

# Calculate recall (sensitivity): Ratio of correctly predicted positive observations to all observations in actual class
recall = recall_score(true_labels, predicted_labels, average='binary', pos_label='positive')

# Calculate F1 score: Weighted average of precision and recall
f1 = f1_score(true_labels, predicted_labels, average='binary', pos_label='positive')

# Print out the evaluation metrics
print(f'Model Accuracy: {accuracy}')
print(f'Model Precision: {precision}')
print(f'Model Recall: {recall}')
print(f'Model F1 Score: {f1}')

# Generate a detailed classification report
report = classification_report(true_labels, predicted_labels)
print("Classification Report:\n", report)

Model Accuracy: 0.9115
Model Precision: 0.9105966872879665
Model Recall: 0.9126
Model F1 Score: 0.9115972430326641
Classification Report:
               precision    recall  f1-score   support

    negative       0.91      0.91      0.91      5000
    positive       0.91      0.91      0.91      5000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [16]:
for pred, true in zip(predicted_labels, true_labels):
    print(f"Predicted: {pred}, True: {true}")

Predicted: negative, True: negative
Predicted: negative, True: negative
Predicted: positive, True: positive
Predicted: positive, True: negative
Predicted: negative, True: negative
Predicted: negative, True: negative
Predicted: negative, True: negative
Predicted: negative, True: negative
Predicted: negative, True: negative
Predicted: negative, True: negative
Predicted: negative, True: negative
Predicted: positive, True: positive
Predicted: positive, True: positive
Predicted: negative, True: negative
Predicted: negative, True: negative
Predicted: positive, True: positive
Predicted: positive, True: positive
Predicted: positive, True: positive
Predicted: negative, True: negative
Predicted: positive, True: positive
Predicted: positive, True: positive
Predicted: positive, True: positive
Predicted: negative, True: negative
Predicted: negative, True: negative
Predicted: positive, True: positive
Predicted: positive, True: positive
Predicted: positive, True: positive
Predicted: positive, True: p

## Train the linear regression model

In [17]:
# Load the dataset
df = pd.read_csv('Rating_Cleaned.csv')

# Feature extraction using TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X = tfidf.fit_transform(df['Clean_review'])  # Transform reviews into TF-IDF features
y = df['Rating']  # Target variable: Ratings

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model using Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)  # Fitting the model with training data

# Make predictions on the test data
y_pred = model.predict(X_test)  # Predict ratings for the test set

# Ensure all predicted ratings are within the range 0-10
y_pred = np.clip(y_pred, 0, 10)

# Evaluate the model
# Calculate Mean Squared Error (MSE) - lower values are better
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Calculate R-squared score - values closer to 1 indicate better fit
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

Mean Squared Error: 4.805162198134217
R-squared: 0.6007543201980878


In [18]:
#save model
import joblib

joblib.dump(tfidf, 'tfidf_model.pkl')
joblib.dump(model, 'rating_model.pkl')

['rating_model.pkl']