In [6]:
# M6W3.


# Import required libraries.
import bz2
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from langdetect import detect


# Load and decode train file.
train_file = bz2.BZ2File(r"C:\Users\deon.archary\OneDrive - TO70\Bureaublad\M6W3\train.ft.txt.bz2", 'rb')
lines = [x.decode('utf-8') for x in train_file.readlines()]


# Extract reviews and labels.
score_review_list = [l.replace('__label__', '').split(' ', 1) for l in lines]
df_train = pd.DataFrame(score_review_list, columns=['score', 'review'])


# Load and decode test file.
test_file = bz2.BZ2File(r"C:\Users\deon.archary\OneDrive - TO70\Bureaublad\M6W3\test.ft.txt.bz2", 'rb')
lines_test = [x.decode('utf-8') for x in test_file.readlines()]


# Extract reviews and labels from test data.
score_review_list_test = [l.replace('__label__', '').split(' ', 1) for l in lines_test]
df_test = pd.DataFrame(score_review_list_test, columns=['score', 'review'])


# Add n_tokens feature.
df_train['n_tokens'] = df_train['review'].apply(lambda x: len(x.split()))
df_test['n_tokens'] = df_test['review'].apply(lambda x: len(x.split()))


# Add language feature
def detect_language(x):
    try:
        if not x.strip():
            return 'en'
        return detect(x)
    except:
        return 'en'

df_train['language'] = df_train['review'].apply(detect_language)
df_test['language'] = df_test['review'].apply(detect_language)


# Convert language feature to binary.
df_train['language'] = np.where(df_train['language'] == 'en', 1, 0)
df_test['language'] = np.where(df_test['language'] == 'en', 1, 0)


# Basic EDA, Missing Data and Label checks.
print("Training Set Information:")
print(df_train.info())
print("\nTest Set Information:")
print(df_test.info())


print("\nMissing values in Training Set:")
print(df_train.isnull().sum())
print("\nMissing values in Test Set:")
print(df_test.isnull().sum())


print("\nUnique labels in Training Set:")
print(df_train['score'].unique())
print("\nUnique labels in Test Set:")
print(df_test['score'].unique())


# Encode labels.
le = LabelEncoder()
y_train = le.fit_transform(df_train['score'])
y_test = le.transform(df_test['score'])


# Bag of words.
vectorizer = CountVectorizer(max_features=1000)
X_train_transformed = vectorizer.fit_transform(df_train['review'])
X_test_transformed = vectorizer.transform(df_test['review'])


# Add the other features.
X_train_transformed = np.hstack([X_train_transformed.toarray(), df_train[['n_tokens', 'language']].values])
X_test_transformed = np.hstack([X_test_transformed.toarray(), df_test[['n_tokens', 'language']].values])


# Check the number of columns.
print(f"\nNumber of columns in the transformed Training Set: {X_train_transformed.shape[1]}")
print(f"Number of columns in the transformed Test Set: {X_test_transformed.shape[1]}")


# Train model.
model = RandomForestClassifier()
model.fit(X_train_transformed, y_train)


# Predict and evaluate.
predictions_train = model.predict(X_train_transformed)
predictions_test = model.predict(X_test_transformed)


print(f'Training Accuracy: {accuracy_score(y_train, predictions_train)}')
print(f'Training Confusion Matrix:\n{confusion_matrix(y_train, predictions_train)}')


print(f'Test Accuracy: {accuracy_score(y_test, predictions_test)}')
print(f'Test Confusion Matrix:\n{confusion_matrix(y_test, predictions_test)}')


# Evaluate feature importance.
feature_importances = model.feature_importances_
features = vectorizer.get_feature_names() + ['n_tokens', 'language']


feature_importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
print("\nFeature Importances:")
print(feature_importances_df.sort_values(by='Importance', ascending=False))


# - The sentiment of the review is done through a classification model. In this case, I used a Random 
#   Forest Classifier.

# - I transformed the language feature to a binary feature, where 1 indicates English and 0 indicates other 
#   languages.

# - The Count Vectorizer is used to convert the review text into a numerical format that can be used for 
#   modeling. The result is a bag of words representation of the review. This is combined with the other 
#   created features, 'n_tokens' and 'language', for the training and test sets.

# - The model is trained using the transformed training set and evaluated using the transformed test set. 
#   The accuracy score and confusion matrix are used for performance evaluation. The accuracy score is used
#   for classification tasks and gives a understanding of how well the model performs. The confusion matrix 
#   gives better details of the model's performance.

#   I do not have enough memory available, so I have reduced the number of features to 500 in version 2 
#   below. There is also the possibility of using different approachs to processing the data when faced
#   with limited memory.


Training Set Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   score     object
 1   review    object
 2   n_tokens  int64 
 3   language  int32 
dtypes: int32(1), int64(1), object(2)
memory usage: 96.1+ MB
None

Test Set Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   score     400000 non-null  object
 1   review    400000 non-null  object
 2   n_tokens  400000 non-null  int64 
 3   language  400000 non-null  int32 
dtypes: int32(1), int64(1), object(2)
memory usage: 10.7+ MB
None

Missing values in Training Set:
score       0
review      0
n_tokens    0
language    0
dtype: int64

Missing values in Test Set:
score       0
review      0
n_tokens    0
language    0
dtype: int64

Unique labels in Training Set:

MemoryError: Unable to allocate 26.9 GiB for an array with shape (3600000, 1002) and data type int64

In [5]:
# M6W3.

# Version 2 for limited memory, features are limited to 500 instead of 1000.

# Import required libraries.
import bz2
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from langdetect import detect


# Load and decode train file.
train_file = bz2.BZ2File(r"C:\Users\deon.archary\OneDrive - TO70\Bureaublad\M6W3\train.ft.txt.bz2", 'rb')
lines = [x.decode('utf-8') for x in train_file.readlines()]


# Extract reviews and labels.
score_review_list = [l.replace('__label__', '').split(' ', 1) for l in lines]
df_train = pd.DataFrame(score_review_list, columns=['score', 'review'])


# Load and decode test file.
test_file = bz2.BZ2File(r"C:\Users\deon.archary\OneDrive - TO70\Bureaublad\M6W3\test.ft.txt.bz2", 'rb')
lines_test = [x.decode('utf-8') for x in test_file.readlines()]


# Extract reviews and labels from test data.
score_review_list_test = [l.replace('__label__', '').split(' ', 1) for l in lines_test]
df_test = pd.DataFrame(score_review_list_test, columns=['score', 'review'])


# Add n_tokens feature.
df_train['n_tokens'] = df_train['review'].apply(lambda x: len(x.split()))
df_test['n_tokens'] = df_test['review'].apply(lambda x: len(x.split()))


# Add language feature.
def detect_language(x):
    try:
        if not x.strip():
            return 'en'
        return detect(x)
    except:
        return 'en'

df_train['language'] = df_train['review'].apply(detect_language)
df_test['language'] = df_test['review'].apply(detect_language)


# Convert language feature to binary.
df_train['language'] = np.where(df_train['language'] == 'en', 1, 0)
df_test['language'] = np.where(df_test['language'] == 'en', 1, 0)


# Basic EDA, Missing Data and Label checks.
print("Training Set Information:")
print(df_train.info())
print("\nTest Set Information:")
print(df_test.info())


print("\nMissing values in Training Set:")
print(df_train.isnull().sum())
print("\nMissing values in Test Set:")
print(df_test.isnull().sum())


print("\nUnique labels in Training Set:")
print(df_train['score'].unique())
print("\nUnique labels in Test Set:")
print(df_test['score'].unique())


# Encode labels.
le = LabelEncoder()
y_train = le.fit_transform(df_train['score'])
y_test = le.transform(df_test['score'])


# Bag of words.
vectorizer = CountVectorizer(max_features=500)
X_train_transformed = vectorizer.fit_transform(df_train['review'])
X_test_transformed = vectorizer.transform(df_test['review'])


# Add the other features.
X_train_transformed = np.hstack([X_train_transformed.toarray(), df_train[['n_tokens', 'language']].values])
X_test_transformed = np.hstack([X_test_transformed.toarray(), df_test[['n_tokens', 'language']].values])


# Check the number of columns.
print(f"\nNumber of columns in the transformed Training Set: {X_train_transformed.shape[1]}")
print(f"Number of columns in the transformed Test Set: {X_test_transformed.shape[1]}")


# Train model.
model = RandomForestClassifier()
model.fit(X_train_transformed, y_train)


# Predict and evaluate.
predictions_train = model.predict(X_train_transformed)
predictions_test = model.predict(X_test_transformed)


print(f'Training Accuracy: {accuracy_score(y_train, predictions_train)}')
print(f'Training Confusion Matrix:\n{confusion_matrix(y_train, predictions_train)}')


print(f'Test Accuracy: {accuracy_score(y_test, predictions_test)}')
print(f'Test Confusion Matrix:\n{confusion_matrix(y_test, predictions_test)}')


# Evaluate feature importance.
feature_importances = model.feature_importances_
features = vectorizer.get_feature_names() + ['n_tokens', 'language']


feature_importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
print("\nFeature Importances:")
print(feature_importances_df.sort_values(by='Importance', ascending=False))


Training Set Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   score     object
 1   review    object
 2   n_tokens  int64 
 3   language  int32 
dtypes: int32(1), int64(1), object(2)
memory usage: 96.1+ MB
None

Test Set Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   score     400000 non-null  object
 1   review    400000 non-null  object
 2   n_tokens  400000 non-null  int64 
 3   language  400000 non-null  int32 
dtypes: int32(1), int64(1), object(2)
memory usage: 10.7+ MB
None

Missing values in Training Set:
score       0
review      0
n_tokens    0
language    0
dtype: int64

Missing values in Test Set:
score       0
review      0
n_tokens    0
language    0
dtype: int64

Unique labels in Training Set:




Feature Importances:
        Feature  Importance
272         not    0.050927
166       great    0.046026
456       waste    0.018608
234        love    0.016428
45         best    0.015982
..          ...         ...
3    absolutely    0.000437
63       camera    0.000413
304       phone    0.000410
427         toy    0.000378
501    language    0.000251

[502 rows x 2 columns]
