In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!source /content/drive/MyDrive/colab_env/bin/activate;

In [None]:
import sys
sys.path.append('/content/drive/MyDrive/colab_env/lib/python3.10/site-packages')

In [None]:
import numpy as np
import nltk

In [None]:
from nltk import pos_tag, regexp_tokenize, corpus, stem


In [None]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

### 1. Data ingestion

In [None]:
import csv

In [None]:
ratings = []
apps = []
reviews = []

#with open("./reviews_Apps_for_Android_5.training.txt", 'r') as tsv_file:
with open("/content/drive/MyDrive/MSc Data Science & Artificial Intelligence/2nd Semester/CIS4515 Practical Data Analysis/CW2/reviews_Apps_for_Android_5.training.txt", 'r') as tsv_file:
    tsv_reader = csv.reader(tsv_file, delimiter = '\t')
    #next(tsv_reader)
    for row in tsv_reader:
        ratings.append(row[0])
        apps.append(row[1])
        reviews.append(row[2].lower())

print(apps[:5])


['B004A9SDD8', 'B004A9SDD8', 'B004A9SDD8', 'B004A9SDD8', 'B004A9SDD8']


### 2. Feature Extraction: Tokenisation, POS-Tagging, Lemmatisation

In [None]:
def my_tokenizer(reviews, length):

    doccie = reviews

    matrix = []

    stop_words = set(corpus.stopwords.words('english'))


    for line in doccie[:length]:
        wrds = regexp_tokenize(line, pattern = r"[\w']+")

        wrds = [wrd for wrd in wrds if not wrd in stop_words]

        pos_tagged_wrds = pos_tag(wrds)

        lemmatizer = stem.WordNetLemmatizer()
        lemmatised_tokens = []

        for wrd, pos in pos_tagged_wrds:
            wordnet_pos = corpus.reader.wordnet.NOUN
            if pos.startswith("V"):
                wordnet_pos = nltk.corpus.reader.wordnet.VERB
            elif pos.startswith("J"):
                wordnet_pos = nltk.corpus.reader.wordnet.ADJ
            elif pos.startswith("R"):
                wordnet_pos = nltk.corpus.reader.wordnet.ADV

            lemma = lemmatizer.lemmatize(wrd, pos=wordnet_pos)
            lemmatised_tokens.append(lemma)


        tokenized_line = ""
        for tkn in lemmatised_tokens:
            tokenized_line = tokenized_line + ' ' + tkn

        matrix.append(tokenized_line.lstrip())
    return matrix

##### Removing outliers (long reviews > 500 words)

In [None]:
#num = []
#for line in tokenized_doccie:
#  num.append(len(line))

#np.argmax(num)

In [None]:
#tokenized_doccie.pop(13341)

In [None]:
#tokenized_doccie.pop(7087)

In [None]:
#tokenized_doccie.pop(5058)

In [None]:
#tokenized_doccie.pop(17917)

In [None]:
#tokenized_doccie.pop(8810)

### 3. Vectorisation

##### 3.1 TF-IDF vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
def my_tf_idf_vectorizer(length):
  length = length
  tfidf_Vectorizer = TfidfVectorizer(ngram_range=(1,2))  #starting off with a n-gram up to two.
  matrix = my_tokenizer(reviews, length)
  tfidf_vectors = tfidf_Vectorizer.fit_transform(matrix)

  return tfidf_vectors.toarray()

##### 3.2 Word Embeddings (Word2vec)

In [None]:
from gensim.models import Word2Vec

In [None]:
def aggregate_line_vectors(line_vectors, method="average"):
    vectors = []

    line_vectors = line_vectors

    if method == "average":
      vectors.append(np.mean(line_vectors, axis=0))
    elif method == "sum":
      vectors.append(np.sum(line_vectors, axis=0))
    elif method == "weighted_average":
      # Example: Assign equal weights to all word vectors
      weights = np.ones(len(line_vectors))
      vectors.append(np.average(line_vectors, axis=0, weights=weights))
    else:
      raise ValueError(f"Invalid aggregation method: {method}")
    return vectors

In [None]:
def my_w2v_vectorizer(length):
  length = length
  word2vec_matrix = []

  matrix = my_tokenizer(reviews, length)

  for line in matrix:
    word2vec = Word2Vec(sentences=[line.split(' ')], vector_size=100, window=5, min_count=1, sg=0)
    line_vectors = word2vec.wv.vectors

    word2vec_matrix.append(aggregate_line_vectors((line_vectors), method="average"))

  b = [(word2vec_matrix[x][0]).T for x in range(len(word2vec_matrix))]
  return b

### 4. Machine Learning Models

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
from keras.layers import Dense, Dropout
from keras.models import Sequential

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef

In [None]:
import pickle
import joblib

In [None]:
from imblearn.over_sampling import SMOTE

 #### 4.1 Based on TF-IDF vectorisation

In [None]:
smote = SMOTE(sampling_strategy='auto', random_state=42)

In [None]:
length = 5000
y = np.array([int(b) for b in ratings[:length]])
X = my_tf_idf_vectorizer(length)

- Trying to use the vectorized data in its original vectorised nature results in expensive computations.
- the 20 000 lines vectorized matrix has 250 000+ columns.
- when train_test_split the system crashes as it requires more than 12 GB of RAM.
- Alternative approach is to reduce the vector space by dimensional reduction via LSA/SVD.
- 20 000 lines still crash when ou try to decompose their columns from 250k+ to 100.
- another try for 10k lines was attempted and crushed again.
- 5k lines worked, only to crash when using the SMOTE function.

In [None]:
svd = TruncatedSVD(n_components=100)
X_svd = svd.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
unique, counts = np.unique(y_train_resampled, return_counts=True)
print(f"Class distribution after SMOTE: {dict(zip(unique, counts))}")

##### 4.1.1 SVM

In [None]:
svm = SVC(kernel="linear", decision_function_shape="ovr")

In [None]:
svm.fit(X_train, y_train)

In [None]:
ypreds = svm.predict(X_test)

In [None]:
print(confusion_matrix(y_test, ypreds))

[[ 53   0 118]
 [  7   0  98]
 [  4   0 720]]


In [None]:
print(classification_report(y_test, ypreds))

              precision    recall  f1-score   support

           1       0.83      0.31      0.45       171
           2       0.00      0.00      0.00       105
           3       0.77      0.99      0.87       724

    accuracy                           0.77      1000
   macro avg       0.53      0.43      0.44      1000
weighted avg       0.70      0.77      0.71      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
filename = '/content/drive/MyDrive/MSc Data Science & Artificial Intelligence/2nd Semester/CIS4515 Practical Data Analysis/CW2/svm_model.sav'
pickle.dump(svm, open(filename, 'wb'))

print(f"SVM model saved as {filename}")

SVM model saved as /content/drive/MyDrive/MSc Data Science & Artificial Intelligence/2nd Semester/CIS4515 Practical Data Analysis/CW2/svm_model.sav


In [None]:
#filename = '/content/drive/MyDrive/MSc Data Science & Artificial Intelligence/2nd Semester/CIS4515 Practical Data Analysis/CW2/svm_model.joblib'
#joblib.dump(svm, filename)

#print(f"SVM model saved as {filename}")

SVM model saved as /content/drive/MyDrive/MSc Data Science & Artificial Intelligence/2nd Semester/CIS4515 Practical Data Analysis/CW2/svm_model.joblib


##### 4.1.2 Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=60)

In [None]:
rf.fit(X_train, y_train)

In [None]:
ypreds = rf.predict(X_test)

In [None]:
print(confusion_matrix(y_test, ypreds))

[[ 32   0 139]
 [  1   0 104]
 [  1   0 723]]


In [None]:
print(classification_report(y_test, ypreds))

              precision    recall  f1-score   support

           1       0.94      0.19      0.31       171
           2       0.00      0.00      0.00       105
           3       0.75      1.00      0.86       724

    accuracy                           0.76      1000
   macro avg       0.56      0.40      0.39      1000
weighted avg       0.70      0.76      0.67      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
filename = '/content/drive/MyDrive/MSc Data Science & Artificial Intelligence/2nd Semester/CIS4515 Practical Data Analysis/CW2/rf_model.sav'
pickle.dump(rf, open(filename, 'wb'))

print(f"rf model saved as {filename}")

rf model saved as /content/drive/MyDrive/MSc Data Science & Artificial Intelligence/2nd Semester/CIS4515 Practical Data Analysis/CW2/rf_model.sav


##### 4.1.3 Naive Bayes

In [None]:
import pandas as pd

In [None]:
R = pd.DataFrame(y_test)

In [None]:
R.columns = ['R']

In [None]:
R[R['R']==2]

Unnamed: 0,R
5,2
6,2
8,2
13,2
37,2
...,...
984,2
986,2
993,2
994,2


##### 4.1.4 Deep Neural Network

In [None]:
y

array(['2', '3', '3', ..., '3', '1', '3'], dtype='<U1')

In [None]:
a = ['4', '6']

In [None]:
[int(b) for b in a]

[4, 6]

#### 4.2 Based on Word2Vec embedding

In [None]:
length = 20000
y = np.array([int(b) for b in ratings[:length]])
X = my_w2v_vectorizer(length)

In [None]:
len(X)

20000

In [None]:
smote = SMOTE(sampling_strategy='auto', random_state=42)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
unique, counts = np.unique(y_train_resampled, return_counts=True)
print(f"Class distribution after SMOTE: {dict(zip(unique, counts))}")

Class distribution after SMOTE: {1: 11813, 2: 11813, 3: 11813}


##### 4.2.1 SVM

In [None]:
svm_w2v = SVC(kernel="linear", decision_function_shape="ovr")

In [None]:
svm_w2v.fit(X_train_resampled, y_train_resampled)

In [None]:
ypreds = svm_w2v.predict(X_test)

In [None]:
print(confusion_matrix(y_test, ypreds))

[[   2   50  527]
 [   1   27  395]
 [   3  163 2832]]


In [None]:
print(classification_report(y_test, ypreds))

              precision    recall  f1-score   support

           1       0.33      0.00      0.01       579
           2       0.11      0.06      0.08       423
           3       0.75      0.94      0.84      2998

    accuracy                           0.72      4000
   macro avg       0.40      0.34      0.31      4000
weighted avg       0.63      0.72      0.64      4000



In [None]:
svm_classifier = SVC()

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [0.1, 1, 'scale']
}

In [None]:
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_resampled, y_train_resampled)


In [None]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validated Accuracy: {best_score:.4f}")


In [None]:
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy with Best Model: {test_accuracy:.4f}")

##### 4.2.2 Random Forest

In [None]:
rf_w2v = RandomForestClassifier(n_estimators=600)

In [None]:
rf_w2v.fit(X_train, y_train)

In [None]:
ypreds = rf_w2v.predict(X_test)

In [None]:
print(confusion_matrix(y_test, ypreds))

[[  12    3  582]
 [   6    1  434]
 [  58   37 2867]]


In [None]:
print(classification_report(y_test, ypreds))

              precision    recall  f1-score   support

           1       0.16      0.02      0.04       597
           2       0.02      0.00      0.00       441
           3       0.74      0.97      0.84      2962

    accuracy                           0.72      4000
   macro avg       0.31      0.33      0.29      4000
weighted avg       0.57      0.72      0.63      4000



- Balancing the training data using synthetic generated training data

In [None]:
rf_w2v = RandomForestClassifier(n_estimators=100)

In [None]:
rf_w2v.fit(X_train_resampled, y_train_resampled)

In [None]:
ypreds = rf_w2v.predict(X_test)

In [None]:
print(confusion_matrix(y_test, ypreds))

[[ 104  118  351]
 [  70  107  252]
 [ 461  671 1866]]


In [None]:
print(classification_report(y_test, ypreds))

              precision    recall  f1-score   support

           1       0.16      0.18      0.17       573
           2       0.12      0.25      0.16       429
           3       0.76      0.62      0.68      2998

    accuracy                           0.52      4000
   macro avg       0.35      0.35      0.34      4000
weighted avg       0.60      0.52      0.55      4000



In [None]:
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier()

# Define hyperparameters and their possible values
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}


In [None]:
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_resampled, y_train_resampled)


In [None]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validated Accuracy: {best_score:.4f}")


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Best Cross-Validated Accuracy: 0.6274


In [None]:
best_rfw2v_model = grid_search.best_estimator_
test_accuracy = best_rfw2v_model.score(X_test, y_test)
print(f"Test Accuracy with Best Model: {test_accuracy:.4f}")


Test Accuracy with Best Model: 0.4778


In [None]:
filename = '/content/drive/MyDrive/MSc Data Science & Artificial Intelligence/2nd Semester/CIS4515 Practical Data Analysis/CW2/best_rfw2v_model.sav'
pickle.dump(best_rfw2v_model, open(filename, 'wb'))

print(f"rf model saved as {filename}")

rf model saved as /content/drive/MyDrive/MSc Data Science & Artificial Intelligence/2nd Semester/CIS4515 Practical Data Analysis/CW2/best_rfw2v_model.sav


##### 4.2.3 Naive Bayes

In [None]:
nb_classifier = MultinomialNB()

param_grid = {
    'alpha': [0.1, 1, 10],
    'fit_prior': [True, False]
}


In [None]:
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)


In [None]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validated Accuracy: {best_score:.4f}")


In [None]:
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy with Best Model: {test_accuracy:.4f}")


##### 4.2.4 Deep Neural Network

In [None]:
def build_dnn_model(hidden_units=64, activation='relu'):
    model = Sequential()
    model.add(Dense(hidden_units, activation=activation, input_shape=(input_dim,)))
    # Add more layers as needed
    model.add(Dense(1, activation='sigmoid'))  # Binary classification example
    return model

param_grid = {
    'hidden_units': [32, 64, 128],
    'activation': ['relu', 'tanh']
}


In [None]:
def build_dnn_model(hidden_units=64, activation='relu'):
    model = Sequential()
    model.add(Dense(hidden_units, activation=activation, input_shape=(n_features,)))
    model.add(Dense(hidden_units, activation=activation))
    model.add(Dense(n_classes, activation='softmax'))  # Multiclass classification
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
dnn_classifier = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=build_dnn_model, epochs=10, batch_size=32, verbose=0)
grid_search = GridSearchCV(estimator=dnn_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)


In [None]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validated Accuracy: {best_score:.4f}")


In [None]:
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy with Best Model: {test_accuracy:.4f}")
