### Part2

### Instructions: Please run the program from top to bottom as some are related

In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split 
import sklearn
import random
import nltk
import operator

# import the training and test data set
df_train_Pos = pd.read_csv('imdb_train_pos.txt',delimiter='\n', header=None)
df_train_Neg = pd.read_csv('mdb_train_neg.txt',delimiter='\n', header=None)
df_test_Pos = pd.read_csv('imdb_test_pos.txt',delimiter='\n', header=None)
df_test_Neg = pd.read_csv('imdb_test_neg.txt',delimiter='\n', header=None)

# add column name for revgiews
df_train_Pos.columns = ['text']
df_train_Neg.columns = ['text']
df_test_Pos.columns = ['text']
df_test_Neg.columns = ['text']

# add label, "1" for positive reviews and "0" for negative review
df_train_Pos['label'] = '1'
df_train_Neg['label'] = '0'
df_test_Pos['label'] = '1'
df_test_Neg['label'] = '0'

# concatenate two dataframes to get the full train and test data set
df_train = pd.concat([df_train_Pos,df_train_Neg])
df_test = pd.concat([df_test_Pos,df_test_Neg])

df_train.head()

Unnamed: 0,text,label
0,"For fans of Chris Farley, this is probably his...",1
1,"Fantastic, Madonna at her finest, the film is ...",1
2,From a perspective that it is possible to make...,1
3,What is often neglected about Harold Lloyd is ...,1
4,You'll either love or hate movies such as this...,1


### 1. Word Frequency

In [2]:
lemmatizer = nltk.stem.WordNetLemmatizer()

# process the word to lowercase and its singular form
def get_list_tokens(string):
  sentence_split = nltk.tokenize.sent_tokenize(string)
  list_tokens = []
    
  for sentence in sentence_split:
    list_tokens_sentence = nltk.tokenize.word_tokenize(sentence)
    
    for token in list_tokens_sentence:
      list_tokens.append(lemmatizer.lemmatize(token).lower())
    
  return list_tokens

### Calculate the word frequency

In [3]:
# obtain stopwords list from nltk
stopwords = set(nltk.corpus.stopwords.words('english'))

# add more words to stopword lsit
stopwords.add(".")
stopwords.add(",")
stopwords.add(";")
stopwords.add("-")
stopwords.add("/")
stopwords.add("<")
stopwords.add("br")
stopwords.add(">")
stopwords.add(")")
stopwords.add("``")
stopwords.add("''")
stopwords.add("...")
stopwords.add("!")
stopwords.add("?")
stopwords.add("(")
stopwords.add("'s")
stopwords.add("n't")


# create a frequency list for words
frequencyList = {}

for review in df_train['text']:
  sentence_tokens = get_list_tokens(review) # lemmatise the sentence

  for word in sentence_tokens:
    if word in stopwords: 
        continue   # ignore the stopwords
    if word not in frequencyList: 
        frequencyList[word] = 1  # create a new word in dict and frequency = 1
    else: 
        frequencyList[word] += 1   # frequency of this word plus 1 if it exist

# sort frequency list with top 1000 words
sortList = sorted(frequencyList.items(), key = operator.itemgetter(1), reverse = True)[:1000]

i = 0
for word,frequency in sortList[:15]:  # print first 15 most frequent words
  i += 1   # index of word frequency
  print (str(i) + ". " + word + " - " + str(frequency)) 
  
# create a vocabulary based on the sorted frequency list 
vocabulary = []
for word,frequency in sortList:
  vocabulary.append(word)


print(vocabulary[:5])

1. movie - 29648
2. wa - 29577
3. film - 26929
4. one - 15987
5. like - 11876
6. ha - 9893
7. time - 8589
8. good - 8378
9. character - 8318
10. would - 7867
11. even - 7321
12. get - 7199
13. make - 7072
14. see - 7047
15. story - 6843
['movie', 'wa', 'film', 'one', 'like']


In [4]:
def get_vocabulary(training_set, num_features): # retrieve the vocabulary
  dict_word_frequency = {}
  for instance in training_set:
    sentence_tokens = get_list_tokens(instance[0])
    for word in sentence_tokens:
      if word in stopwords: continue
      if word not in dict_word_frequency: dict_word_frequency[word] = 1
      else: dict_word_frequency[word]+=1
  sorted_list = sorted(dict_word_frequency.items(), key = operator.itemgetter(1), reverse=True)[:num_features]
  vocabulary = []
    
  for word,frequency in sorted_list:
    vocabulary.append(word)
  return vocabulary

In [5]:
# transform sentence into vector
def get_vector_text(list_vocab,string):
  vector_text = np.zeros(len(list_vocab))
  list_tokens_string = get_list_tokens(string)
    
  for i, word in enumerate(list_vocab):
    if word in list_tokens_string:        
      vector_text[i] = list_tokens_string.count(word)
    
  return vector_text

In [6]:
def train_svm_classifier(training_set, vocabulary): # function for training svm classifier
  X_train = []
  Y_train = []
  for instance in training_set:
    vector_instance = get_vector_text(vocabulary,instance[0])
    X_train.append(vector_instance)
    Y_train.append(instance[1])
    
  # train the SVM classifier 
  svm_clf = sklearn.svm.SVC(kernel = "linear",gamma = 'auto')
  svm_clf.fit(np.asarray(X_train),np.asarray(Y_train))
  return svm_clf

### Train the model using word frequency feature

In [7]:
# transform review into vector and stored in list
x_train_fre = []
for review in df_train['text']:
  reviewVector = get_vector_text(vocabulary,review)
  x_train_fre.append(reviewVector)
    
x_test_fre = []
for review in df_test['text']:
  reviewVector = get_vector_text(vocabulary,review)
  x_test_fre.append(reviewVector)
    
# obtain the label for reviews
y_train_fre = df_train['label']
y_test_fre = df_test['label']

# transform list to array
x_trainArray_fre = np.asarray(x_train_fre)
x_testArray_fre = np.asarray(x_test_fre)

# train the model with training set
svmModel=sklearn.svm.SVC(gamma='auto')
svmModel.fit(x_trainArray_fre,y_train_fre) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### Results of prediction

In [8]:
# obtain predicted value by pass test set into trained model
y_pred_fre = svmModel.predict(x_testArray_fre)

# comapre with actual value
df_Result_fre = pd.DataFrame({'Predicted Value': y_pred_fre,'Actual Value': y_test_fre})
df_Result_fre.head(20)

Unnamed: 0,Predicted Value,Actual Value
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,0,1
8,0,1
9,1,1


In [9]:
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
from sklearn.metrics import confusion_matrix

# calculate precision, recall, f-measure and accuracy
precision = precision_score(y_test_fre, y_pred_fre, average='macro')
recall = recall_score(y_test_fre, y_pred_fre, average='macro')
f1 = f1_score(y_test_fre, y_pred_fre, average='macro')
accuracy = accuracy_score(y_test_fre, y_pred_fre)

print ("Precision: " + str(round(precision,3)))
print ("Recall: " + str(round(recall,3)))
print ("F1-Score: " + str(round(f1,3)))
print ("Accuracy: " + str(round(accuracy,3)))

# obtain confusion matrix
print (confusion_matrix(y_test_fre, y_pred_fre))

Precision: 0.848
Recall: 0.846
F1-Score: 0.846
Accuracy: 0.846
[[2036  465]
 [ 304 2195]]


### 2. TF-IDF: Transforming the sentences into weighted frequency features using TFidfVectorizer. 

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

lemmatizer = nltk.stem.WordNetLemmatizer()

# process the word to lowercase and its singular format
def get_list_tokens(string):
  sentence_split = nltk.tokenize.sent_tokenize(string)
  list_tokens = []
    
  for sentence in sentence_split:
    list_tokens_sentence = nltk.tokenize.word_tokenize(sentence)
    
    for token in list_tokens_sentence:
      list_tokens.append(lemmatizer.lemmatize(token).lower())
    
  return list_tokens

# obtain stopwords list from nltk
stopwords = set(nltk.corpus.stopwords.words('english'))

# add more words to stopword lsit
stopwords.add(".")
stopwords.add(",")
stopwords.add(";")
stopwords.add("-")
stopwords.add("/")
stopwords.add("<")
stopwords.add("br")
stopwords.add(">")
stopwords.add(")")
stopwords.add("``")
stopwords.add("''")
stopwords.add("...")
stopwords.add("!")
stopwords.add("?")
stopwords.add("(")
stopwords.add("'s")
stopwords.add("n't")

In [11]:
# create a vectorizer
vectorizer = TfidfVectorizer(use_idf = True, stop_words = stopwords, max_features = 500, tokenizer = get_list_tokens)

# transform feature by vectoriser
x_train_tf = vectorizer.fit_transform(df_train['text']).toarray()
print(x_train_tf.shape)   # shape = [n_samples, n_features]
x_test_tf = vectorizer.transform(df_test['text']).toarray()

# obtain intended label for review
y_train_tf = df_train['label']
y_test_tf = df_test['label']


# train a svm model
svmModel_tf=sklearn.svm.SVC(gamma='auto')
svmModel_tf.fit(x_train_tf,y_train_tf)

  'stop_words.' % sorted(inconsistent))


(15000, 500)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### Result for TF-IDF:


In [12]:
# obtain predicted label for test
y_pred_tf = svmModel_tf.predict(x_test_tf)

# compare with actual value
df_Result = pd.DataFrame({'Predicted Value': y_pred_tf,'Actual Value': y_test_tf})
df_Result.head(50)

Unnamed: 0,Predicted Value,Actual Value
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,0,1
9,0,1


In [13]:
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
from sklearn.metrics import confusion_matrix

# calculate precision, recall, f-measure and accuracy
precision = precision_score(y_test_tf, y_pred_tf, average = 'macro')
recall = recall_score(y_test_tf, y_pred_tf, average = 'macro')
f1 = f1_score(y_test_tf, y_pred_tf, average = 'macro')
accuracy = accuracy_score(y_test_tf, y_pred_tf)

print ("Precision: " + str(round(precision,3)))
print ("Recall: " + str(round(recall,3)))
print ("F1-Score: " + str(round(f1,3)))
print ("Accuracy: " + str(round(accuracy,3)))

print (confusion_matrix(y_test_tf, y_pred_tf))

Precision: 0.78
Recall: 0.78
F1-Score: 0.78
Accuracy: 0.78
[[1927  574]
 [ 527 1972]]


### 3. Word Length

In [14]:
# add length column to dataframe and filled with length of review
df_train['length'] = df_train['text'].apply(len)
df_test['length'] = df_test['text'].apply(len)

df_train.head()

Unnamed: 0,text,label,length
0,"For fans of Chris Farley, this is probably his...",1,629
1,"Fantastic, Madonna at her finest, the film is ...",1,302
2,From a perspective that it is possible to make...,1,1102
3,What is often neglected about Harold Lloyd is ...,1,4316
4,You'll either love or hate movies such as this...,1,683


In [15]:
x_train_len = df_train['length']
y_train_len = df_train['label']

x_test_len = df_test['length']
y_test_len = df_test['label']

# transform list to array
x_trainArray_len = np.asarray(x_train_len).reshape(-1, 1)
x_testArray_len = np.asarray(x_test_len).reshape(-1, 1)

# train the svm model
svmModel_len=sklearn.svm.SVC(gamma='auto')
svmModel_len.fit(x_trainArray_len,y_train_len) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### Result for model using length to predict

In [16]:
# obtain predicted label for review
y_pred_len = svmModel_len.predict(x_testArray_len)

# compare with actual value
df_Result_len = pd.DataFrame({'Predicted Value': y_pred_len,'Actual Value': y_test_len})
df_Result_len.head(20)

Unnamed: 0,Predicted Value,Actual Value
0,1,1
1,0,1
2,0,1
3,1,1
4,1,1
5,0,1
6,1,1
7,1,1
8,0,1
9,0,1


In [17]:
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
from sklearn.metrics import confusion_matrix

# calculate precision, recall, f-measure and accuracy
precision = precision_score(y_test_len, y_pred_len, average = 'macro')
recall = recall_score(y_test_len, y_pred_len, average = 'macro')
f1 = f1_score(y_test_len, y_pred_len, average = 'macro')
accuracy = accuracy_score(y_test_len, y_pred_len)

print ("Precision: " + str(round(precision,3)))
print ("Recall: " + str(round(recall,3)))
print ("F1-Score: " + str(round(f1,3)))
print ("Accuracy: " + str(round(accuracy,3)))

print (confusion_matrix(y_test_len, y_pred_len))

Precision: 0.509
Recall: 0.509
F1-Score: 0.509
Accuracy: 0.509
[[1240 1261]
 [1195 1304]]


### Combine features of word frequency and tf-idf to train a model

In [18]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

# concatenate two types of featuress
x_train_all = []
x_train_all = np.hstack(( x_trainArray_fre,x_train_tf ))
x_test_all = []
x_test_all = np.hstack(( x_testArray_fre,x_test_tf ))

# obtain label
y_train_all = df_train['label']
y_test_all = df_test['label']

# use Chi-square and Select K Best to select top 500 relevant features
sentAnalysis = SelectKBest(chi2, k=500).fit(x_train_all, y_train_all)

# obtain new features
X_train_new = sentAnalysis.transform(x_train_all)
X_test_new = sentAnalysis.transform(x_test_all)

# train the model
svmModel_all=sklearn.svm.SVC(gamma='auto')  
svmModel_all.fit(X_train_new,y_train_all) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### Result for combined model

In [19]:
# obtain predicted value
y_pred_all = svmModel_all.predict(X_test_new)

# compare with actual label
df_Result_all = pd.DataFrame({'Predicted Value': y_pred_all,'Actual Value': y_test_all})
df_Result_all.head(20)

Unnamed: 0,Predicted Value,Actual Value
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,0,1
8,0,1
9,1,1


In [20]:
precision = precision_score(y_test_all, y_pred_all, average = 'macro')
recall = recall_score(y_test_all, y_pred_all, average = 'macro')
f1 = f1_score(y_test_all, y_pred_all, average = 'macro')
accuracy = accuracy_score(y_test_all, y_pred_all)

# calculate precision, recall, f-measure and accuracy
print ("Precision: " + str(round(precision,3)))
print ("Recall: " + str(round(recall,3)))
print ("F1-Score: " + str(round(f1,3)))
print ("Accuracy: " + str(round(accuracy,3)))

# obtain confusion matrix
print (confusion_matrix(y_test_all, y_pred_all))

Precision: 0.849
Recall: 0.848
F1-Score: 0.848
Accuracy: 0.848
[[2036  465]
 [ 296 2203]]
