## Natural Language Processing

In [3]:
# Import necessary packages, take in data
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from gensim import matutils, models
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

Eval = pd.read_csv('ICE2_data_eval.csv')

In [4]:
# Turn dataset into long format
evalClean = pd.DataFrame(columns = ['eval', 'comment', 'category'])
for i in range(0, Eval.shape[1] // 2):
    cat_data = Eval.iloc[:, [i * 2, i * 2 + 1]]
    category = cat_data.columns[0]
    cat_data['category'] = category
    cat_data.columns = ['eval', 'comment', 'category']
    evalClean = evalClean.append(cat_data, ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_data['category'] = category


In [5]:
# Make text lowercase, remove punctuations and numbers
def clean_text(text):
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[0-9]+','', text)
    return text

evalClean['comment'] = evalClean['comment'].map(clean_text)

## Organizing Data

To allow for meaningful quantitatve analysis of the comments, we need to tokenize the texts and somehow simplify the entire set of tokens.  
There are two concepts of simplification:  
1. Bag-of-words: Treating each document as a collection of words, where order and grammar is irrelevant.  
2. Stop words: Eliminate commonly used words such as "a", "the", "is", "are", etc.  

We can go about automating this simplifying process by applying the `CountVectorizer()` from scikit-learn.  
This creates a documen-term matrix in a BOW style without the common stopwords.

In [15]:
# Create a document-term matrix using CountVectorizer
cv = CountVectorizer(stop_words = 'english')
commentCV = cv.fit_transform(evalClean['comment'])
commentCV_dtm  = pd.DataFrame(commentCV.toarray(), columns = cv.get_feature_names())
commentCV_dtm.index = evalClean['comment'].index
commentCV_dtm

Unnamed: 0,abilities,ability,able,abroad,absolutely,absurd,abt,academic,accessable,accitivties,...,works,world,worth,write,writing,wrong,yeah,year,years,yes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1108,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Exploratory Analysis

In [7]:
evalClean.groupby(['eval', 'category']).count().unstack()

Unnamed: 0_level_0,comment,comment,comment,comment,comment,comment
category,coursecontent,examination,extracurricular,labwork,library_facilities,teaching
eval,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
-1,30,24,12,37,31,13
0,27,31,19,16,24,35
1,128,130,154,132,130,137


We find that for each category, there are about 120-150 positive (1) comments, and about 10-30 neutral(0)  and negative (1) comments each.  
The course reception seems to be positive overall, but there are a non-trivial number of people that were dissatisfied.

In [8]:
# Look at the document-term matrix, and calculate the most frequently used words among the documents
totalCT = commentCV_dtm.sum()
commentCV_dtm[totalCT.sort_values(ascending = False).index[:20]].sum()

good          654
excellent      74
students       62
university     61
library        48
books          47
course         43
pattern        40
teachers       39
lab            39
activities     37
knowledge      36
time           32
teaching       31
content        31
work           30
paper          30
checking       30
courses        29
average        29
dtype: int64

## Topic Modeling

In [10]:
tdm = commentCV_dtm.transpose()
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)
# id2word = dict((v, p) for p, v in cv.vocabulary_.items())
# lda = models.LdaModel(corpus = corpus, id2word = id2word, num_topics = 3, passes = 20)
# lda.print_topics()

sparse_counts

<988x1110 sparse matrix of type '<class 'numpy.int64'>'
	with 3728 stored elements in Compressed Sparse Row format>

## Classifying Text

In [16]:
Xs_docs = evalClean['comment']
Ys_evals = evalClean['eval']
Ys_evals = Ys_evals.astype('int64')
xs_training, xs_test, y_training, y_test = train_test_split(Xs_docs, Ys_evals, test_size = 0.2)

In [20]:
# Prepare the training features
cv = CountVectorizer(stop_words = 'english')
features = cv.fit_transform(xs_training)

#Train a multinomial Naive Bayes Model
model = MultinomialNB()
model.fit(features, y_training)

#Preparing the testing xs
feature_test = cv.transform(xs_test)

#Find the accuracy of the model
print(model.score(feature_test, y_test))

0.8063063063063063
