## Text analysis on the MOUD dataset

This notebook trains a linear SVM sentiment classifier on the utterance transcriptions of the MOUD dataset, using bag-of-words counts reweighted with TF-IDF as features.

In [1]:
# Glob patterns for the train and test transcription CSV files.
# The data is separated in an 80-20 ratio and the test directory is untouched.
_transcriptions_dir = r"C:\Users\Roshan Sridhar\Google Drive\Documents\NYU\GILAB\MMML\Datasets\MOUD\VideoReviews\transcriptions"
train_path = _transcriptions_dir + r"\train\*.csv"
test_path = _transcriptions_dir + r"\test\*.csv"

In [2]:
import glob
import numpy as np
import pandas as pd
import sklearn

In [3]:
# Load every training transcription (';'-separated CSV) into one DataFrame.
# The per-file frames are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and calling it in a loop
# re-copied the whole accumulated frame on every iteration.
frames = [pd.read_csv(f, sep=';') for f in glob.glob(train_path)]
# pd.concat raises on an empty list, so fall back to an empty frame
# when the glob pattern matches no files.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.head()

Unnamed: 0,#endtime,#starttime,Speech,sentimentAnnotation,sentimentAnnotations,sentimentannotations,speech,transcription
0,3.642,0.0,,,-1.0,,,yo habia visto resenas que decian que picaba c...
1,9.552,3.642,,,-1.0,,,y la verdad es que si la use una vez y t- y te...
2,14.197,9.552,,,-1.0,,,y dije no: puede ser posible tanto la deseaba ...
3,20.545,14.197,,,-1.0,,,esta tambien tira un poquito de pelo pero haga...
4,23.275,20.545,,,1.0,,,pero igual con las lavadas se ha dejado de tir...


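As shown, the utterance text and the sentiment label are each spread over several differently named columns, depending on the source file. These columns need to be consolidated into one text column and one label column, with missing values handled along the way.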

In [4]:
# function to append all utterances to dataframe
def create_data_df(df_name,data_path):
    """Load transcription CSVs and consolidate them into a two-column frame.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame whose rows are kept ahead of the rows read from disk
        (pass an empty ``pd.DataFrame()`` to start from scratch).
        It is not modified.
    data_path : str
        Glob pattern matching the ';'-separated transcription CSV files.

    Returns
    -------
    pandas.DataFrame
        Columns ``Speech`` (utterance text) and ``sentimentAnnotation``
        (label), with a fresh 0..n-1 index. Rows whose consolidated
        label equals 0 are dropped.
    """
    # Read each file once and concatenate a single time. DataFrame.append was
    # removed in pandas 2.0 and copied the accumulated frame on every call.
    frames = [pd.read_csv(f, sep=';') for f in glob.glob(data_path)]
    if not df_name.empty:
        frames.insert(0, df_name)
    # pd.concat raises on an empty list (glob matched nothing, nothing passed in).
    combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # The text lives in one of several differently named columns depending on
    # the file; missing cells become '' so concatenating yields the one
    # populated value (also when only a single text column exists).
    speech = pd.Series('', index=combined.index, dtype=object)
    for col in ('Speech', 'speech', 'transcription'):
        if col in combined.columns:
            speech = speech + combined[col].fillna('').astype(str)

    # Same idea for the labels: missing cells count as 0, so the sum is the
    # one populated annotation, and rows with no annotation end up as 0.
    sentiment = pd.Series(0, index=combined.index)
    for col in ('sentimentAnnotation', 'sentimentAnnotations', 'sentimentannotations'):
        if col in combined.columns:
            sentiment = sentiment + combined[col].fillna(0)

    result = pd.DataFrame({'Speech': speech, 'sentimentAnnotation': sentiment})

    # Remove neutral annotations
    result = result[result['sentimentAnnotation'] != 0]

    return result.reset_index(drop=True)

Cleaned transcriptions

In [5]:
# Build the consolidated training and test frames, each starting from an
# empty DataFrame, then preview the test frame.
df = create_data_df(pd.DataFrame(), train_path)
df_t = create_data_df(pd.DataFrame(), test_path)
df_t.head()

Unnamed: 0,Speech,sentimentAnnotation
0,pero de verdad lo recomiendo,1
1,porque es que: eh: tiene de todo o sea no es el:,1
2,no he leido nunca ningun libro de esos zombies,-1
3,pero de peliculas y tal a mi que tampoco me su...,1
4,"que la verdad no me gusta, pero no estamos hab...",-1


#### ALTERNATE Converting video to test data

This alternate section feeds a single video to the trained model and performs sentiment analysis on it. Set the path of the audio extracted from the video in the next code cell; IBM Bluemix Speech to Text is then used to obtain the transcription, which replaces the test set.

Otherwise, skip to "Data cleaning and text preprocessing".

In [None]:
# Convert video to audio, insert path
# vpath = r"C:\Datasets\MOUD\VideoReviews\178_makeup.mp4"

# Path of the audio file to transcribe. A WAV file produced by an external
# video-to-audio converter is used because feeding the video file directly
# did not operate as intended.
apath = r"C:\Users\Roshan Sridhar\Google Drive\Documents\NYU\GILAB\MMML\Python\MOUD\Text_Video\178_makeup.wav"

In [None]:
# Using IBM Bluemix to convert Speech to Text 
import json
import os
from os.path import join, dirname
from watson_developer_cloud import SpeechToTextV1

# Service credentials are read from the environment instead of being stored in
# the notebook; a missing variable raises KeyError naming it.
# NOTE(review): the username/password that used to be hard-coded here were
# committed in plain text and should be revoked and regenerated.
speech_to_text = SpeechToTextV1(
    username=os.environ['WATSON_STT_USERNAME'],
    password=os.environ['WATSON_STT_PASSWORD'],
    x_watson_learning_opt_out=False
)

# Send the WAV file for recognition; the response is kept in `trn`.
with open(apath, 'rb') as audio_file:
    trn = speech_to_text.recognize(
        audio_file, content_type='audio/wav', timestamps=False, model='es-ES_BroadbandModel',
        word_confidence=False)

In [None]:
# Turn the recognition response into the test frame: one row per result,
# taking the transcript of its first alternative. The sentiment of these
# utterances is not annotated, so the label column holds 'Unknown'.
transcript_rows = [
    [result['alternatives'][0]['transcript'], 'Unknown']
    for result in trn['results']
]
df_t = pd.DataFrame(transcript_rows, columns=['Speech', 'sentimentAnnotation'])
print("Converted Speech to Text utterances in the Speech column")
df_t.head()

### Data cleaning and text preprocessing

In [6]:
# from https://www.kaggle.com/c/word2vec-nlp-tutorial/
import re
from bs4 import BeautifulSoup
import nltk

# execute the following commented step to install the data packages if you don't already have it  
# nltk.download()

from nltk.corpus import stopwords

# reusable function to convert raw speech to preprocessed text
_SPANISH_STOPS_CACHE = {}


def _spanish_stop_words():
    """Return the NLTK Spanish stop words as a frozenset, built once and then reused."""
    if 'spanish' not in _SPANISH_STOPS_CACHE:
        _SPANISH_STOPS_CACHE['spanish'] = frozenset(stopwords.words("spanish"))
    return _SPANISH_STOPS_CACHE['spanish']


def utterance_to_words(raw_utterance):
    """Convert one raw utterance into a space-separated string of cleaned words.

    Steps: strip HTML markup, fold accented letters to their base letter,
    replace every non-letter with a space, lower-case, split into words and
    drop the NLTK Spanish stop words.

    Parameters
    ----------
    raw_utterance : str
        The utterance text. ``None`` and NaN are treated as an empty
        utterance; other non-string values are converted with ``str``.

    Returns
    -------
    str
        The remaining words joined by single spaces (may be empty).
    """
    import unicodedata

    # Missing values (None / float NaN) carry no text.
    if raw_utterance is None or (isinstance(raw_utterance, float) and raw_utterance != raw_utterance):
        return ""
    # 1. Removing HTML elements from text
    utterance_text = BeautifulSoup(str(raw_utterance), "lxml").get_text()
    # 2. Fold accents (e.g. 'reseñas' -> 'resenas') so the ASCII-only filter
    #    below does not cut accented words apart. Plain-ASCII text is unchanged.
    decomposed = unicodedata.normalize("NFKD", utterance_text)
    unaccented = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # 3. Keeping only letters
    letters_only = re.sub("[^a-zA-Z]", " ", unaccented)
    # 4. Converting to lower case and splitting into individual words
    lowercase_words = letters_only.lower().split()
    # 5. Removing stop words from the text (set built once, not per utterance)
    spanish_stops = _spanish_stop_words()
    meaningful_words = [w for w in lowercase_words if w not in spanish_stops]
    # 6. Join the words back into one string separated by space, and return the result.
    return " ".join(meaningful_words)
    
# Preprocess the Speech column of both the training and the test frame,
# then preview the training frame.
for frame in (df, df_t):
    frame['Speech'] = frame['Speech'].apply(utterance_to_words)
df.head()

Unnamed: 0,Speech,sentimentAnnotation
0,habia visto resenas decian picaba usabas,-1.0
1,verdad si use vez t arde asi usas arde ojo,-1.0
2,dije puede ser posible deseaba arde voy poder ...,-1.0
3,tambien tira poquito pelo hagan cuenta quebra ...,-1.0
4,igual lavadas dejado tirar,1.0


### Machine learning

In [7]:
# Training inputs (utterance text) and targets (sentiment labels)
X_trn = df[['Speech']]
y_trn = df[['sentimentAnnotation']]

# Bag-of-words vectorizer limited to 5000 features
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    lowercase=True,
    max_features=5000,
)

# Fit the vocabulary on the training utterances and count the words
train_data_features = vectorizer.fit_transform(X_trn['Speech'].values)

# Fit the TF-IDF weighting on the training counts and apply it to them
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(train_data_features)

In [8]:
# Test inputs (utterance text) and targets (sentiment labels)
X_tst = df_t[['Speech']]
y_tst = df_t[['sentimentAnnotation']]

# Apply the vectorizer and TF-IDF weighting already fitted on the training
# set; they are only transformed here, not re-fitted.
test_data_features = vectorizer.transform(X_tst['Speech'].values)
X_test_tfidf = tfidf_transformer.transform(test_data_features)

In [9]:
# SVM model creation and fitting train vector to annotations

from sklearn import svm
# gamma is not passed: it is the coefficient of the rbf/poly/sigmoid kernels
# and has no effect on a linear kernel.
model_tf = svm.SVC(kernel='linear', C=1).fit(X_train_tfidf, y_trn['sentimentAnnotation'].values)

# generate predictions
predicted_tf = model_tf.predict(X_test_tfidf)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_tf))

# create df to show results: text, true label, prediction and whether they agree
disp = X_tst.join(y_tst).reset_index(drop=True).join(pd.DataFrame(predicted_tf,columns=['Prediction']))
disp['Right/Wrong'] = disp['sentimentAnnotation'] == disp['Prediction']

# score() returns one accuracy value for the whole test set, so there is no
# spread to report; calling .mean()/.std() on it printed a meaningless
# "+/- 0.00" and fails when a plain Python float is returned.
test_accuracy = model_tf.score(X_test_tfidf, y_tst['sentimentAnnotation'].values)
print("Accuracy: %0.2f\n" % test_accuracy)

# Overall sentiment of the test utterances: sign of the mean label
print("Mean sentiment: {!r}.".format('Positive' if disp['sentimentAnnotation'].mean()>=0 else 'Negative'))
print("Predicted mean sentiment: {!r}.".format('Positive' if disp['Prediction'].mean()>=0 else 'Negative'))
disp.head()

             precision    recall  f1-score   support

         -1       0.62      0.90      0.73        41
          1       0.89      0.57      0.70        54

avg / total       0.77      0.72      0.71        95

Accuracy: 0.72 (+/- 0.00)

Mean sentiment: 'Positive'.
Predicted mean sentiment: 'Negative'.


Unnamed: 0,Speech,sentimentAnnotation,Prediction,Right/Wrong
0,verdad recomiendo,1,1.0,True
1,eh,1,-1.0,False
2,leido nunca ningun libro zombies,-1,1.0,False
3,peliculas tal tampoco suelen hacer mucha graci...,1,-1.0,False
4,verdad gusta hablando,-1,1.0,False


In [10]:
# 10-fold cross validation of the SVM on the training set
from sklearn.model_selection import cross_val_score
clf_cv = svm.SVC(kernel='linear', C=1, gamma=1)
train_labels = y_trn['sentimentAnnotation'].values
scores = cross_val_score(clf_cv, X_train_tfidf, train_labels, cv=10)
# Report the mean fold accuracy with a +/- two-standard-deviation spread
cv_mean, cv_spread = scores.mean(), scores.std() * 2
print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (cv_mean, cv_spread))

Cross validation accuracy: 0.64 (+/- 0.15)
