In [1]:
import string
import re
from time import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

import nltk
from nltk.corpus import stopwords

from sklearn.metrics import accuracy_score, fbeta_score

%matplotlib inline

RANDOM_STATE = 0




In [2]:
#nltk.download()

In [3]:
data_train_orig = pd.read_csv('./data/sa-emotions/train_data.csv')
data_train_orig.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [4]:
def train_predict(learner, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])
    start = time() # Get start time
    learner = learner.fit(X_train, y_train)
    end = time() # Get end time
    
    # TODO: Calculate the training time
    #results['train_time'] = end - start
        
    # TODO: Get the predictions on the test set(X_test),
    #       then get predictions on the first 300 training samples(X_train) using .predict()
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    end = time() # Get end time
    
    # TODO: Calculate the total prediction time
   # results['pred_time'] = end  - start
            
    # TODO: Compute accuracy 
    results['acc_train'] = accuracy_score(y_train, predictions_train)
        
    # TODO: Compute accuracy on test set using accuracy_score()
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # TODO: Compute F-score using fbeta_score()
    #results['f_train'] = fbeta_score(y_train, predictions_train, average=None, beta=0.5)
        
    # TODO: Compute F-score on the test set which is y_test
    #results['f_test'] = fbeta_score(y_test, predictions_test, average=None, beta=0.5)
       
        
    # Return the results
    return results

In [5]:
def process_text(text):
    ps = nltk.PorterStemmer()
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords.words('english')]
    return text

In [13]:
print(data_train_orig["content"][100])
process_text(data_train_orig["content"][100])

First ever dropped call on my mobile. On a call to @Telstra no less! ( being charged for data even though I have a data pack  )


['first',
 'ever',
 'drop',
 'call',
 'mobil',
 'call',
 'telstra',
 'less',
 'charg',
 'data',
 'even',
 'though',
 'data',
 'pack',
 '']

In [6]:
tfidf_vect = TfidfVectorizer(analyzer=process_text)
tfidf_vect_fit = tfidf_vect.fit(data_train_orig['content'])

tfidf_train = tfidf_vect_fit.transform(data_train_orig['content'])

X_train_vect = pd.DataFrame(tfidf_train.toarray())

X_train_vect.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37555,37556,37557,37558,37559,37560,37561,37562,37563,37564
0,0.192934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:


y_train = data_train_orig['sentiment'].factorize()[0]
y_train = data_train_orig['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X_train_vect, y_train, random_state=RANDOM_STATE, test_size=0.2)



In [9]:
clf_RF = RandomForestClassifier(n_estimators=200, max_depth=None,random_state=RANDOM_STATE, n_jobs=-1)
clf_NB = MultinomialNB()
clf_SVC = SVC(random_state=RANDOM_STATE)
clf_LSVC = LinearSVC(random_state=RANDOM_STATE)
results = {}
for clf in [clf_RF, clf_NB]:#, clf_SVC, clf_LSVC]:
    clf_name= clf.__class__.__name__
    results[clf_name] = train_predict(clf, X_train, y_train, X_test, y_test)
    

In [10]:
results

{'RandomForestClassifier': {'acc_train': 0.9945, 'acc_test': 0.329},
 'MultinomialNB': {'acc_train': 0.424375, 'acc_test': 0.2793333333333333}}