In [1]:
import numpy as np
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.model_selection import ParameterGrid

In [2]:
# Load the three tab-separated Yelp splits. The files have no header row, so
# the two columns are named here; everything is read as str and the score
# column is converted to an integer further down.
train = pd.read_csv("hwk3_datasets/yelp-train.txt", delimiter='\t', dtype=str, header=None, names=['Review', 'Score'])
test = pd.read_csv("hwk3_datasets/yelp-test.txt", delimiter='\t', dtype=str, header=None, names=['Review', 'Score'])
val = pd.read_csv("hwk3_datasets/yelp-valid.txt", delimiter='\t', dtype=str, header=None, names=['Review', 'Score'])

# Translation table that deletes every character in string.punctuation.
transtab = str.maketrans('', '', string.punctuation)


def _normalize_reviews(reviews):
    """Return ``reviews`` with punctuation removed and text lowercased.

    Uses the vectorized ``.str`` accessor and returns a new Series, so the
    caller assigns the whole column at once. This replaces the previous
    element-wise ``frame['Review'][i] = ...`` writes: that chained indexing
    pattern emits SettingWithCopyWarning and, with pandas Copy-on-Write
    enabled, does not modify the frame at all.
    """
    return reviews.str.translate(transtab).str.lower()


train['Review'] = _normalize_reviews(train['Review'])
test['Review'] = _normalize_reviews(test['Review'])
val['Review'] = _normalize_reviews(val['Review'])

# Scores were read as strings; convert them to 32-bit integers for use as labels.
train['Score'] = train['Score'].astype(np.int32)
test['Score'] = test['Score'].astype(np.int32)
val['Score'] = val['Score'].astype(np.int32)

# Binary bag-of-words features capped at 10000 vocabulary entries.
vectorizer = CountVectorizer(max_features=10000, binary=True)

# The vocabulary is fitted on the training reviews only; the test and
# validation reviews are merely transformed with that vocabulary.
train_vectors = vectorizer.fit_transform(train['Review'])
test_vectors = vectorizer.transform(test['Review'])
val_vectors = vectorizer.transform(val['Review'])

In [3]:
def eval_classifier(classifier):
    """Fit ``classifier`` on the training split and report macro-F1 on every split.

    The classifier is fitted in place on the module-level ``train_vectors`` /
    ``train['Score']`` and then used to predict the train, test and validation
    feature matrices. If the estimator rejects the sparse matrices, the whole
    fit/predict sequence is repeated on dense copies.

    Parameters:
        classifier: an object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The macro-averaged F1 score on the validation split. The train,
        validation and test scores are also printed.
    """
    train_labels = train['Score']

    try:
        classifier.fit(train_vectors, train_labels)
        train_y = classifier.predict(train_vectors)
        test_y = classifier.predict(test_vectors)
        val_y = classifier.predict(val_vectors)
    except TypeError:
        # scikit-learn's input validation raises TypeError when a sparse
        # matrix is passed to an estimator that needs dense data. Catching
        # only that (instead of a bare ``except:``) keeps Ctrl-C working and
        # lets unrelated failures surface immediately.
        dense_train = train_vectors.toarray()  # densify once, reuse for fit and predict
        classifier.fit(dense_train, train_labels)
        train_y = classifier.predict(dense_train)
        test_y = classifier.predict(test_vectors.toarray())
        val_y = classifier.predict(val_vectors.toarray())

    train_f1 = f1_score(train['Score'], train_y, average='macro')
    val_f1 = f1_score(val['Score'], val_y, average='macro')
    test_f1 = f1_score(test['Score'], test_y, average='macro')

    print(type(classifier))
    print(f"Train F1: {train_f1}")
    print(f"Val F1: {val_f1}")
    print(f"Test F1: {test_f1}")
    print("\n")
    return val_f1

In [4]:
def eval_classifier_quick(classifier):
    """Fit ``classifier`` on the training split and return validation macro-F1.

    Silent variant used during hyper-parameter search: nothing is printed and
    only the validation split is predicted. The classifier is fitted in place
    on the module-level ``train_vectors`` / ``train['Score']``.

    Parameters:
        classifier: an object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The macro-averaged F1 score on the validation split.
    """
    try:
        classifier.fit(train_vectors, train['Score'])
        val_y = classifier.predict(val_vectors)
    except TypeError:
        # scikit-learn raises TypeError when an estimator that requires dense
        # input is given a sparse matrix; retry on dense copies. A bare
        # ``except:`` here would also swallow KeyboardInterrupt and hide
        # genuine errors behind a second, more expensive fit.
        classifier.fit(train_vectors.toarray(), train['Score'])
        val_y = classifier.predict(val_vectors.toarray())

    val_f1 = f1_score(val['Score'], val_y, average='macro')
    return val_f1
    
def test_classifier(classifier):
    """Fit ``classifier`` on the training split and return test macro-F1.

    The classifier is fitted in place on the module-level ``train_vectors`` /
    ``train['Score']`` and then scored on ``test_vectors`` / ``test['Score']``.

    Parameters:
        classifier: an object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The macro-averaged F1 score on the test split.
    """
    try:
        classifier.fit(train_vectors, train['Score'])
        test_y = classifier.predict(test_vectors)
    except TypeError:
        # scikit-learn raises TypeError when an estimator that requires dense
        # input is given a sparse matrix; retry on dense copies. A bare
        # ``except:`` here would also swallow KeyboardInterrupt and hide
        # genuine errors behind a second, more expensive fit.
        classifier.fit(train_vectors.toarray(), train['Score'])
        test_y = classifier.predict(test_vectors.toarray())

    test_f1 = f1_score(test['Score'], test_y, average='macro')
    return test_f1

In [5]:
# Two reference baselines: one predicts labels uniformly at random (seeded
# for reproducibility), the other always predicts the most frequent label.
random = DummyClassifier(strategy='uniform', random_state=10)
maj = DummyClassifier(strategy='most_frequent')

# Score each baseline on the test split, then report it.
random_test_f1 = test_classifier(random)
print(f"F1 Score of Random Classifier on Test: {random_test_f1}\n")

majority_test_f1 = test_classifier(maj)
print(f"F1 Score of Majority Classifier on Test: {majority_test_f1}")

F1 Score of Random Classifier on Test: 0.17230888773618042

F1 Score of Majority Classifier on Test: 0.10392301998519615


  'precision', 'predicted', average, warn_for)


In [6]:
# Candidate hyper-parameter values for each model. ParameterGrid expands a
# dict of lists into every combination of the listed values.
bayes_params = ParameterGrid({
    'alpha': [0.01, 0.1, 0.5, 1],
})
tree_params = ParameterGrid({
    'random_state': [10],
    'max_depth': [None, 10, 100, 1000],
    'min_samples_split': [2, 5, 10],
})
svm_params = ParameterGrid({
    'random_state': [10],
    'loss': ['hinge', 'squared_hinge'],
    'C': [0.5, 1.0, 2.0],
})

# (estimator class, parameter grid) pairs; the class is instantiated once per
# parameter combination during the search.
classifiers = [
    (BernoulliNB, bayes_params),
    (DecisionTreeClassifier, tree_params),
    (svm.LinearSVC, svm_params),
]

In [7]:
def best_parameters(classifier, param_grid, scorer=None):
    """Search ``param_grid`` and return an unfitted classifier with the best parameters.

    Each parameter combination is used to build ``classifier(**params)``, which
    is handed to ``scorer``; the combination with the highest score wins. On
    ties the earliest combination is kept.

    Parameters:
        classifier: a callable (typically an estimator class) that accepts the
            grid's parameters as keyword arguments.
        param_grid: an iterable of parameter dicts.
        scorer: optional callable taking a classifier instance and returning a
            number where higher is better. Defaults to the module-level
            ``eval_classifier_quick``.

    Returns:
        A new ``classifier(**best_params)`` instance.

    Raises:
        ValueError: if the grid is empty or no combination produced a
            comparable score (for example every score was NaN).
    """
    if scorer is None:
        scorer = eval_classifier_quick

    # Start below any real score so that a legitimate score of 0 can still be
    # selected; starting at 0 left best_params as None in that case and the
    # final classifier(**None) call failed with an unhelpful TypeError.
    best_Score = float('-inf')
    best_params = None
    for params in param_grid:
        print(f"Trying: {params}")
        Score = scorer(classifier(**params))
        print(f"F1 Score Validation: {Score}\n")
        if Score > best_Score:
            best_Score = Score
            best_params = params

    if best_params is None:
        raise ValueError(
            "best_parameters found no usable parameter combination: "
            "the grid was empty or every score was NaN"
        )

    print(f"Best params for Validation: {best_params}")
    print(f"Best F1 Score on Validation: {best_Score}\n")
    return classifier(**best_params)

In [8]:
# For every (estimator class, grid) pair: pick the best parameters on the
# validation split, then report that configuration's score on the test split.
for pair in classifiers:
    classifier, param_grid = pair
    print(classifier)
    best_classifier = best_parameters(classifier, param_grid)
    test_score = test_classifier(best_classifier)
    print(f"Test score for best params: {test_score}\n")

<class 'sklearn.naive_bayes.BernoulliNB'>
Trying: {'alpha': 0.01}
F1 Score Validation: 0.38051604534431227

Trying: {'alpha': 0.1}
F1 Score Validation: 0.3721284609021715

Trying: {'alpha': 0.5}
F1 Score Validation: 0.36264671577037066

Trying: {'alpha': 1}
F1 Score Validation: 0.3369298973133679

Best params for Validation: {'alpha': 0.01}
Best F1 Score on Validation: 0.38051604534431227

Test score for best params: 0.36572174954585585

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Trying: {'max_depth': None, 'min_samples_split': 2, 'random_state': 10}
F1 Score Validation: 0.25071684622351753

Trying: {'max_depth': None, 'min_samples_split': 5, 'random_state': 10}
F1 Score Validation: 0.26523413087325115

Trying: {'max_depth': None, 'min_samples_split': 10, 'random_state': 10}
F1 Score Validation: 0.25921883090992326

Trying: {'max_depth': 10, 'min_samples_split': 2, 'random_state': 10}
F1 Score Validation: 0.2877110115742789

Trying: {'max_depth': 10, 'min_samples_split': 5, 'ra