In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

#Load the unclassified reviews from Json file
with open('unclassified.json', 'r') as f:
    data=json.load(f)
    
    #convert data into pandas dataframe
    df = pd.DataFrame(data, columns=['review'])
  
    
    #Create new columns for labels
    df['label']=None

In [2]:
df['sentiment'] = np.nan

# here we manually assign the sentiment for the first 5 rows to test our models later
df.iloc[:5, 1] = ['good', 'good', 'bad', 'bad', 'good'] * 5


ValueError: Must have equal len keys and value when setting with an iterable

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)


In [None]:
vectorizers = {'count': CountVectorizer(), 'tfidf': TfidfVectorizer()}
models = {'mnb': MultinomialNB(), 'bnb': BernoulliNB(), 'lr': LogisticRegression(), 'svm': SVC()}

model_dict = {}


In [None]:
for vec_name, vec in vectorizers.items():
    for model_name, model in models.items():
        params = {}
        if 'count' in vec_name:
            params.update({'vec__ngram_range': [(1,1), (1,2)],
                            'vec__min_df': [1, 2],
                            'vec__max_df': [0.9, 0.95, 1.0]})
        else:
            params.update({'vec__ngram_range': [(1,1), (1,2)],
                            'vec__min_df': [1, 2],
                            'vec__max_df': [0.9, 0.95, 1.0],
                            'vec__use_idf': [True, False],
                            'vec__norm': ['l1', 'l2']})

        params.update({'clf__alpha': [0.1, 1.0],
                       'clf__penalty': ['l1', 'l2'],
                       'clf__C': [0.1, 1.0],
                       'clf__kernel': ['linear', 'rbf']})
        
        pipeline = Pipeline([('vec', vec), ('clf', model)])
        
        gs = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
        gs.fit(X_train, y_train)
        model_dict[f'{vec_name}_{model_name}'] = gs.best_estimator_


In [None]:
for model_name, model in model_dict.items():
    y_pred = model.predict(X_test)
    print(f'Model: {model_name}')
    print('f')
