In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score,f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
def clean_review(review, remove_stopwords=True):
    '''Normalise one raw review into a space-separated string of lowercase words.

    The text is lower-cased, HTML line-break tags are dropped, every
    character outside a-z is replaced by a space, and the result is
    tokenized with NLTK's word_tokenize.

    Input - review: the raw review text (str)
            remove_stopwords: when True (the default) every token found in
            the module-level `stop_words` collection is dropped; pass False
            to keep all tokens
    Output - the cleaned review as a single string, tokens joined by one space
    '''
    # Lower-case first so the a-z filter below does not discard capitals
    review = review.lower()

    # Drop line-break tags in any spelling (<br>, <br/>, <br />); otherwise the
    # letter filter below would strip the brackets and leave a stray "br" word
    review = re.sub(r"<br\s*/?>", " ", review)

    # Anything that is not a lowercase ASCII letter becomes a space
    review = re.sub(r"[^a-z]", " ", review)

    # Collapse every run of whitespace into a single space
    review = re.sub(r"\s+", " ", review)

    tokens = word_tokenize(review)
    if remove_stopwords:
        tokens = [w for w in tokens if w not in stop_words]

    # A string is returned (not a list) so it can be fed straight to a vectorizer
    return " ".join(tokens)


def clean_data(train_reviews, test_reviews):
    '''Run clean_review over every review of both splits.

    Input - train and test reviews (iterables of raw review texts)
    Output - tuple (cleaned train reviews, cleaned test reviews); each is a
             list holding one cleaned entry per input review, in input order
    '''
    # Train split is processed in full before the test split
    cleaned_train = [clean_review(raw) for raw in train_reviews]
    cleaned_test = [clean_review(raw) for raw in test_reviews]

    return cleaned_train, cleaned_test
    
def create_bag_words(train_reviews, test_reviews):
    '''Build bag-of-words count features for the train and test reviews.

    The vocabulary is learned from the training reviews only; the test
    reviews are then encoded with that same fitted vectorizer.

    Input - train and test reviews (iterables of review strings)
    Output - tuple (fitted CountVectorizer, train feature matrix,
             test feature matrix); both matrices are converted to dense
             arrays with toarray() before being returned
    '''
    vectorizer = CountVectorizer()

    # fit_transform learns the vocabulary and encodes the training reviews in
    # a single pass, instead of a fit() followed by a second transform() pass
    # over the same corpus
    train_counts = vectorizer.fit_transform(train_reviews)
    test_counts = vectorizer.transform(test_reviews)

    return vectorizer, train_counts.toarray(), test_counts.toarray()

In [3]:
# English stopwords from the NLTK corpus, stored as a set; clean_review
# checks each token for membership in this collection
stop_words = set(stopwords.words("english"))

In [4]:
# Load the train / test splits from CSV
train = pd.read_csv('trainn.csv')
test = pd.read_csv('testt.csv')

# Target labels for each split
y_train = train['label']
y_test = test['label']

# Raw review texts -> cleaned review strings
train_reviews, test_reviews = clean_data(list(train['review']), list(test['review']))

# Cleaned strings -> bag-of-words count matrices
vectorizer, X_train, X_test = create_bag_words(train_reviews, test_reviews)

# Entropy-criterion decision tree; random_state is fixed so reruns match
model = DecisionTreeClassifier(criterion="entropy", random_state=13).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy is stored in test_accuracy but not printed in this cell
test_accuracy = accuracy_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)

print(conf)
print(classification_report(y_test, y_pred))

[[347 153]
 [131 369]]
              precision    recall  f1-score   support

           0       0.73      0.69      0.71       500
           1       0.71      0.74      0.72       500

    accuracy                           0.72      1000
   macro avg       0.72      0.72      0.72      1000
weighted avg       0.72      0.72      0.72      1000

