In [1]:
import pandas as pd
import os
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


Import data to DF

In [14]:
train_data = []
test_data = []    
# Hardcoded pad naar de hoofdmap
base_path = r"raw_files/negative_polarity"   

# Loop door de klassen (deceptive en truthful)
for class_folder in os.listdir(base_path):
    class_path = os.path.join(base_path, class_folder)
        
    if os.path.isdir(class_path):
        subfolders = os.listdir(class_path)
        # pak de eerste 4 folder als train set
        for subfolder in subfolders[:-1]:  
            subfolder_path = os.path.join(class_path, subfolder)
            for file_name in os.listdir(subfolder_path):
                file_path = os.path.join(subfolder_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:

                    #lowercase alle woorden
                    content = file.read().lower()     
                    # Verwijder engelse stopwoorden
                    content = ' '.join([word for word in content.split() if word not in ENGLISH_STOP_WORDS])
                        
                    train_data.append((content, class_folder))
            
        # Pak de laatste folder als testset
        test_subfolder = subfolders[-1]
        test_subfolder_path = os.path.join(class_path, test_subfolder)
        for file_name in os.listdir(test_subfolder_path):
            file_path = os.path.join(test_subfolder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                #lowercase alle woorden
                content = file.read().lower() 
                    
                # Verwijder engelsestopwoorden
                content = ' '.join([word for word in content.split() if word not in ENGLISH_STOP_WORDS])

                test_data.append((content, class_folder))
    
    # Zet de gegevens om in een DataFrame
    train_df = pd.DataFrame(train_data, columns=['text', 'label'])
    test_df = pd.DataFrame(test_data, columns=['text', 'label'])

X_train = train_df['text']
y_train = train_df['label']
X_test = test_df['text']
y_test = test_df['label']


EDA

In [15]:
print(train_df.head())
print(test_df.head())
print(train_df.info())
print(test_df.info())
print(train_df.describe())
print(test_df.describe())
print(train_df['label'].value_counts())
print(test_df['label'].value_counts())


                                                text                 label
0  stayed schicago hilton 4 days 3 nights confere...  deceptive_from_MTurk
1  hotel located 1/2 mile train station quite hik...  deceptive_from_MTurk
2  reservation hilton chicago believing going sta...  deceptive_from_MTurk
3  people think hilton, think luxury. know did. w...  deceptive_from_MTurk
4  husband recently stayed stayed hilton chicago ...  deceptive_from_MTurk
                                                text                 label
0  recently stayed hotel allegro chicago wife. bu...  deceptive_from_MTurk
1  recently stayed hotel allegro chicago business...  deceptive_from_MTurk
2  recently visited chicago. stayed hotel allegro...  deceptive_from_MTurk
3  visited hotel allegro chicago vacation daughte...  deceptive_from_MTurk
4  unimpressed quality hotel. overall look place ...  deceptive_from_MTurk
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 2 column

Split train en test variables

--------------------------------------------------------------

Bayes unigram

In [88]:
#{'classifier__alpha': 0.5, 'vectorizer__min_df': 0.005} unigrams
#{'classifier__alpha': 2, 'vectorizer__min_df': 0.005} bigrams
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df = 0.005)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])    
    
#     # Labels voor training en testen
# y_train = train_data['label']
# y_test = test_data['label']
    
    # Initialiseer het Multinomial Naive Bayes model
model_nb1 = MultinomialNB(alpha=0.5)   

    # Train het model
model_nb1.fit(X_train, y_train)
    
    # Voorspel de labels voor de testset
y_pred1 = model_nb1.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred1)
precision = precision_score(y_test, y_pred1, average='weighted')
recall = recall_score(y_test, y_pred1, average='weighted')
f1 = f1_score(y_test, y_pred1, average='weighted')

print("Multinomial_Naive_Bayes ({ngram_type}): ")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Multinomial_Naive_Bayes ({ngram_type}): 
Accuracy: 0.89375
Precision: 0.8938115330520393
Recall: 0.89375
F1 Score: 0.8937458494472441


Bayes bigram

In [89]:
#{'classifier__alpha': 0.5, 'vectorizer__min_df': 0.005} unigrams
#{'classifier__alpha': 2, 'vectorizer__min_df': 0.005} bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df = 0.005)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])    
    
#     # Labels voor training en testen
# y_train = train_data['label']
# y_test = test_data['label']
    
    # Initialiseer het Multinomial Naive Bayes model
model_nb2 = MultinomialNB(alpha=2)   

    # Train het model
model_nb2.fit(X_train, y_train)
    
    # Voorspel de labels voor de testset
y_pred2 = model_nb2.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred2)
precision = precision_score(y_test, y_pred2, average='weighted')
recall = recall_score(y_test, y_pred2, average='weighted')
f1 = f1_score(y_test, y_pred2, average='weighted')

print("Multinomial_Naive_Bayes ({ngram_type}): ")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Multinomial_Naive_Bayes ({ngram_type}): 
Accuracy: 0.90625
Precision: 0.9068220935690816
Recall: 0.90625
F1 Score: 0.906217029424407


logistic regression unigram

In [90]:
#{'classifier__C': 1, 'vectorizer__min_df': 0.01} unigrams
#{'classifier__C': 1, 'vectorizer__min_df': 0.001} bigrams

# Vectoriseer de tekstgegevens
vectorizer = CountVectorizer(ngram_range=(1,1), min_df= 0.01)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

# Initialiseer het Logistic Regression model met Lasso penalty
model_log1 = LogisticRegression(penalty='l1', solver='saga', C= 1)

# Train het model
model_log1.fit(X_train, y_train)

# Voorspel de labels voor de testset
y_pred3 = model_log1.predict(X_test)

# evalueer model
accuracy = accuracy_score(y_test, y_pred3)
precision = precision_score(y_test, y_pred3, average='weighted')  
recall = recall_score(y_test, y_pred3, average='weighted')        
f1 = f1_score(y_test, y_pred3, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)                                     
print("Recall:", recall)                                            
print("F1 Score:", f1) 

Accuracy: 0.84375
Precision: 0.8442340791738381
Recall: 0.84375
F1 Score: 0.8436950490406783




logistic regression bigrams

In [91]:
#{'classifier__C': 1, 'vectorizer__min_df': 0.01} unigrams
#{'classifier__C': 1, 'vectorizer__min_df': 0.001} bigrams

# Vectoriseer de tekstgegevens
vectorizer = CountVectorizer(ngram_range=(1,2), min_df= 0.001)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

# Initialiseer het Logistic Regression model met Lasso penalty
model_log2 = LogisticRegression(penalty='l1', solver='saga', C= 1)

# Train het model
model_log2.fit(X_train, y_train)

# Voorspel de labels voor de testset
y_pred4 = model_log2.predict(X_test)

# evalueer model
accuracy = accuracy_score(y_test, y_pred4)
precision = precision_score(y_test, y_pred4, average='weighted')  
recall = recall_score(y_test, y_pred4, average='weighted')        
f1 = f1_score(y_test, y_pred4, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)                                     
print("Recall:", recall)                                            
print("F1 Score:", f1) 

Accuracy: 0.825
Precision: 0.825814536340852
Recall: 0.825
F1 Score: 0.8248905565978737




classification tree unigrams

In [92]:
#{'classifier__ccp_alpha': 0.01, 'classifier__criterion': 'entropy', 'classifier__min_samples_split': 20, 'vectorizer__min_df': 0.05}
#{'classifier__ccp_alpha': 0.05, 'classifier__criterion': 'gini', 'classifier__min_samples_split': 2, 'vectorizer__min_df': 0.001}
# Vectoriseer de tekstgegevens
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df = 0.05)

X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])   

# Initialiseer het Decision Tree model
model_tree1 = DecisionTreeClassifier(criterion= 'entropy', min_samples_split = 20, ccp_alpha= 0.01)

# Train het model
model_tree1.fit(X_train, y_train)

# Voorspel de labels voor de testset
y_pred5 = model_tree1.predict(X_test)

# evalueer model
accuracy = accuracy_score(y_test, y_pred5)
precision = precision_score(y_test, y_pred5, average='weighted')  
recall = recall_score(y_test, y_pred5, average='weighted')        
f1 = f1_score(y_test, y_pred5, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)                                     
print("Recall:", recall)                                           
print("F1 Score:", f1)

Accuracy: 0.65625
Precision: 0.6582528881152081
Recall: 0.65625
F1 Score: 0.6551589012108625


classification tree bigrams

In [93]:
#{'classifier__ccp_alpha': 0.01, 'classifier__criterion': 'entropy', 'classifier__min_samples_split': 20, 'vectorizer__min_df': 0.05}
#{'classifier__ccp_alpha': 0.05, 'classifier__criterion': 'gini', 'classifier__min_samples_split': 2, 'vectorizer__min_df': 0.001}
# Vectoriseer de tekstgegevens
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df = 0.001)

X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])   

# Initialiseer het Decision Tree model
model_tree2 = DecisionTreeClassifier(criterion= 'gini', min_samples_split = 2, ccp_alpha= 0.05)

# Train het model
model_tree2.fit(X_train, y_train)

# Voorspel de labels voor de testset
y_pred6 = model_tree2.predict(X_test)

# evalueer model
accuracy = accuracy_score(y_test, y_pred6)
precision = precision_score(y_test, y_pred6, average='weighted')  
recall = recall_score(y_test, y_pred6, average='weighted')        
f1 = f1_score(y_test, y_pred6, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)                                     
print("Recall:", recall)                                           
print("F1 Score:", f1)

Accuracy: 0.64375
Precision: 0.6437724644475699
Recall: 0.64375
F1 Score: 0.6437360834407594


--------------------------------------------------------------

random forrest unigram

In [94]:
#{'classifier__criterion': 'entropy', 'classifier__max_features': 'log2', 'classifier__n_estimators': 300, 'vectorizer__min_df': 0.01}
#{'classifier__criterion': 'entropy', 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 300, 'vectorizer__min_df': 0.005}

# Vectoriseer de tekstgegevens
vectorizer = CountVectorizer(ngram_range=(1,1), min_df= 0.01)

X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

# Initialiseer het Random Forest model
model_forrest1 = RandomForestClassifier(oob_score=True,criterion='entropy', max_features= 'log2', n_estimators=300)

# Train het model
model_forrest1.fit(X_train, y_train)

# Voorspel de labels voor de testset
y_pred7 = model_forrest1.predict(X_test)

# evalueer model
accuracy = accuracy_score(y_test, y_pred7)
precision = precision_score(y_test, y_pred7, average='weighted')  
recall = recall_score(y_test, y_pred7, average='weighted')        
f1 = f1_score(y_test, y_pred7, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)                                     
print("Recall:", recall)                                            
print("F1 Score:", f1) 

Accuracy: 0.85625
Precision: 0.8576470588235295
Recall: 0.85625
F1 Score: 0.8561094819159335


random forrest bigrams

In [95]:
#{'classifier__criterion': 'entropy', 'classifier__max_features': 'log2', 'classifier__n_estimators': 300, 'vectorizer__min_df': 0.01}
#{'classifier__criterion': 'entropy', 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 300, 'vectorizer__min_df': 0.005}

# Vectoriseer de tekstgegevens
vectorizer = CountVectorizer(ngram_range=(1,2), min_df= 0.005)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

# Initialiseer het Random Forest model
model_forrest2 = RandomForestClassifier(oob_score=True,criterion='entropy', max_features= 'sqrt', n_estimators=300)

# Train het model
model_forrest2.fit(X_train, y_train)

# Voorspel de labels voor de testset
y_pred8 = model_forrest2.predict(X_test)

# evalueer model
accuracy = accuracy_score(y_test, y_pred8)
precision = precision_score(y_test, y_pred8, average='weighted')  
recall = recall_score(y_test, y_pred8, average='weighted')        
f1 = f1_score(y_test, y_pred8, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)                                     
print("Recall:", recall)                                            
print("F1 Score:", f1) 

Accuracy: 0.8375
Precision: 0.8428571428571429
Recall: 0.8375
F1 Score: 0.836862745098039


--------------------------------------------------------------

Mc Nemar

In [97]:
from statsmodels.stats.contingency_tables import mcnemar

# Perform all model predictions
model_predictions = {
    'Naive Bayes (unigram)': y_pred1,
    'Naive Bayes (bigram)': y_pred2,
    'Logistic Regression (unigram)': y_pred3,
    'Logistic Regression (bigram)': y_pred4,
    'Classification Tree (unigram)': y_pred5,
    'Classification Tree (bigram)': y_pred6,
    'Random Forest (Unigram)': y_pred7,
    'Random Forest (Bigram)': y_pred8,
    
}

def perform_mcnemar_test(y_true, pred1, pred2, model1_name, model2_name):
    # Maak de contingentie tabel
    table = [[0, 0], [0, 0]]
    for i in range(len(y_true)):
        if pred1[i] == y_true[i] and pred2[i] == y_true[i]:
            table[0][0] += 1
        elif pred1[i] == y_true[i] and pred2[i] != y_true[i]:
            table[0][1] += 1
        elif pred1[i] != y_true[i] and pred2[i] == y_true[i]:
            table[1][0] += 1
        else:
            table[1][1] += 1

    # Voer de McNemar test uit
    result = mcnemar(table, exact=True)

    print(f"\nMcNemar's test results for {model1_name} vs {model2_name}")
    print(f"Statistic: {result.statistic:.4f}")
    print(f"P-value: {result.pvalue:.4f}")
    print("Null hypothesis: the two models have the same error rate")
    print(f"Reject null hypothesis: {result.pvalue < 0.05}")


vergelijking 1 linear models

In [100]:
# 1. Vergelijking van Multinomial Naive Bayes en Logistic Regression
perform_mcnemar_test(
    y_test,
    model_predictions['Naive Bayes (bigram)'],
    model_predictions['Logistic Regression (unigram)'],
    'Naive Bayes',
    'Logistic Regression'
)


McNemar's test results for Naive Bayes vs Logistic Regression
Statistic: 6.0000
P-value: 0.0525
Null hypothesis: the two models have the same error rate
Reject null hypothesis: False


is random forrest beter dan linear models?

In [101]:
# 2. Vergelijking van Random Forest met lineaire classifiers
perform_mcnemar_test(
    y_test,
    model_predictions['Random Forest (Unigram)'],
    model_predictions['Naive Bayes (bigram)'],
    'Random Forest',
    'Naive Bayes'
)

perform_mcnemar_test(
    y_test,
    model_predictions['Random Forest (Unigram)'],
    model_predictions['Logistic Regression (unigram)'],
    'Random Forest',
    'Logistic Regression'
)


McNemar's test results for Random Forest vs Naive Bayes
Statistic: 6.0000
P-value: 0.1153
Null hypothesis: the two models have the same error rate
Reject null hypothesis: False

McNemar's test results for Random Forest vs Logistic Regression
Statistic: 11.0000
P-value: 0.8388
Null hypothesis: the two models have the same error rate
Reject null hypothesis: False


does bigram improve unigram?

In [102]:
# 3. Vergelijking van unigrams en bigrams
perform_mcnemar_test(
    y_test,
    model_predictions['Naive Bayes (unigram)'],
    model_predictions['Naive Bayes (bigram)'],
    'Naive Bayes (unigram)',
    'Naive Bayes (bigram)'
)

perform_mcnemar_test(
    y_test,
    model_predictions['Logistic Regression (unigram)'],
    model_predictions['Logistic Regression (bigram)'],
    'Logistic Regression (unigram)',
    'Logistic Regression (bigram)'
)

perform_mcnemar_test(
    y_test,
    model_predictions['Classification Tree (unigram)'],
    model_predictions['Classification Tree (bigram)'],
    'Classification Tree (unigram)',
    'Classification Tree (bigram)'
)

perform_mcnemar_test(
    y_test,
    model_predictions['Random Forest (Unigram)'],
    model_predictions['Random Forest (Bigram)'],
    'Random Forest (Unigram)',
    'Random Forest (Bigram)'
)



McNemar's test results for Naive Bayes (unigram) vs Naive Bayes (bigram)
Statistic: 2.0000
P-value: 0.6875
Null hypothesis: the two models have the same error rate
Reject null hypothesis: False

McNemar's test results for Logistic Regression (unigram) vs Logistic Regression (bigram)
Statistic: 0.0000
P-value: 0.2500
Null hypothesis: the two models have the same error rate
Reject null hypothesis: False

McNemar's test results for Classification Tree (unigram) vs Classification Tree (bigram)
Statistic: 18.0000
P-value: 0.8714
Null hypothesis: the two models have the same error rate
Reject null hypothesis: False

McNemar's test results for Random Forest (Unigram) vs Random Forest (Bigram)
Statistic: 8.0000
P-value: 0.6476
Null hypothesis: the two models have the same error rate
Reject null hypothesis: False
