Libraries:

In [94]:
import re
import numpy as np
import pandas as pd
import nltk
import csv
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hjalt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hjalt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hjalt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Loading csv using pandas

Part 1, Task 1:

In [95]:
df = pd.read_csv('news_sample.csv')

In [96]:
def clean_text(df, column):
    """
    clean_text() takes a dataframe as input. It lowercases everything
    followed by applying reg-ex patterns to the dataframe. \n
    The reg-ex patterns purpose is to match specific patterns in the dataframe that we wish to replace. \n
    All urls, emails, dates, numbers and timestamps are replaced by: <URL>, <EMAIL>, <DATE>, <TIME> and <NUM>
    """
    
    df = df.apply(lambda x: x.str.lower() if isinstance(x, str) == 'object' else x) #lowercase everything

    #reg-ex patterns
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    numbers_pattern = re.compile(r'[0-9]')
    days = r'([0-3]?\d)'
    months = r'(0?[1-9]|11|12|10)'
    years = r'((19|20)\d\d)'
    delimiter = r'([\/\-\._])?' 
    timestamp = r'([^:][0-2]?\d):([0-5]?\d):([0-5]?\d).([\d]*)'
    dates = f"""
            # YYYY-MM-DD
            ({years}{delimiter}{months}{delimiter}{days})
            |
            # YYYY-DD-MM
            ({years}{delimiter}{days}{delimiter}{months})
            |
            # DD-MM-YYYY
            ({days}{delimiter}{months}{delimiter}{years})
            |
            # MM-DD-YYYY
            ({months}{delimiter}{days}{delimiter}{years})
            """

    date_pattern = re.compile(dates, re.IGNORECASE | re.VERBOSE | re.UNICODE)

    #applying reg-ex patterns
    df[column] = df[column].apply(lambda x: re.sub(url_pattern, '<URL>', str(x))) 
    df[column] = df[column].apply(lambda x: re.sub(email_pattern, '<EMAIL>', str(x))) 
    df[column] = df[column].apply(lambda x: re.sub(date_pattern, '<DATE>', str(x))) 
    df[column] = df[column].apply(lambda x: re.sub(timestamp, '<TIME>', str(x))) 
    df[column] = df[column].apply(lambda x: re.sub(numbers_pattern, '<NUM>', str(x)))

    #removing multiple white spaces, tabs, or new lines
    df = df.applymap(lambda x: re.sub('\s+',' ', str(x)))

    return df

In [97]:
def count_unique_words_in_list(df, column):
    unique_words = set() 
    for string in df[column]:
        words = string.split()
        unique_words.update(words) 
    return len(unique_words)

def count_unique_words_in_list_guesser(df, column):
    unique_words = Counter()
    for string in df[column]:
        words = string.split()
        unique_words.update(words) 
    return unique_words

def count_uniques_after_clean(df, column):
    unique_words = Counter()
    for words in df[column]:
        for word in words:
            if isinstance(word, str) and word.isalpha():
                unique_words.update([word])
    return unique_words

Tokenizing, removing stopwords and stemming functions:

In [98]:
def tokenize_df(df):
    filtered_text = ''.join([char for char in str(df) if char.isalpha() or char.isspace() or char == '<' or char == '>'])
    return filtered_text.split()

In [99]:
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [word for word in sentence if word not in stop_words]
    return filtered_sentence

In [100]:
def stem_(sentence):
    lemmatizer = WordNetLemmatizer()
    stemmed_sentence = [lemmatizer.lemmatize(word) for word in sentence]
    return stemmed_sentence

In [101]:
def pre_process(df, column, tokeniz = False, stopw = False, stemming = False):
    if tokeniz:
        df[column] = df[column].apply(tokenize_df)
    if stopw:
        df[column] = df[column].apply(lambda x: remove_stopwords(x) if isinstance(x, list) else x)
    if stemming:
        df[column] = df[column].apply(lambda x: stem_(x) if isinstance(x, list) else x)
    return df


In [102]:
df_clean = clean_text(df, 'content')
words_uncl = count_unique_words_in_list(df_clean, "content")

In [103]:
df_clean = pre_process(df_clean, "content", tokeniz=True, stopw=True)
words_clean = len(count_uniques_after_clean(df_clean, "content"))

In [104]:
df_clean = pre_process(df_clean, "content", stemming=True)  
words_stemming = len(count_uniques_after_clean(df_clean, "content"))

In [105]:
reduction_stopwords = (1 - words_clean/words_uncl)*100
reduction_stemming  = (1 - words_stemming/words_clean)*100
print("Vocabulary before any preprocessing were:", words_uncl, "unique words")
print("Vocabulary when removing stopwords were:", words_clean, "unique words")
print("The reduction rate is", round(reduction_stopwords,2))
print("Vocabulary when stemming the text were:", words_stemming, "unique words")
print("Reduction rate is", round(reduction_stemming,2))

Vocabulary before any preprocessing were: 28968 unique words
Vocabulary when removing stopwords were: 19441 unique words
The reduction rate is 32.89
Vocabulary when stemming the text were: 17817 unique words
Reduction rate is 8.35


Part 1, Task 2 and Task 3:

In [106]:
def group_labels_995(dataframe):
    replace_fake = re.compile(r'(fake)|(junksci)')
    replace_reliable = re.compile(r'(reliable)|(political)')
    replace_omitted = re.compile(r'(hate)|(conspiracy)|(satire)|(rumor)|(2018-02-10 13:43:39.521661)|(unreliable)|(nan)|(unknown)|(bias)|(clickbait)|(type)')
    dataframe['type'] = dataframe['type'].apply(lambda x: re.sub(replace_fake, 'fake', str(x)))
    dataframe['type'] = dataframe['type'].apply(lambda x: re.sub(replace_reliable, 'reliable', str(x)))
    dataframe['type'] = dataframe['type'].apply(lambda x: re.sub(replace_omitted, 'omitted', str(x)))
    remove_omitted = dataframe[(dataframe['type'] == 'omitted')].index
    dataframe.drop(remove_omitted, inplace=True)
    return dataframe

In [107]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def process_data_in_chunks(file_path, chunk_size=10000):
    # Accumulate chunks into a list
    # Read the data in chunks
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Process the chunk as needed
        processed_chunk = clean_text(chunk, 'content')
        processed_chunk = pre_process(processed_chunk, "content", tokeniz= True, stopw= True, stemming=True)
        chunks = []
        chunks.append(processed_chunk)
        df = pd.concat(chunks, ignore_index=True)
        df.to_csv('995_cleaned_preprocessed.csv', mode='a', index= False)
        del processed_chunk
        del chunk
        del chunks
    return

In [108]:
process_data_in_chunks("995,000_rows.csv")

In [109]:
df_995_cleaned_preprocessed_explore = pd.read_csv('995_cleaned_preprocessed.csv')

def avg_type_length(df, type):
    n_type = 0
    type_length = 0
    for i in range(len(df)):
        if df["type"].iloc[i] == type:
            type_length += len(df["content"].iloc[i])
            n_type += 1
    avg_length = type_length / n_type if n_type != 0 else 0
    return avg_length

type_list = df_995_cleaned_preprocessed_explore["type"].unique()

print("Before grouping labels:")
for word in type_list:
    print(word, " length : ", avg_type_length(df_995_cleaned_preprocessed_explore, word))

df_995_cleaned_preprocessed_explore = group_labels_995(df_995_cleaned_preprocessed_explore)

print("\nAfter grouping labels have been applied:")

type_list = df_995_cleaned_preprocessed_explore["type"].unique()
for word in type_list:
    print(word, " length : ", avg_type_length(df_995_cleaned_preprocessed_explore, word))

Before grouping labels:
political  length :  3361.3220781624323
fake  length :  2660.054031635251
satire  length :  1574.5808510638299
reliable  length :  3057.8518191467947
conspiracy  length :  2347.6910927513
unreliable  length :  2232.731178535039
bias  length :  2833.724638225051
rumor  length :  1776.0850208167242
unknown  length :  4356.339252078835
nan  length :  0
clickbait  length :  2401.6675543557567
hate  length :  4541.02335117895
junksci  length :  3218.29245014245
type  length :  7.0
2018-02-10 13:43:39.521661  length :  80.0

After grouping labels have been applied:
reliable  length :  3200.7542642865096
fake  length :  2725.9594275287373


In [110]:
df_995_cleaned_preprocessed = pd.read_csv('995_cleaned_preprocessed.csv')

In [111]:
df_995_cleaned_preprocessed['type'].unique()

array(['political', 'fake', 'satire', 'reliable', 'conspiracy',
       'unreliable', 'bias', 'rumor', 'unknown', nan, 'clickbait', 'hate',
       'junksci', 'type', '2018-02-10 13:43:39.521661'], dtype=object)

Part 1, Task 4:

In [112]:
df_995_cleaned_preprocessed = group_labels_995(df_995_cleaned_preprocessed)

min_count = min(df_995_cleaned_preprocessed['type'].value_counts())

df_995_cleaned_preprocessed_balanced_simple = pd.concat([
    df_995_cleaned_preprocessed[df_995_cleaned_preprocessed['type'] == 'fake'].sample(n=min_count, random_state=1),
    df_995_cleaned_preprocessed[df_995_cleaned_preprocessed['type'] == 'reliable'].sample(n=min_count, random_state=1)
])

df_995_cleaned_preprocessed_balanced_simple = pre_process(df_995_cleaned_preprocessed_balanced_simple, "content")
df_995_cleaned_preprocessed_balanced_simple["wordcount"] = df_995_cleaned_preprocessed_balanced_simple["content"].apply(len)

X_simple = df_995_cleaned_preprocessed_balanced_simple[["wordcount"]]

labels_simple = []
for i in df_995_cleaned_preprocessed_balanced_simple['type']:
    if i == 'fake':
        labels_simple.append(1)
    else:
        labels_simple.append(0)
y_simple = labels_simple

X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(X_simple, y_simple, test_size=0.2, random_state=42)

Part 2, Task 1:

In [113]:
def count_uniques(df, column):
    unique_words = Counter()
    for words in df[column]:
        for word in words:
            if isinstance(word, str) and word.isalpha() and len(word) > 4:
                unique_words.update([word])
    return unique_words

In [114]:
def simple_guesser(df, column, type, liar=False):
    
    if liar == True:
        df_fake = df[df[type] == 'fake']
        df_reliable = df[df[type] == 'reliable']
        xyz = count_uniques_after_clean(df_fake, column)
        zyx = count_uniques_after_clean(df_reliable, column)
        top_50_fake = xyz.most_common(200)
        top_50_reliable= zyx.most_common(200)
    
        fake_50 = [str_[0] for str_ in top_50_fake]
        reliable_50 = [str_[0] for str_ in top_50_reliable]
        df["fakeCount"] = df[column].apply(lambda x: sum(word in fake_50 for word in x))
        df["reliableCount"] = df[column].apply(lambda x: sum(word in reliable_50 for word in x))
        df["simple guess"] = np.where(df["fakeCount"] > df["reliableCount"], "fake", "reliable")
        count_fake = (df[type] == "fake").sum()
        counter = ((df[type] == "fake") & (df["simple guess"] == "fake")).sum()
        accuracy = counter / count_fake
        return accuracy
        
    else:
        df_fake = df[df['type'] == 'fake']
        df_reliable = df[df['type'] == 'reliable']
        xyz = count_unique_words_in_list_guesser(df_fake, column)
        zyx = count_unique_words_in_list_guesser(df_reliable, column)
        top_50_fake = xyz.most_common(200)
        top_50_reliable= zyx.most_common(200)

        fake_50 = [str_[0] for str_ in top_50_fake]
        reliable_50 = [str_[0] for str_ in top_50_reliable]
        df["fakeCount"] = df[column].apply(lambda x: sum(word in fake_50 for word in x.split()))
        df["reliableCount"] = df[column].apply(lambda x: sum(word in reliable_50 for word in x.split()))
        df["simple guess"] = np.where(df["fakeCount"] > df["reliableCount"], "fake", "reliable")
        count_fake = (df["type"] == "fake").sum()
        counter = ((df["type"] == "fake") & (df["simple guess"] == "fake")).sum()
        accuracy = counter / count_fake
        return accuracy

In [115]:
model_simple = LogisticRegression()
model_simple.fit(X_train_simple, y_train_simple)
y_pred_simple = model_simple.predict(X_test_simple)
accuracy_simple = accuracy_score(y_test_simple, y_pred_simple)
print(accuracy_simple)

0.5326045827202018


In [116]:
df_995_cleaned_preprocessed_simple = pd.read_csv('995_cleaned_preprocessed.csv', nrows = 10000)

In [117]:
df_995_cleaned_preprocessed_simple = group_labels_995(df_995_cleaned_preprocessed_simple)

In [118]:
print(simple_guesser(df_995_cleaned_preprocessed_simple, "content", "type"))

0.7611749680715197


Part 2, Task 3:

In [119]:
df_995_cleaned_preprocessed_bbc = pd.read_csv('995_cleaned_preprocessed.csv', low_memory=False)

In [120]:
bbc_articles = pd.read_csv('bbc_articles.csv')

In [121]:
df_bbc = pd.DataFrame({'content': bbc_articles.Text})

In [122]:
df_995_cleaned_preprocessed_bbc = pd.concat([df_995_cleaned_preprocessed_bbc, df_bbc], ignore_index=True)
df_995_cleaned_preprocessed_bbc.loc[len(df_995_cleaned_preprocessed_bbc)-len(df_bbc):, 'type'] = 'reliable'

In [123]:
df_995_cleaned_preprocessed_bbc = group_labels_995(df_995_cleaned_preprocessed_bbc)

min_count_bbc = min(df_995_cleaned_preprocessed_bbc['type'].value_counts())

df_995_cleaned_preprocessed_balanced_simple_bbc = pd.concat([
    df_995_cleaned_preprocessed_bbc[df_995_cleaned_preprocessed_bbc['type'] == 'fake'].sample(n=min_count_bbc, random_state=1),
    df_995_cleaned_preprocessed_bbc[df_995_cleaned_preprocessed_bbc['type'] == 'reliable'].sample(n=min_count_bbc, random_state=1)
])

df_995_cleaned_preprocessed_balanced_simple_bbc = pre_process(df_995_cleaned_preprocessed_balanced_simple_bbc, "content")
df_995_cleaned_preprocessed_balanced_simple_bbc["wordcount"] = df_995_cleaned_preprocessed_balanced_simple_bbc["content"].apply(len)

X_simple_bbc = df_995_cleaned_preprocessed_balanced_simple_bbc[["wordcount"]]

labels_simple_bbc = []
for i in df_995_cleaned_preprocessed_balanced_simple_bbc['type']:
    if i == 'fake':
        labels_simple_bbc.append(1)
    else:
        labels_simple_bbc.append(0)
y_simple_bbc = labels_simple_bbc

X_train_simple_bbc, X_test_simple_bbc, y_train_simple_bbc, y_test_simple_bbc = train_test_split(X_simple_bbc, y_simple_bbc, test_size=0.2, random_state=42)

In [124]:
model_simple_bbc = LogisticRegression()
model_simple_bbc.fit(X_train_simple_bbc, y_train_simple_bbc)
y_pred_simple_bbc = model_simple_bbc.predict(X_test_simple_bbc)
accuracy_simple_bbc = accuracy_score(y_test_simple_bbc, y_pred_simple_bbc)
print(accuracy_simple_bbc)

0.5321000630649569


Part 3: Advanced Model

In [125]:
n_fake = df_995_cleaned_preprocessed[df_995_cleaned_preprocessed['type'] == "fake"].shape[0]
reliable_sampled = df_995_cleaned_preprocessed[df_995_cleaned_preprocessed['type'] == "reliable"].sample(n=n_fake)

balanced_df_advanced = pd.concat([df_995_cleaned_preprocessed[df_995_cleaned_preprocessed['type'] == "fake"], reliable_sampled])

tfidf_vectorizer = TfidfVectorizer() 
X_advanced = tfidf_vectorizer.fit_transform(balanced_df_advanced["content"])
feature_names = tfidf_vectorizer.get_feature_names_out()

advanced_labels = []
for i in balanced_df_advanced['type']:
    if i == 'fake':
        advanced_labels.append(1)
    else:
        advanced_labels.append(0)
y_advanced = advanced_labels


X_train_advanced, X_test_advanced, y_train_advanced, y_test_advanced = train_test_split(X_advanced, y_advanced, test_size=0.2, random_state=0)
X_val_advanced, X_test_advanced, y_val_advanced, y_test_advanced = train_test_split(X_test_advanced, y_test_advanced, test_size=0.5, random_state=0)

In [126]:
logReg_pipe = Pipeline([
    ('logistic', LogisticRegression(max_iter=10000))
])
logReg_parameters = [
    
    {'logistic__penalty': ['l2'],
    'logistic__C': [0.1, 1, 10],
    'logistic__solver': 'lbfgs'},

    {'logistic__penalty': ['l1', 'l2'],
    'logistic__C': [0.1, 1, 10],
    'logistic__solver': 'liblinear'},                 
    
    {'logistic__penalty': ['l2'],
    'logistic__C': [0.1, 1, 10],
    'logistic__solver': 'newton-cg'}, 

    {'logistic__penalty': ['l2'],
    'logistic__C': [0.1, 1, 10],
    'logistic__solver': 'sag'}
    
    ]
logReg_best_accuracy = 0
logReg_best_hyperparameter = ''
for param in logReg_parameters:
    for penalt in param['logistic__penalty']:
        if penalt == 'none':
            reg = logReg_pipe.set_params(logistic__penalty=penalt, logistic__solver=param['logistic__solver']).fit(X_train_advanced, y_train_advanced)
            val_pred = reg.predict(X_val_advanced)
            val_acc = accuracy_score(y_val_advanced, val_pred)
            if val_acc > logReg_best_accuracy:
                logReg_best_accuracy = val_acc
                logReg_best_hyperparameter = f"penalty: {penalt}, C: default, solver: {param} "
        else:
            for C_ in param['logistic__C']:
                reg = logReg_pipe.set_params(logistic__solver=param['logistic__solver'], logistic__penalty=penalt, logistic__C=C_).fit(X_train_advanced, y_train_advanced)
                val_pred = reg.predict(X_val_advanced)
                val_acc = accuracy_score(y_val_advanced, val_pred)
                if val_acc > logReg_best_accuracy:
                    logReg_best_accuracy = val_acc
                    logReg_best_hyperparameter = f"penalty: {penalt}, C: {C_}, solver: {param['logistic__solver']}"
print("Best parameters logReg:", logReg_best_hyperparameter)
print("Best accuracy logReg:", logReg_best_accuracy)

Best parameters logReg: penalty: l1, C: 1, solver: liblinear
Best accuracy logReg: 0.9129283161656506


In [127]:
nb_pipe = Pipeline([
    ('bernoulli_nb', BernoulliNB())
])

nb_parameters = [
    {'bernoulli_nb__alpha': [0.1, 1.0, 10.0], 'bernoulli_nb__binarize': [0.0, 0.5, 1.0]},
    {'bernoulli_nb__alpha': [0.1, 1.0, 10.0], 'bernoulli_nb__binarize': [None]}
]

nb_best_accuracy = 0
nb_best_hyperparameter = ''

for param in nb_parameters:
    for alpha_ in param['bernoulli_nb__alpha']:
        for binarize_ in param['bernoulli_nb__binarize']:
            reg = nb_pipe.set_params(bernoulli_nb__alpha=alpha_, bernoulli_nb__binarize=binarize_).fit(X_train_advanced, y_train_advanced)
            val_pred = reg.predict(X_val_advanced)
            val_acc = accuracy_score(y_val_advanced, val_pred)
            if val_acc > nb_best_accuracy:
                nb_best_accuracy = val_acc
                nb_best_hyperparameter = f"alpha: {alpha_}, binarize: {binarize_}"

print("Best parameters NB:", nb_best_hyperparameter)
print("Best accuracy NB:", nb_best_accuracy)

Best parameters NB: alpha: 0.1, binarize: None
Best accuracy NB: 0.8462896783687198


Part 4, Task 1:

In [128]:
best_LogisticRegression_model = LogisticRegression(penalty= 'l1', C = 1, solver = 'liblinear').fit(X_train_advanced, y_train_advanced)
best_LogisticRegression_predict = best_LogisticRegression_model.predict(X_test_advanced)
best_LogisticRegression_accuracy = accuracy_score(y_test_advanced, best_LogisticRegression_predict)
print(best_LogisticRegression_accuracy)

0.9126760563380282


In [129]:
best_BernoulliNB_model = BernoulliNB(alpha = 0.1, binarize=None).fit(X_train_advanced, y_train_advanced)
best_BernoulliNB_predict = best_BernoulliNB_model.predict(X_test_advanced)
best_BernoulliNB_accuracy = accuracy_score(y_test_advanced, best_BernoulliNB_predict)
print(best_BernoulliNB_accuracy)

0.8444397729661551


Part 4, Task2 and Task 3:

In [130]:
def group_labels_liar(dataframe):
    replace_fake = re.compile(r'(false)|(barely-true)|(pants-fire)')
    replace_reliable = re.compile(r'(true)|(mostly-true)')
    replace_omitted = re.compile(r'(half-true)|(half-reliable)')
    dataframe[1] = dataframe[1].apply(lambda x: re.sub(replace_fake, 'fake', str(x)))
    dataframe[1] = dataframe[1].apply(lambda x: re.sub(replace_reliable, 'reliable', str(x)))
    dataframe[1] = dataframe[1].apply(lambda x: re.sub(replace_omitted, 'omitted', str(x)))
    remove_omitted = dataframe[(dataframe[1] == 'omitted')].index
    dataframe.drop(remove_omitted, inplace=True)
    return dataframe

In [131]:
def calc_labels(dataframe, column):
    labels = []
    for i in dataframe[column]:
        if i == 'fake':
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [132]:
df_liar_simplev1 = pd.read_csv("test.tsv", sep="\t", header=None)

In [133]:
df_liar_simplev1 = clean_text(df_liar_simplev1, 2)
df_liar_simplev1 = group_labels_liar(df_liar_simplev1)
df_liar_simplev1 = pre_process(df_liar_simplev1, 2, tokeniz=True, stopw=True, stemming=True)

Simple

In [134]:
print("simple model v1:", simple_guesser(df_liar_simplev1, 2, 1, liar=True))

simple model v1: 0.49547920433996384


In [135]:
df_liar_simplev2 = pd.read_csv("test.tsv", sep="\t", header=None)
df_liar_simplev2 = clean_text(df_liar_simplev2, 2)
df_liar_simplev2 = group_labels_liar(df_liar_simplev2)
df_liar_simplev2 = pre_process(df_liar_simplev2, 2, tokeniz=True, stopw=True, stemming=True)
liar_labels_simplev2 = calc_labels(df_liar_simplev2, 1)

df_liar_simplev2["wordcount"] = df_liar_simplev2[2].apply(len)

liar_X_test_simplev2 = df_liar_simplev2[["wordcount"]]

simple_model_liarv2 = model_simple.predict(liar_X_test_simplev2)
accuracy_simple_liarv2 = accuracy_score(liar_labels_simplev2, simple_model_liarv2)
print("simple model v2:", accuracy_simple_liarv2)

simple model v2: 0.5518962075848304


Advanced

In [136]:
df_liar_advanced = pd.read_csv("test.tsv", sep="\t", header=None)

In [137]:
df_liar_advanced = clean_text(df_liar_advanced, 2)
df_liar_advanced = group_labels_liar(df_liar_advanced)
liar_labels_advanced = calc_labels(df_liar_advanced, 1)

In [138]:
liar_X_test_advanced = tfidf_vectorizer.transform(df_liar_advanced[2])

In [139]:
logistic_liar_y_pred = best_LogisticRegression_model.predict(liar_X_test_advanced)
logistic_liar_accuracy = accuracy_score(liar_labels_advanced, logistic_liar_y_pred)
print(logistic_liar_accuracy)

0.4540918163672655


In [140]:
NB_liar_y_pred = best_BernoulliNB_model.predict(liar_X_test_advanced)
NB_liar_accuracy = accuracy_score(liar_labels_advanced, NB_liar_y_pred)
print(NB_liar_accuracy)

0.5499001996007984
