In [1]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

def dummy_model(X_train, y_train, X_test, y_test):
    # making a baseline model

    # create a dummy classifier
    dummy_clf = DummyClassifier(strategy="most_frequent")
    # fit the classifier
    dummy_clf.fit(X_train, y_train)

    # predict the labels
    y_pred = dummy_clf.predict(X_test)

    # calculate the accuracy
    print(accuracy_score(y_test, y_pred))

    return dummy_clf

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
def logistic_regression(X_train, y_train, X_test, y_test, solver, penalty, c):
    # making a baseline model

    # create a dummy classifier
    dummy_clf = LogisticRegression(max_iter=10000, solver=solver, penalty=penalty, c=c)
    # fit the classifier
    dummy_clf.fit(X_train, y_train)

    # predict the labels
    y_pred = dummy_clf.predict(X_test)

    # calculate the accuracy
    print(f'Accuracy of {solver} with penalty {penalty} with c = {c}: {accuracy_score(y_test, y_pred)}')

    return dummy_clf

In [17]:
def logistic_regression_none(X_train, y_train, X_test, y_test, solver):
    # making a baseline model

    # create a dummy classifier
    dummy_clf = LogisticRegression(max_iter=10000, solver=solver)
    # fit the classifier
    dummy_clf.fit(X_train, y_train)

    # predict the labels
    y_pred = dummy_clf.predict(X_test)

    # calculate the accuracy
    print(f'Accuracy of {solver} without penalty: {accuracy_score(y_test, y_pred)}')

    return dummy_clf

In [4]:
import pandas as pd
import re
def load_data(input, nrows=None, columns=None):
    # load from csv
    df = pd.read_csv(input, nrows=nrows, usecols=columns)
    pattern = re.compile(r'(?:\'|")([a-z0-9"\.+-/_=:`|~©™®▪♦🙂 ]*)(?:\'|"),')
    df['content'] = df['0'].apply(lambda x: pattern.findall(x))
    df.drop(['0'], axis=1, inplace=True)
    return df

In [5]:
from sklearn.model_selection import train_test_split
def split_data(input, answers, test_size=0.1, val_size=0.1):
    # split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(input, answers, test_size=test_size+val_size, random_state=42)
    # split the test data into test and validation
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=val_size/(test_size+val_size), random_state=42)
    return X_train, X_test, y_train, y_test, X_val, y_val

In [6]:
def pad_series(series, maxlen):
    # truncate the series
    series = series.apply(lambda x: x[:maxlen])
    # pad the series
    return series.apply(lambda x: x + ['']*(maxlen-len(x)))

In [12]:
def load_vocab(input):
    vocab = pd.read_pickle('D:/Fake News Project/file.pkl')
    # the vocab is a list of tuples (word, count)
    # filter out words that appear less than 2000 times
    vocab = [word for word, count in vocab if count > 2000]
    return vocab

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
bow_converter = CountVectorizer(vocabulary=load_vocab('D:/Fake News Project/file.pkl'))
def bow_transformer(series):
    # transform the series into bag of words
    return bow_converter.transform(series.apply(lambda x: ' '.join(x)))




In [9]:
#loading data
df = load_data('D:/DS_fake_news/fake_news_cleaned_filtered.csv', nrows=10000)  


In [10]:
types_df =  pd.read_csv('D:/DS_fake_news/fake_news_cleaned.csv', nrows=10000, usecols=['type'])
df['type'] = types_df['type']

In [11]:
# convert types to unique numbers
df['type'] = df['type'].astype('category')
df['type'] = df['type'].cat.codes

In [12]:
bow = bow_transformer(df['content'])

In [13]:
#splitting data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(bow, df['type'], test_size=0.1, val_size=0.1)

In [14]:
#display(X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_val.shape, y_val.shape)


In [15]:
dummy_model(X_train, y_train, X_test, y_test)

0.16


In [15]:
#logistic_regression(X_train, y_train, X_test, y_test, solver, penalty, c)
#logistic_regression_none(X_train, y_train, X_test, y_test, solver):
logistic_regression_none(X_train, y_train, X_test, y_test, 'lbfgs')
logistic_regression(X_train, y_train, X_test, y_test, 'lbfgs', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'lbfgs', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'lbfgs', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'lbfgs', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'liblinear')
logistic_regression(X_train, y_train, X_test, y_test, 'liblinear', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'liblinear', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'liblinear', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'liblinear', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'newton-cg')
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cg', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cg', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cg', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cg', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'newton-cholesky')
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cholesky', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cholesky', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cholesky', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cholesky', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'sag')
logistic_regression(X_train, y_train, X_test, y_test, 'sag', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'sag', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'sag', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'sag', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'saga')
logistic_regression(X_train, y_train, X_test, y_test, 'saga', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'saga', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'saga', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'saga', 'l2', 1)

SyntaxError: invalid syntax (2678607093.py, line 4)

In [None]:
## some preprocessing scripts:
# first lets run clean_text on the 'content' column
from cleantext import clean
def clean_text(s):
    return clean(s,lower=True,                     # lowercase text
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_numbers=True,               # replace all numbers with a special token
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_number="<NUM>",
        lang="en"                   
    )


# split the data in chunks and run in parallel
from joblib import Parallel, delayed
from os import cpu_count

# run in parallel
def run_parallel(df, n_jobs, func):
    # call every element in the chunks in parallel
    results = Parallel(n_jobs=n_jobs)(delayed(func)(element) for element in df)
    return results


# clean the text
def clean_column(df):
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, clean_text)
    # replace column with cleaned text
    return results


from nltk.tokenize import word_tokenize
# tokenize the text. run in parallel
def tokenize_column(df):
    # run the function on the data
    n_jobs = cpu_count()
    results = run_parallel(df['content'], n_jobs, word_tokenize)
    
    return results

from nltk.corpus import stopwords
# removing generic stopwords
def remove_stopwords(df):
    stop_words = set(stopwords.words('english'))

    # remove stopwords from the text
    def r(s):
        return [w for w in s if not w in stop_words]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, r)

    return results

# %%
# stemming the text
from nltk.stem import PorterStemmer
def stem_column(df):
    # create a stemmer
    ps = PorterStemmer()

    # stem the text
    def stem(s):
        return [ps.stem(w) for w in s]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, stem)
    return results

# %%
# remove punctiuation
import string
def remove_punctuation(df):
    # remove punctuation
    def remove_punct(s):
        return [w for w in s if w not in string.punctuation]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, remove_punct)
    return results

In [None]:
# load in the LIAR dataset
import pandas as pd
df = pd.read_csv('E:/ML/DS_fake_news/train.tsv', sep='\t', header=None, usecols=[1,2])
liar_labels = {
    'false': 1,
    'pants-fire': 1,
    'barely-true': 1,
    'mostly-true': 0,
    'true': 0,
    'half-true': 0
}
df[1] = df[1].map(liar_labels)

In [None]:
# rename the columns
df.columns = ['type', 'content']

In [None]:
df['content'] = clean_column(df['content'])
df['content'] = tokenize_column(df)
df['content'] = remove_stopwords(df['content'])
df['content'] = stem_column(df['content'])
df['content'] = remove_punctuation(df['content'])
