In [1]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

def dummy_model(X_train, y_train, X_test, y_test):
    # making a baseline model

    # create a dummy classifier
    dummy_clf = DummyClassifier(strategy="most_frequent")
    # fit the classifier
    dummy_clf.fit(X_train, y_train)

    # predict the labels
    y_pred = dummy_clf.predict(X_test)

    # calculate the accuracy
    print(accuracy_score(y_test, y_pred))

    return dummy_clf

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
def logistic_regression(X_train, y_train, X_test, y_test, solver, penalty, c):
    # making a baseline model

    # create a dummy classifier
    dummy_clf = LogisticRegression(max_iter=10000, solver=solver, penalty=penalty, c=c)
    # fit the classifier
    dummy_clf.fit(X_train, y_train)

    # predict the labels
    y_pred = dummy_clf.predict(X_test)

    # calculate the accuracy
    print(f'Accuracy of {solver} with penalty {penalty} with c = {c}: {accuracy_score(y_test, y_pred)}')

    return dummy_clf

In [17]:
def logistic_regression_none(X_train, y_train, X_test, y_test, solver):
    # making a baseline model

    # create a dummy classifier
    dummy_clf = LogisticRegression(max_iter=10000, solver=solver)
    # fit the classifier
    dummy_clf.fit(X_train, y_train)

    # predict the labels
    y_pred = dummy_clf.predict(X_test)

    # calculate the accuracy
    print(f'Accuracy of {solver} without penalty: {accuracy_score(y_test, y_pred)}')

    return dummy_clf

In [4]:
import pandas as pd
import re
def load_data(input, nrows=None, columns=None):
    # load from csv
    df = pd.read_csv(input, nrows=nrows, usecols=columns)
    pattern = re.compile(r'(?:\'|")([a-z0-9"\.+-/_=:`|~©™®▪♦🙂 ]*)(?:\'|"),')
    df['content'] = df['0'].apply(lambda x: pattern.findall(x))
    df.drop(['0'], axis=1, inplace=True)
    return df

In [5]:
from sklearn.model_selection import train_test_split
def split_data(input, answers, test_size=0.1, val_size=0.1):
    # split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(input, answers, test_size=test_size+val_size, random_state=42)
    # split the test data into test and validation
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=val_size/(test_size+val_size), random_state=42)
    return X_train, X_test, y_train, y_test, X_val, y_val

In [6]:
def pad_series(series, maxlen):
    # truncate the series
    series = series.apply(lambda x: x[:maxlen])
    # pad the series
    return series.apply(lambda x: x + ['']*(maxlen-len(x)))

In [12]:
def load_vocab(input):
    vocab = pd.read_pickle('D:/Fake News Project/file.pkl')
    # the vocab is a list of tuples (word, count)
    # filter out words that appear less than 2000 times
    vocab = [word for word, count in vocab if count > 2000]
    return vocab

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
bow_converter = CountVectorizer(vocabulary=load_vocab('D:/Fake News Project/file.pkl'))
def bow_transformer(series):
    # transform the series into bag of words
    return bow_converter.transform(series.apply(lambda x: ' '.join(x)))




In [9]:
#loading data
df = load_data('D:/DS_fake_news/fake_news_cleaned_filtered.csv', nrows=10000)  


In [10]:
types_df =  pd.read_csv('D:/DS_fake_news/fake_news_cleaned.csv', nrows=10000, usecols=['type'])
df['type'] = types_df['type']

In [11]:
# convert types to unique numbers
df['type'] = df['type'].astype('category')
df['type'] = df['type'].cat.codes

In [12]:
bow = bow_transformer(df['content'])

In [13]:
#splitting data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(bow, df['type'], test_size=0.1, val_size=0.1)

In [14]:
#display(X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_val.shape, y_val.shape)


In [15]:
dummy_model(X_train, y_train, X_test, y_test)

0.16


In [15]:
#logistic_regression(X_train, y_train, X_test, y_test, solver, penalty, c)
#logistic_regression_none(X_train, y_train, X_test, y_test, solver):
logistic_regression_none(X_train, y_train, X_test, y_test, 'lbfgs')
logistic_regression(X_train, y_train, X_test, y_test, 'lbfgs', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'lbfgs', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'lbfgs', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'lbfgs', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'liblinear')
logistic_regression(X_train, y_train, X_test, y_test, 'liblinear', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'liblinear', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'liblinear', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'liblinear', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'newton-cg')
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cg', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cg', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cg', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cg', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'newton-cholesky')
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cholesky', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cholesky', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cholesky', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'newton-cholesky', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'sag')
logistic_regression(X_train, y_train, X_test, y_test, 'sag', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'sag', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'sag', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'sag', 'l2', 1)
logistic_regression_none(X_train, y_train, X_test, y_test, 'saga')
logistic_regression(X_train, y_train, X_test, y_test, 'saga', 'l2', 0.001)
logistic_regression(X_train, y_train, X_test, y_test, 'saga', 'l2', 0.01)
logistic_regression(X_train, y_train, X_test, y_test, 'saga', 'l2', 0.1)
logistic_regression(X_train, y_train, X_test, y_test, 'saga', 'l2', 1)

SyntaxError: invalid syntax (2678607093.py, line 4)