In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

**Preprocessing Data**

In [140]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [141]:
df = pd.read_json('datasets/news.json').drop('source',axis=1)

In [142]:
df.head()

Unnamed: 0,article,orientation
0,Health authorities in one state have issued an...,western_conservative
1,\n'Kennedy Saves the World' podcast host Kenne...,western_conservative
2,\nFormer counterterrorism analyst Jonathan Sch...,western_conservative
3,\nFox News Flash top headlines are here. Check...,western_conservative
4,\nCrowe is charged with harassment and stalkin...,western_conservative


In [143]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1164 entries, 0 to 1163
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   article      1164 non-null   object
 1   orientation  1164 non-null   object
dtypes: object(2)
memory usage: 18.3+ KB


In [144]:
df['orientation'].unique()

array(['western_conservative', 'non_western', 'western_progressive'],
      dtype=object)

In [145]:
# Turn categories into numbers

# Define the categorical features
categorical_features = ['orientation']

# Initialize the OneHotEncoder
one_hot = OneHotEncoder()

# Initialize the ColumnTransformer
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')

# Apply the transformation to your dataframe
df_transformed = transformer.fit_transform(df)
df_transformed[:3]

array([[0.0, 1.0, 0.0,
        'Health authorities in one state have issued an urgent alert for residents who visited a Costco, DFO, businesses and caught trams after two measles cases were infectious while in public.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nVictorian residents have been put on alert after two holidaymakers returning from overseas were unknowingly infectious with measles while out in the community.\nThe Department of Health revealed the new cases on Saturday afternoon, which brings the total measles cases to three after another traveller was identified this week.\nAt least 10 exposure sites have been listed, with the days ranging between Wednesday January 17 and Wednesday January 24, on the department\'s website.\nWant more news? Stream Sky News Australia’s live channel here. \nWednesday January 17 \n6am to 3pm: Bay City Auto Group (and associated construction site) 14 Dandenong Road West, Frankston\n7:30pm to 9pm: Box Hill Action Indoor Sports 9 Clarice Road, Box Hill

In [146]:
# Turning it into
data = pd.DataFrame(df_transformed)
data.columns = ['western_conservative','non_western','western_progressive','article']

# Removing \n and pre-word-embedding cleaning
char = '\\'
data['article'] = data['article'].str.replace('"','')
data['article'] = data['article'].str.replace("'","")
data['article'] = data['article'].str.replace(',','')
data['article'] = data['article'].str.replace('.','')
data['article'] = data['article'].str.lower()
data['article'] = data['article'].str.replace('\n','')
data['article'] = data['article'].str.replace(char,'')
data['article'] = data['article'].str.replace('/','')
data['article'] = data['article'].str.replace('—','')
data['article'] = data['article'].str.replace('_','')
data['article'] = data['article'].str.replace('’','')
data['article'] = data['article'].str.replace('-','')
data['article'] = data['article'].str.replace('@','')
data['article'] = data['article'].str.replace('–','')
data['article'] = data['article'].str.replace('‘','')
data['article'] = data['article'].str.replace('…','')
data['article'] = data['article'].str.replace('”','')
data['article'] = data['article'].str.replace('“','')
data['article'] = data['article'].str.replace(':','')
data['article'] = data['article'].str.replace('!','')
data['article'] = data['article'].str.replace('?','')
data['article'] = data['article'].str.replace('^','')
data['article'] = data['article'].str.replace('<','')
data.head()

Unnamed: 0,western_conservative,non_western,western_progressive,article
0,0.0,1.0,0.0,health authorities in one state have issued an...
1,0.0,1.0,0.0,kennedy saves the world podcast host kennedy a...
2,0.0,1.0,0.0,former counterterrorism analyst jonathan schan...
3,0.0,1.0,0.0,fox news flash top headlines are here check ou...
4,0.0,1.0,0.0,crowe is charged with harassment and stalking ...


In [147]:
# Removing stopwrods for word-embedding
from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [148]:
data['article'] = data['article'].apply(remove_stopwords)

In [149]:
data.head()

Unnamed: 0,western_conservative,non_western,western_progressive,article
0,0.0,1.0,0.0,health authorities one state issued urgent ale...
1,0.0,1.0,0.0,kennedy saves world podcast host kennedy fox n...
2,0.0,1.0,0.0,former counterterrorism analyst jonathan schan...
3,0.0,1.0,0.0,fox news flash top headlines check whats click...
4,0.0,1.0,0.0,crowe charged harassment stalking related acti...


In [150]:
# Saving cleaned data
data.to_csv('news_clean.csv',index=False)

***Word-Embedding***

In [163]:
data = pd.read_csv('datasets/news_clean.csv').dropna()
data.shape

(1162, 4)

In [164]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# tokenizing text data
tokenized_data = [simple_preprocess(article) for article in data['article']]

# training Word2Vec model
# using recommended parameters
word2vec_model = Word2Vec(sentences=tokenized_data, vector_size=100, window=5, min_count=1, workers=4)


# retrieving word vectors for each token in the article
word_vectors = []
for tokens in tokenized_data:
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if vectors:
        article_vector = sum(vectors) / len(vectors)  # average the word vectors to get one vector per article
        word_vectors.append(article_vector)
    else:
        word_vectors.append(None)  # handle case where all tokens are out-of-vocabulary

# converting word_vectors to pandas series
word_vectors_series = pd.Series(word_vectors, name='word_embeddings')

# adding  the word vectors as a new column in your DataFrame
data['word_embeddings'] = word_vectors_series

In [165]:
data.head()

Unnamed: 0,western_conservative,non_western,western_progressive,article,word_embeddings
0,0.0,1.0,0.0,health authorities one state issued urgent ale...,"[-0.5961671, 0.14073516, -0.049852327, -0.4266..."
1,0.0,1.0,0.0,kennedy saves world podcast host kennedy fox n...,"[-0.89580506, 0.21152776, -0.11414346, -0.5553..."
2,0.0,1.0,0.0,former counterterrorism analyst jonathan schan...,"[-0.70675737, 0.26028955, -0.012638905, -0.601..."
3,0.0,1.0,0.0,fox news flash top headlines check whats click...,"[-1.009354, 0.24867468, -0.29608953, -0.782901..."
4,0.0,1.0,0.0,crowe charged harassment stalking related acti...,"[-0.7127712, 0.23845907, -0.058488887, -0.5192..."


In [133]:
# Right now the arrays are Series containing whole strings
# Converting to lists with floats:
import ast

def convert_str(x):
    
    neu = ast.literal_eval(x)
    
    return neu

data = data.dropna(axis=0)

data['word_embeddings'] = data['word_embeddings'].apply(convert_str)

In [None]:
type(data['word_embeddings'][0])
# List

In [123]:
type(data['word_embeddings'][0][0])
# Float

float

In [166]:
# Saving the word-embedded dataframe
data.to_csv('news_embedded.csv',index=False)

_____________________
**Model Fitting and Evaluation**

The problem is a Text Classification Problem. 

For this kind of problem suitable models could be:

- Naive Bayes

- Support Vector Machines (SVM)

- Random Forest or Gradient Boosting Machines

- Neural Networks: deep learning models like Convolutional Neural Networks (CNNs) or Recurrent Neural Networks (RNNs).


However, since at the time of making this model the dataset has a limited number of samples, I will be focusing on the Naive Bayes model.

In [145]:
data = pd.read_csv('datasets/news_embedded.csv')

In [154]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# Concatenating the label columns into a single label column
data['label'] = data[['western_conservative', 'non_western', 'western_progressive']].idxmax(axis=1)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['article'], data['label'], test_size=0.2, random_state=42)

# Vectorizing the text data
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create the model and fitting it to the data
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

# Making predictions
y_pred = nb_classifier.predict(X_test_vectorized)

In [155]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9267241379310345


In [165]:
# Getting predicted probabilities
proba = nb_classifier.predict_proba(X_test_vectorized)

# Creating a DataFrame to display results
results_data = pd.DataFrame({'Article': X_test, 'Predicted Label': y_pred})
for i, label in enumerate(nb_classifier.classes_):
    results_data[label + ' Probability'] = proba[:, i]

# Printing the results
print(results_data.iloc[90])

Article                             former president donald trump seeking sweeping...
Predicted Label                                                   western_progressive
non_western Probability                                                           0.0
western_conservative Probability                                                  0.0
western_progressive Probability                                                   1.0
Name: 865, dtype: object
