In [41]:
#Import Necessary Libraries
import pandas as pd
import numpy as np
import nltk
#nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [42]:
# Load the raw article dataset from the CSV export (relative path)
DATASET_PATH = 'raw_dataset - Sheet1.csv'
df = pd.read_csv(DATASET_PATH)

In [43]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output
df

Unnamed: 0,index,title,author,text,publisher,datepublished,link
0,1,The Marcoses' three-body problem,John Nery,The success of the Marcos-Duterte electoral al...,Rappler,4/1/2024,https://www.rappler.com/voices/thought-leaders...
1,2,State of the opposition: Worth dying for,John Nery,The war on Ninoy Aquino’s legacy is a fight th...,Rappler,8/30/2022,https://www.rappler.com/voices/thought-leaders...
2,3,Elon Musk and magical thinking,John Nery,One of many lessons from the ongoing Twitter f...,Rappler,11/25/2022,https://www.rappler.com/voices/thought-leaders...
3,4,So what's wrong with Marcos's F1 party?,John Nery,Of course we all need to be particular about t...,Rappler,10/06/2022,https://www.rappler.com/voices/thought-leaders...
4,5,Patricia Evangelista and writing the war,John Nery,"In 'Some People Need Killing,' the acclaimed j...",Rappler,10/18/2023,https://www.rappler.com/life-and-style/literat...
5,6,What do we do about Alice Guo,John Nery,The answers she gave in two Senate hearings we...,Rappler,05/29/2024,https://www.rappler.com/voices/thought-leaders...
6,7,The media is not the press,John Nery,Confusing one with the other allows disinforma...,Rappler,04/13/2024,https://www.rappler.com/voices/thought-leaders...
7,8,The paradoxical pope,John Nery,The complicated but lasting legacy of Benedict...,Rappler,01/04/2023,https://www.rappler.com/voices/thought-leaders...
8,9,What's next for VP Sara: Go Macapagal or go Ar...,John Nery,Now that she has passed the point of no return...,Rappler,05/21/2024,https://www.rappler.com/voices/thought-leaders...
9,10,Time to negotiate peace again with the CPP-NPA,John Nery,The communist insurgency is at its weakest. Co...,Rappler,05/11/2024,https://www.rappler.com/voices/thought-leaders...


# Data Preprocessing

In [55]:
def preprocess_text(text):
    """Normalise one raw text sample before feature extraction.

    Parameters
    ----------
    text : str or missing value
        The raw text of one sample. ``None`` and float NaN (what pandas
        yields for an empty CSV cell) are treated as "no text".

    Returns
    -------
    str
        The lower-cased text, or an empty string when the value is missing.
    """
    # A missing cell arrives as None/NaN, which has no .lower(); map it to an
    # empty string so a single blank row does not abort the whole .apply().
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # str() is a no-op for strings and makes stray non-string cells
    # (e.g. a numeric value) safe to lower-case.
    return str(text).lower()

# Feature extraction function

# Characters counted as punctuation by extract_features. Besides the ASCII
# marks, the typographic variants of the apostrophe, double quote and dash are
# included so that text using curly quotes / en-em dashes is counted the same
# way as text using their plain ASCII equivalents.
_PUNCTUATION_MARKS = frozenset(
    '.,:;?!-\'"'
    '\u2018\u2019'  # left / right single quotation mark (curly apostrophe)
    '\u201c\u201d'  # left / right double quotation mark
    '\u2013\u2014'  # en dash, em dash
)


def extract_features(text):
    """Compute simple stylometric features for one text sample.

    Parameters
    ----------
    text : str
        The text to analyse. Tokens are compared as-is when counting the
        vocabulary, so pass lower-cased text for a case-insensitive count.

    Returns
    -------
    dict
        ``word_count``          number of purely alphabetic tokens
        ``sentence_count``      number of sentences from ``sent_tokenize``
        ``avg_word_length``     mean length of the alphabetic tokens
        ``avg_sentence_length`` alphabetic tokens per sentence
        ``punctuation_count``   characters of ``text`` that are punctuation marks
        ``punctuation_ratio``   punctuation marks per alphabetic token
        ``vocab_size``          number of distinct alphabetic tokens
        ``vocab_ratio``         distinct alphabetic tokens / alphabetic tokens
        Every ratio/average is 0 when its denominator is 0.
    """
    # Tokenize words and sentences
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    # Only purely alphabetic tokens count as words; punctuation and numeric
    # tokens stay in the text but are excluded here. Filter once and reuse.
    alpha_words = [word for word in words if word.isalpha()]
    word_count = len(alpha_words)
    sentence_count = len(sentences)

    # Phraseology features
    avg_word_length = np.mean([len(word) for word in alpha_words]) if word_count > 0 else 0
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0

    # Punctuation features: counted per character of the raw text, not per
    # token, so the result does not depend on how the tokenizer splits marks.
    punctuation_count = sum(1 for char in text if char in _PUNCTUATION_MARKS)
    punctuation_ratio = punctuation_count / word_count if word_count > 0 else 0

    # Linguistic diversity features
    vocab_size = len(set(alpha_words))
    vocab_ratio = vocab_size / word_count if word_count > 0 else 0

    return {
        'word_count': word_count,
        'sentence_count': sentence_count,
        'avg_word_length': avg_word_length,
        'avg_sentence_length': avg_sentence_length,
        'punctuation_count': punctuation_count,
        'punctuation_ratio': punctuation_ratio,
        'vocab_size': vocab_size,
        'vocab_ratio': vocab_ratio
    }

In [56]:
# Apply preprocessing and feature extraction to each text sample
df['preprocessed_text'] = df['text'].apply(preprocess_text)

# Build the feature table directly from the per-row feature dicts, aligned to
# df's index; integer counts are kept as integer columns.
df_features = pd.DataFrame(
    df['preprocessed_text'].map(extract_features).tolist(),
    index=df.index,
)

# Drop feature columns left by an earlier run of this cell so that re-running
# it replaces them instead of appending duplicate columns.
df = pd.concat(
    [df.drop(columns=df_features.columns, errors='ignore'), df_features],
    axis=1,
)

In [58]:
# Columns produced by the feature-extraction step
feature_columns = [
    'word_count', 'sentence_count', 'avg_word_length', 'avg_sentence_length',
    'punctuation_count', 'punctuation_ratio', 'vocab_size', 'vocab_ratio',
]

# Pull the raw text, the preprocessed text and the features for the row labelled 0
original_text = df.loc[0, 'text']
preprocessed_text = df.loc[0, 'preprocessed_text']
features_row_1 = df.loc[0, feature_columns]

# Print each heading followed by its value so the three can be compared
for heading, content in (
    ("Original Text:", original_text),
    ("\nPreprocessed Text:", preprocessed_text),
    ("\nExtracted Features:", features_row_1.to_dict()),
):
    print(heading)
    print(content)

Original Text:
The success of the Marcos-Duterte electoral alliance in 2022 was so massive it could be said that the political equivalent of Newton’s law of inertia applied to it. A body moving at constant speed in a straight line will continue moving at constant speed in a straight line unless a force acted upon it. The alliance – if all went well, if the different parts moved at the same speed and kept to the same line – had a lock on the 2028 presidential election, which in our part of the universe determines other political possibilities. And then a force acted upon it. Now the alliance has descended into a war of the dynasties, between the Marcoses and the Dutertes, and an opening is starting to form, not necessarily for the political opposition but perhaps for a political third force led by someone like the populist, and popular, Senator Raffy Tulfo.But the reality of politics has always been messy; perhaps Newton’s laws of motion cannot really explain it. For some analysts, the 