In [35]:
import pandas as pd
import nltk
nltk.download('punkt')  # Download necessary models

from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manuelnunezmartinez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
# Load the cleaned article tables, one CSV per outlet group.
center, right, left = map(
    pd.read_csv,
    (
        'clean_data/moderate_outlets.csv',
        'clean_data/right_outlets.csv',
        'clean_data/left_outlets.csv',
    ),
)

In [41]:
# Prepare all dataframes for training and testing: keep only the article
# text and attach an integer class label per outlet group
# (0 = left, 1 = center, 2 = right).
# .copy() detaches each column subset from the frame it was taken from, so
# adding the 'class' column does not raise SettingWithCopyWarning.
center = center[['summary']].copy()
center['class'] = 1

left = left[['summary']].copy()
left['class'] = 0

right = right[['summary']].copy()
right['class'] = 2

In [42]:
# A cell only renders its last bare expression, so show all three previews
# explicitly with display() (provided by the IPython kernel).
display(right.head(), left.head(), center.head())

Unnamed: 0,summary,class
0,WASHINGTON (AP) — President Donald Trump urged...,1
1,WASHINGTON (AP) — The Justice Department says ...,1
2,"NASHUA, N.H. (AP) — Christina Darling finally ...",1
3,Vice President Kamala Harris is expected to ho...,1
4,WASHINGTON (AP) — Senators are racing to seal ...,1


In [43]:
# The function below strips away the first and last few sentences (three of
# each by default) from an article.
def strip_sentences(article, n_leading=3, n_trailing=3, tokenizer=None):
    """Drop the opening and closing sentences of an article.

    Parameters
    ----------
    article : str
        Full text of the article.
    n_leading : int, default 3
        Number of sentences to remove from the start.
    n_trailing : int, default 3
        Number of sentences to remove from the end.
    tokenizer : callable, optional
        Function that splits a string into a sequence of sentences.
        Defaults to NLTK's ``sent_tokenize``.

    Returns
    -------
    str
        The remaining sentences joined by single spaces. This is an empty
        string when the article has no more than ``n_leading + n_trailing``
        sentences.

    Raises
    ------
    ValueError
        If ``n_leading`` or ``n_trailing`` is negative.
    """
    if n_leading < 0 or n_trailing < 0:
        raise ValueError("n_leading and n_trailing must be non-negative")

    # Resolve the default lazily so a caller-supplied tokenizer never
    # requires NLTK to be importable.
    split = sent_tokenize if tokenizer is None else tokenizer
    sentences = list(split(article))

    # Use an explicit end index: a plain `[:-n_trailing]` slice would return
    # nothing at all when n_trailing == 0.
    end = max(len(sentences) - n_trailing, 0)
    return ' '.join(sentences[n_leading:end])

In [44]:
# Remove NaN entries from each data frame: keep only rows whose 'summary'
# value is a str (the later sanity checks look for leftover float values).
# .copy() makes each filtered result an independent frame, so the later
# in-place column assignment does not raise SettingWithCopyWarning.
center = center[center['summary'].apply(type) == str].copy()
left = left[left['summary'].apply(type) == str].copy()
right = right[right['summary'].apply(type) == str].copy()

In [45]:
# Sanity check: select any center rows whose 'summary' is still a float
# (i.e. NaN) after filtering; nothing is removed here.
float_entries_center = center.loc[center['summary'].apply(type) == float]
# Report how many such rows remain (expected: 0).
print("Nan Entries in Center articles:", len(float_entries_center))

Nan Entries in Center articles: 0


In [46]:
# Sanity check: select any right rows whose 'summary' is still a float
# (i.e. NaN) after filtering; nothing is removed here.
float_entries_right = right.loc[right['summary'].apply(type) == float]
# Report how many such rows remain (expected: 0).
print("Nan Entries in Right articles:", len(float_entries_right))

Nan Entries in Right articles: 0


In [47]:
# Sanity check: select any left rows whose 'summary' is still a float
# (i.e. NaN) after filtering; nothing is removed here.
float_entries_left = left.loc[left['summary'].apply(type) == float]
# Report how many such rows remain (expected: 0).
print("Nan Entries in Left articles:", len(float_entries_left))

Nan Entries in Left articles: 0


In [48]:
# Strip the leading and trailing sentences from every article in each frame.
# .assign builds a new frame rather than writing into a filtered slice,
# which avoids SettingWithCopyWarning.
# NOTE: this cell is not idempotent - re-running it strips sentences from the
# already-stripped text. Re-run from the loading cell to start over.
center = center.assign(summary=center['summary'].apply(strip_sentences))
right = right.assign(summary=right['summary'].apply(strip_sentences))
left = left.assign(summary=left['summary'].apply(strip_sentences))

In [49]:
# Stack the three labeled frames (center, then left, then right) into one
# frame with a fresh 0..n-1 index.
joined_df = pd.concat(
    (center, left, right),
    ignore_index=True,
)


In [50]:
# Sanity check on the combined frame: select rows whose 'summary' value is a
# float rather than a string.
float_entries = joined_df.loc[joined_df['summary'].apply(type) == float]

# Print the (rows, columns) shape of that subset; zero rows means none remain.
print(float_entries.shape)

(0, 2)


In [51]:
# Save the three per-outlet-group frames and the combined frame as CSV.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default - confirm downstream loaders expect that before changing it.
for frame, out_path in (
    (center, "model_data/NER_center.csv"),
    (left, "model_data/NER_left.csv"),
    (right, "model_data/NER_right.csv"),
    (joined_df, "model_data/CNN_data.csv"),
):
    frame.to_csv(out_path)