In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the CSV data into a DataFrame
df = pd.read_csv("/Users/kynanami/Documents/Winter 2024/MATH 509/dataset/Data/News_Final.csv", header=0)

# Display the first few rows of the DataFrame to verify it's loaded correctly
print(df.head())

    IDLink                                              Title  \
0  99248.0   Obama Lays Wreath at Arlington National Cemetery   
1  10423.0        A Look at the Health of the Chinese Economy   
2  18828.0   Nouriel Roubini: Global Economy Not Back to 2008   
3  27788.0                          Finland GDP Expands In Q4   
4  27789.0  Tourism, govt spending buoys Thai economy in J...   

                                            Headline  \
0  Obama Lays Wreath at Arlington National Cemete...   
1  Tim Haywood, investment director business-unit...   
2  Nouriel Roubini, NYU professor and chairman at...   
3  Finland's economy expanded marginally in the t...   
4  Tourism and public spending continued to boost...   

                                     Source    Topic          PublishDate  \
0                                 USA TODAY    obama  2002-04-02 00:00:00   
1                                 Bloomberg  economy  2008-09-20 00:00:00   
2                                 Bloombe

In [4]:
#Handel missing data
# Check for missing values in each column
missing_data = df.isnull().sum()

# Print the number of missing values in each column
print(missing_data[missing_data > 0])

#clean data
df = df.dropna()
df.shape

Headline     15
Source      279
dtype: int64


(92945, 11)

In [5]:
from langdetect import detect

In [9]:
# Removing duplicates from the "Headline" column.
df = df.drop_duplicates(subset=['Headline'])
df.shape

(86480, 12)

In [10]:
# Removing purely numeric values from the "Headline" column.
df = df[~df['Headline'].str.isnumeric()]

# Pattern to match the specific numeric formats in headlines
pattern = r'\d+\.[A-Z]{2,3}&gt;|\b\d+\.\d+\b|\d+am|\d+pm|\d+s|\d{2,}'


# Using str.replace to remove the matched patterns with an empty string, but only modify the 'Headline' column
df['Headline'] = df['Headline'].str.replace(pattern, '', regex=True)

# Save the filtered DataFrame to a CSV file
df.to_csv('no_numeric_headlines.csv', index=False)

# Display the shape of the DataFrame to confirm no columns were removed
print(df.shape)

(86480, 12)


In [11]:
import unicodedata

# Function to remove accents
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return "".join([c for c in nfkd_form if not unicodedata.combining(c)])

# Apply the function to the 'Headline' column
df['Clean_Headline'] = df['Headline'].apply(remove_accents)

# Display the original and cleaned headlines
print(df[['Headline', 'Clean_Headline']])

                                                Headline  \
0      Obama Lays Wreath at Arlington National Cemete...   
1      Tim Haywood, investment director business-unit...   
2      Nouriel Roubini, NYU professor and chairman at...   
3      Finland's economy expanded marginally in the t...   
4      Tourism and public spending continued to boost...   
...                                                  ...   
93234  The June employment report is viewed as a cruc...   
93235  In addition, establish stimulating economic po...   
93236  The Palestinian government spends nearly $ mil...   
93237  Palestine Youth Orchestra prepares for first U...   
93238  Goldstein, the proprietor of the TG Travel Gro...   

                                          Clean_Headline  
0      Obama Lays Wreath at Arlington National Cemete...  
1      Tim Haywood, investment director business-unit...  
2      Nouriel Roubini, NYU professor and chairman at...  
3      Finland's economy expanded marginall

In [12]:
from spellchecker import SpellChecker

# Initialize the spell checker
spell = SpellChecker()

# Function to remove unrecognized words
def remove_typos(text):
    # Split the text into words
    words = text.split()
    # Check each word against the spell checker
    valid_words = [word for word in words if word in spell or len(spell.unknown([word])) == 0]
    # Join and return the valid words as a new string
    return ' '.join(valid_words)

# Apply the function to the 'Headline' column
df['Cleaned_Headline'] = df['Clean_Headline'].apply(remove_typos)

# Display the DataFrame to verify changes
print(df[['Clean_Headline', 'Cleaned_Headline']])

                                          Clean_Headline  \
0      Obama Lays Wreath at Arlington National Cemete...   
1      Tim Haywood, investment director business-unit...   
2      Nouriel Roubini, NYU professor and chairman at...   
3      Finland's economy expanded marginally in the t...   
4      Tourism and public spending continued to boost...   
...                                                  ...   
93234  The June employment report is viewed as a cruc...   
93235  In addition, establish stimulating economic po...   
93236  The Palestinian government spends nearly $ mil...   
93237  Palestine Youth Orchestra prepares for first U...   
93238  Goldstein, the proprietor of the TG Travel Gro...   

                                        Cleaned_Headline  
0      Lays Wreath at National President has laid a w...  
1      investment director head for fixed income at d...  
2      professor and chairman at Global explains why ...  
3      economy expanded marginally in the t

In [13]:
import nltk
nltk.download('words')
from nltk.corpus import words

word_list = set(words.words())

def remove_non_dictionary_words(text):
    return ' '.join(word for word in text.split() if word in word_list)

# Assuming 'text' is a string containing your text data
df['Cleaned_Headline'] = df['Cleaned_Headline'].apply(remove_non_dictionary_words)

[nltk_data] Downloading package words to /Users/kynanami/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [14]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re


# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/kynanami/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kynanami/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kynanami/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from langdetect import detect_langs
from langdetect import detect_langs, LangDetectException

# Detecting the language of each headline
def detect_language_safe(text):
    try:
        return detect_langs(text)
    except LangDetectException:
        return ['unknown']

sample_headlines = df['Cleaned_Headline'].astype(str)
detected_languages = sample_headlines.apply(lambda x: detect_language_safe(x))
detected_languages

0        [en:0.9999992118190077]
1        [en:0.9999979793666663]
2        [en:0.9999974964674774]
3        [en:0.9999989029837117]
4        [en:0.9999965625426681]
                  ...           
93234    [en:0.9999974227318642]
93235    [en:0.9999951959686906]
93236    [en:0.9999977005383116]
93237    [en:0.9999970397597799]
93238    [en:0.9999985071438429]
Name: Cleaned_Headline, Length: 86480, dtype: object

In [16]:
def detect_language(text):
    try:
        languages = detect_langs(text)
        return languages[0].lang
    except LangDetectException:
        return None

# Detect language in batches
batch_size = 1000
languages = []

for i in range(0, df.shape[0], batch_size):
    batch = df['Cleaned_Headline'][i:i+batch_size].astype(str)
    batch_languages = batch.apply(detect_language)
    languages.extend(batch_languages)

# Adding detected languages as a new column to the DataFrame
df['Language'] = languages

# Filtering out non-English headlines
df_filtered = df[df['Language'] == 'en']


print(f"Original number of headlines: {df.shape[0]}")
print(f"Number of headlines with only English Language headlines: {df_filtered.shape[0]}")



Original number of headlines: 86480
Number of headlines with only English Language headlines: 85839


In [17]:
# Save the filtered DataFrame to a CSV file
df_filtered.to_csv('filtered_headlines.csv', index=False)

In [18]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import re
import pandas as pd

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/kynanami/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kynanami/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kynanami/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
#Preprocess the Text Data
def preprocess_text(text):
    # Lowercasing
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s\+\¬\†]', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Stemmer
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(token) for token in lemmatized_text]

    # Re-join tokens into a string
    return ' '.join(stemmed_words)

In [20]:
# Apply the preprocessing function to each headline
df_filtered['Processed_Headline'] = df_filtered['Cleaned_Headline'].apply(preprocess_text)

# Display the processed headlines
print(df_filtered[['Headline', 'Processed_Headline']])

                                                Headline  \
0      Obama Lays Wreath at Arlington National Cemete...   
1      Tim Haywood, investment director business-unit...   
2      Nouriel Roubini, NYU professor and chairman at...   
3      Finland's economy expanded marginally in the t...   
4      Tourism and public spending continued to boost...   
...                                                  ...   
93234  The June employment report is viewed as a cruc...   
93235  In addition, establish stimulating economic po...   
93236  The Palestinian government spends nearly $ mil...   
93237  Palestine Youth Orchestra prepares for first U...   
93238  Goldstein, the proprietor of the TG Travel Gro...   

                                      Processed_Headline  
0                                      laid wreath honor  
1         invest director head fix incom beig book state  
2                 professor chairman global economi face  
3      economi expand margin three end prev

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Processed_Headline'] = df_filtered['Cleaned_Headline'].apply(preprocess_text)


### Document-Term Matrix (DTM): Transform the preprocessed headlines into a DTM, which quantifies the headlines by the occurrence of words.

In [21]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/kynanami/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kynanami/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kynanami/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
#
dtm = vectorizer.fit_transform(df_filtered['Processed_Headline'])


dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())
# dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names())

dtm_df

Unnamed: 0,aback,abandon,abat,abbey,abhor,abid,abil,abject,abl,ablaz,...,zeal,zero,zigzag,zinc,zing,zip,zloti,zombi,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85834,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85836,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=4)
ldamodel = lda.fit(dtm)

In [31]:
tf_feature_names = vectorizer.get_feature_names_out()

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(ldamodel, tf_feature_names, no_top_words)

Topic 0:
economi econom said growth global percent govern quarter year grow
Topic 1:
first presid visit administr meet secur state peopl last time
Topic 2:
new compani today would work said use one make get
Topic 3:
economi econom state said presidenti two one would market last


In [38]:
topic_dist = ldamodel.transform(dtm)

85839

In [39]:
classified_topics = [None]*len(topic_dist)
for i in range(0, len(topic_dist)-1):
    classified_topics[i] = pd.Series(topic_dist[i]).idxmax()

In [41]:
print(classified_topics.count(0), classified_topics.count(1), classified_topics.count(2), classified_topics.count(3))

23290 19686 25139 17723


### PCA

In [22]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
from scipy import sparse
dtm_sparse = sparse.csr_matrix(dtm_df)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardizing the features
scaler = StandardScaler(with_mean=False)
dtm_scaled = scaler.fit_transform(dtm_sparse)

In [23]:
import numpy as np
index = np.arange(85919)
batches = np.split(np.arange(90000), 9)
batches[-1] = np.arange(80000, 85907)

In [27]:
from sklearn.decomposition import IncrementalPCA
from sklearn import preprocessing

# Initialize IncrementalPCA with default parameters
ipca = IncrementalPCA()

# Assuming you have a batches function defined properly
for batch in batches:
  dtm_batch = dtm_df.iloc[batch]
  dtm_batch = preprocessing.scale(dtm_batch)
  ipca.partial_fit(dtm_batch)

In [38]:
exp_var = ipca.explained_variance_ratio_.cumsum()
len(exp_var)

9605

In [1]:
import matplotlib.pyplot as plt
plt.plot(exp_var)

NameError: name 'exp_var' is not defined

In [50]:
threshold = 0.8
pc_reduced = ipca.components_[:,:np.argmax(exp_var>threshold)]
pc_reduced.shape
np.savetxt("principal_components.csv", pc_reduced, delimiter=",")

In [None]:
# #Perform PCA directly

# pca = PCA().fit(dtm_df)
# cumulative_variance_ratio = pca.explained_variance_ratio_.cumsum()

# # Find the # of components that account for a 95% cumulative variance
# n_components = len(cumulative_variance_ratio[cumulative_variance_ratio <= 0.95]) + 1

# n_components

In [None]:
# # Initialize PCA, choose the number of components
# pca = PCA(n_components=#)

# # Fit PCA on the DTM or standardized DTM
# dtm_pca = pca.fit_transform(dtm_df)

# dtm_pca

In [None]:
# # Randomized PCA
# from sklearn.decomposition import PCA

# pca = PCA(n_components=100, svd_solver='randomized')

# dtm_pca = pca.fit(dtm_df)

# dtm_pca