In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Load the CSV data into a DataFrame
df = pd.read_csv('/content/gdrive/MyDrive/MATH 509 Project/dataset/Data/News_Final.csv')

# Display the first few rows of the DataFrame to verify it's loaded correctly
print(df.head())

    IDLink                                              Title  \
0  99248.0   Obama Lays Wreath at Arlington National Cemetery   
1  10423.0        A Look at the Health of the Chinese Economy   
2  18828.0   Nouriel Roubini: Global Economy Not Back to 2008   
3  27788.0                          Finland GDP Expands In Q4   
4  27789.0  Tourism, govt spending buoys Thai economy in J...   

                                            Headline  \
0  Obama Lays Wreath at Arlington National Cemete...   
1  Tim Haywood, investment director business-unit...   
2  Nouriel Roubini, NYU professor and chairman at...   
3  Finland's economy expanded marginally in the t...   
4  Tourism and public spending continued to boost...   

                                     Source    Topic          PublishDate  \
0                                 USA TODAY    obama  2002-04-02 00:00:00   
1                                 Bloomberg  economy  2008-09-20 00:00:00   
2                                 Bloombe

In [4]:
#Handel missing data
# Check for missing values in each column
missing_data = df.isnull().sum()

# Print the number of missing values in each column
print(missing_data[missing_data > 0])
df = df.dropna()

Headline     15
Source      279
dtype: int64


####Text Preprocessing: Lemmatize the article headlines to bring words to their base or root form. Additionally, remove stopwords, punctuation, and perform lowercasing to standardize the text data for analysis.

In [23]:
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
#Preprocess the Text Data
def preprocess_text(text):
    # Lowercasing
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Stemmer
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(token) for token in lemmatized_text]

    # Re-join tokens into a string
    return ' '.join(stemmed_words)


In [25]:
# Apply the preprocessing function to each headline
df['Processed_Headline'] = df['Headline'].apply(preprocess_text)

# Display the processed headlines
print(df[['Headline', 'Processed_Headline']])

                                                Headline  \
0      Obama Lays Wreath at Arlington National Cemete...   
1      Tim Haywood, investment director business-unit...   
2      Nouriel Roubini, NYU professor and chairman at...   
3      Finland's economy expanded marginally in the t...   
4      Tourism and public spending continued to boost...   
...                                                  ...   
93234  The June employment report is viewed as a cruc...   
93235  In addition, establish stimulating economic po...   
93236  The Palestinian government spends nearly $140 ...   
93237  Palestine Youth Orchestra prepares for first U...   
93238  Goldstein, the proprietor of the TG Travel Gro...   

                                      Processed_Headline  
0      obama lay wreath arlington nation cemeteri pre...  
1      tim haywood invest director busi unit head fix...  
2      nouriel roubini nyu professor chairman roubini...  
3      finland economi expand margin three 

####Document-Term Matrix (DTM): Transform the preprocessed headlines into a DTM, which quantifies the headlines by the occurrence of words.

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

In [27]:
dtm = vectorizer.fit_transform(df['Processed_Headline'])

In [28]:
dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())

In [29]:
dtm_df

Unnamed: 0,00,000,000001,00001,000333,0025,003,0031,005930,00am,...,âtoday,äìhave,échappé,école,éducat,élus,était,été,événement,être
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92942,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#!pip install scikit-learn matplotlib




In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
pca = PCA().fit(dtm_df)
cumulative_variance_ratio = pca.explained_variance_ratio_.cumsum()

n_components = len(cumulative_variance_ratio[cumulative_variance_ratio <= 0.95]) + 1


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardizing the features
scaler = StandardScaler()
dtm_scaled = scaler.fit_transform(dtm_df)


In [None]:
# Initialize PCA
pca = PCA(n_components=#)

# Fit PCA on the DTM or standardized DTM
dtm_pca = pca.fit_transform(dtm_df)  # or dtm_scaled if you've standardized