## Importing the packages

In [1]:
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

  from pandas.core import (


## Loading the Data

In [2]:
# Folder that holds the dataset. Defaults to the original local path; set the
# NLP_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
directory = os.environ.get("NLP_DATA_DIR", "D:/2_nlp")

# Name of the raw workbook
file_name_250_MB = "Last_25_years_250_MB_dataset.xlsx"

# Construct the full path to the workbook
file_path_250_MB = os.path.join(directory, file_name_250_MB)

# Read the .xlsx file into a DataFrame
last_25_years_250_MB_df = pd.read_excel(file_path_250_MB)

# Display the first few rows of the DataFrame
last_25_years_250_MB_df.head()

Unnamed: 0,article_id,newspaper_name,edition,date,page,headline,byline,article
0,1_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,"Dark Names Lockmon ,\njansen Giont Coaches",,SAN FRANCISCO. Nov. 10\n(AP).-Alvin Dark made ...
1,2_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,FALONEY FIGURES IN SWAP\n\n\nEtcheverry Deal R...,,head last Saturday when Mon\ntreal lost in q c...
2,3_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,Home Ice Helps\nChicago to Gain\nLead in Leoque,BY the Associated Press,There's just no place like\nhome as far as the...
3,4_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,Chamberlain\nFires In 4o\nAgainst LA,b the Associated rfess,Wilt Chamberlain keeps on\nscoring baskets and...
4,5_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,,,raped a home run the first\ntime he faced a ma...


In [3]:
# Make sure the NLTK data packages needed by the preparation step are available
# locally: 'stopwords' backs nltk.corpus.stopwords and 'wordnet' backs the
# WordNetLemmatizer imported above.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Joel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Joel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Preparation

In [4]:
def prepare_data(df):
    """
    Prepares the data for topic modeling by concatenating headline and article,
    cleaning and preprocessing the text.

    The input DataFrame is not modified; all changes are made on a copy.

    Parameters:
    - df: DataFrame with the news articles and metadata. Must contain the
      'headline' and 'article' columns.

    Returns:
    - A new DataFrame with the input columns ('headline' and 'article' have
      missing values replaced by empty strings) plus two added columns:
      'text_for_analysis' (headline + " " + article) and 'prepared_text'
      (punctuation stripped, lowercased, stopwords removed, lemmatized).
    """

    # Work on a copy so the caller's frame is left untouched and re-running
    # the cell always starts from the same input.
    df = df.copy()

    # Fill NaN values with empty strings, then force str so that a cell Excel
    # parsed as a number cannot break the concatenation or the regex below.
    df['headline'] = df['headline'].fillna('').astype(str)
    df['article'] = df['article'].fillna('').astype(str)

    # Concatenate 'headline' and 'article'
    df['text_for_analysis'] = df['headline'] + " " + df['article']

    # Build the row-independent helpers once instead of once per article:
    # loading the stopword list and creating a lemmatizer for every row is
    # pure overhead on a large corpus.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    non_word_chars = re.compile(r'[^\w\s]')

    def preprocess_text(text):
        """Clean one document and return it as a space-joined token string."""
        # Remove special characters and punctuation
        text = non_word_chars.sub('', text)
        # Convert to lowercase
        text = text.lower()
        # Tokenization (splitting on whitespace)
        words = text.split()
        # Remove stopwords and lemmatize the remaining tokens
        processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        return ' '.join(processed_words)

    # Apply the preprocessing function to the concatenated text
    df['prepared_text'] = df['text_for_analysis'].apply(preprocess_text)

    return df

# Run the preparation step on the loaded articles; the following cells work with the returned frame
prepared_last_25_years_df = prepare_data(last_25_years_250_MB_df)

In [5]:
# Preview the first rows to check the added 'text_for_analysis' and 'prepared_text' columns
prepared_last_25_years_df.head()

Unnamed: 0,article_id,newspaper_name,edition,date,page,headline,byline,article,text_for_analysis,prepared_text
0,1_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,"Dark Names Lockmon ,\njansen Giont Coaches",,SAN FRANCISCO. Nov. 10\n(AP).-Alvin Dark made ...,"Dark Names Lockmon ,\njansen Giont Coaches SAN...",dark name lockmon jansen giont coach san franc...
1,2_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,FALONEY FIGURES IN SWAP\n\n\nEtcheverry Deal R...,,head last Saturday when Mon\ntreal lost in q c...,FALONEY FIGURES IN SWAP\n\n\nEtcheverry Deal R...,faloney figure swap etcheverry deal rock conod...
2,3_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,Home Ice Helps\nChicago to Gain\nLead in Leoque,BY the Associated Press,There's just no place like\nhome as far as the...,Home Ice Helps\nChicago to Gain\nLead in Leoqu...,home ice help chicago gain lead leoque there p...
3,4_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,Chamberlain\nFires In 4o\nAgainst LA,b the Associated rfess,Wilt Chamberlain keeps on\nscoring baskets and...,Chamberlain\nFires In 4o\nAgainst LA Wilt Cham...,chamberlain fire 4o la wilt chamberlain keep s...
4,5_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,,,raped a home run the first\ntime he faced a ma...,raped a home run the first\ntime he faced a m...,raped home run first time faced major league p...


In [6]:
# Parse the 'date' column into datetimes, then derive the calendar parts from it
prepared_last_25_years_df['date'] = pd.to_datetime(prepared_last_25_years_df['date'])
for date_part in ('year', 'month', 'day'):
    prepared_last_25_years_df[date_part] = getattr(prepared_last_25_years_df['date'].dt, date_part)

# Coverage check: for every year from 1960 down to 1936, print how many
# distinct months have at least one article
for year in range(1960, 1935, -1):
    print(prepared_last_25_years_df.loc[prepared_last_25_years_df['year'] == year, 'month'].nunique())

12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12


In [8]:
# Output folder. Defaults to the original local path; set the NLP_DATA_DIR
# environment variable to write somewhere else without editing this cell.
directory = os.environ.get("NLP_DATA_DIR", "D:/2_nlp")
file_name = "Prepared_last_25_years_250_MB_dataset.xlsx"

full_path = os.path.join(directory, file_name)

# Make sure the output folder exists so the write below cannot fail on a missing directory
os.makedirs(directory, exist_ok=True)

# Save the prepared DataFrame as an .xlsx file (index=False leaves the row index out of the sheet)
prepared_last_25_years_df.to_excel(full_path, index=False)

print(f"File saved successfully at {full_path}")

File saved successfully at D:/2_nlp\Prepared_last_25_years_250_MB_dataset.xlsx
