# Text analytics - Group Assignment

In [43]:
# Import libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

In [44]:
# Load the news headlines dataset
news_df = pd.read_csv(r'C:\Users\User\Desktop\Semester 7\text analysis\group assignment\amazon_news_info_new.csv')

news_df.head()

Unnamed: 0,title,updated_time
0,Microsoft offers cloud customers AMD alternati...,5/17/2024
1,UK's CMA rejects probe into Microsoft-Mistral ...,5/17/2024
2,Shoprite's Checkers extends on-demand delivery...,5/17/2024
3,CoreWeave raises $7.5 billion in debt deal led...,5/17/2024
4,TV companies flaunt ad tech and AI to persuade...,5/17/2024


# Data Preprocessing: Preprocess the dataset by cleaning and transforming the textual data.

In [45]:
#change updated time column to Date column and title column to headline column
news_df.rename(columns={'updated_time': 'Date', 'title': 'Headline'}, inplace=True)
news_df.head()

Unnamed: 0,Headline,Date
0,Microsoft offers cloud customers AMD alternati...,5/17/2024
1,UK's CMA rejects probe into Microsoft-Mistral ...,5/17/2024
2,Shoprite's Checkers extends on-demand delivery...,5/17/2024
3,CoreWeave raises $7.5 billion in debt deal led...,5/17/2024
4,TV companies flaunt ad tech and AI to persuade...,5/17/2024


# Text normalization (lowercasing, removing punctuation)

In [46]:
#lowecasing the data
news_df['Headline'] = news_df['Headline'].str.lower()
news_df.head()

Unnamed: 0,Headline,Date
0,microsoft offers cloud customers amd alternati...,5/17/2024
1,uk's cma rejects probe into microsoft-mistral ...,5/17/2024
2,shoprite's checkers extends on-demand delivery...,5/17/2024
3,coreweave raises $7.5 billion in debt deal led...,5/17/2024
4,tv companies flaunt ad tech and ai to persuade...,5/17/2024


In [47]:
#remove punctuation
import re  

news_df['Headline'] = news_df['Headline'].apply(lambda x: re.sub(r'[^\w\s\.\-]', '', x))

news_df.head()

Unnamed: 0,Headline,Date
0,microsoft offers cloud customers amd alternati...,5/17/2024
1,uks cma rejects probe into microsoft-mistral a...,5/17/2024
2,shoprites checkers extends on-demand delivery ...,5/17/2024
3,coreweave raises 7.5 billion in debt deal led ...,5/17/2024
4,tv companies flaunt ad tech and ai to persuade...,5/17/2024


# Tokenization

In [48]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [49]:
# Tokenize the text
news_df['Headline'] = news_df['Headline'].apply(word_tokenize)
news_df.head()

Unnamed: 0,Headline,Date
0,"[microsoft, offers, cloud, customers, amd, alt...",5/17/2024
1,"[uks, cma, rejects, probe, into, microsoft-mis...",5/17/2024
2,"[shoprites, checkers, extends, on-demand, deli...",5/17/2024
3,"[coreweave, raises, 7.5, billion, in, debt, de...",5/17/2024
4,"[tv, companies, flaunt, ad, tech, and, ai, to,...",5/17/2024


# Removing stopwords

In [50]:
from nltk.corpus import stopwords

# Ensure NLTK resources are available
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [51]:
# Define the stop words set
stop_words = set(stopwords.words('english'))

# Remove stop words
news_df['Headline'] = news_df['Headline'].apply(lambda x: [word for word in x if word not in stop_words])

news_df.head()

Unnamed: 0,Headline,Date
0,"[microsoft, offers, cloud, customers, amd, alt...",5/17/2024
1,"[uks, cma, rejects, probe, microsoft-mistral, ...",5/17/2024
2,"[shoprites, checkers, extends, on-demand, deli...",5/17/2024
3,"[coreweave, raises, 7.5, billion, debt, deal, ...",5/17/2024
4,"[tv, companies, flaunt, ad, tech, ai, persuade...",5/17/2024


# lemmatization

In [52]:
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are available
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [53]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Apply lemmatization
news_df['Headline'] = news_df['Headline'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
news_df

Unnamed: 0,Headline,Date
0,"[microsoft, offer, cloud, customer, amd, alter...",5/17/2024
1,"[uk, cma, reject, probe, microsoft-mistral, ai...",5/17/2024
2,"[shoprites, checker, extends, on-demand, deliv...",5/17/2024
3,"[coreweave, raise, 7.5, billion, debt, deal, l...",5/17/2024
4,"[tv, company, flaunt, ad, tech, ai, persuade, ...",5/17/2024
...,...,...
3982,"[exclusive, french, delivery, company, colis, ...",7/12/2021
3983,"[virus, variant, threaten, global, recovery, g...",4/29/2024
3984,"[twitter, appoints, grievance, officer, india,...",7/11/2021
3985,"[biden, sign, order, tackle, corporate, abuse,...",7/10/2021


# Handling missing data

In [54]:
news_df.isna().sum()


Headline    0
Date        0
dtype: int64

In [55]:
# Merge tokens into a single string
news_df['Headline'] = news_df['Headline'].apply(lambda tokens: ' '.join(tokens))

# Display the DataFrame with merged tokens
print(news_df)

                                               Headline       Date
0     microsoft offer cloud customer amd alternative...  5/17/2024
1       uk cma reject probe microsoft-mistral ai tie-up  5/17/2024
2     shoprites checker extends on-demand delivery g...  5/17/2024
3     coreweave raise 7.5 billion debt deal led blac...  5/17/2024
4     tv company flaunt ad tech ai persuade advertis...  5/17/2024
...                                                 ...        ...
3982  exclusive french delivery company colis privé ...  7/12/2021
3983   virus variant threaten global recovery g20 warns  4/29/2024
3984  twitter appoints grievance officer india compl...  7/11/2021
3985  biden sign order tackle corporate abuse across...  7/10/2021
3986  dubai ruler launch big tech company national p...  7/10/2021

[3987 rows x 2 columns]


In [72]:
# Convert the "Date" column to datetime format
news_df['Date'] = pd.to_datetime(news_df['Date'])

# Sort the DataFrame in ascending order based on the "Date" column
news_df_sorted = news_df.sort_values(by='Date')
print(news_df_sorted)

                                               Headline       Date
3986  dubai ruler launch big tech company national p... 2021-07-10
3985  biden sign order tackle corporate abuse across... 2021-07-10
3984  twitter appoints grievance officer india compl... 2021-07-11
3982  exclusive french delivery company colis privé ... 2021-07-12
3981  billionaire branson soar space aboard virgin g... 2021-07-12
...                                                 ...        ...
3     coreweave raise 7.5 billion debt deal led blac... 2024-05-17
1       uk cma reject probe microsoft-mistral ai tie-up 2024-05-17
5        space data fuel india farming innovation drive 2024-05-17
4     tv company flaunt ad tech ai persuade advertis... 2024-05-17
0     microsoft offer cloud customer amd alternative... 2024-05-17

[3987 rows x 2 columns]


# Merge news and stock price data

In [73]:
# import amazon stock price dataset
# Load the news headlines dataset
price_df = pd.read_csv(r'C:\Users\User\Downloads\AMZN.csv')

price_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-01-04,163.5,163.600006,157.201004,159.331497,159.331497,88228000
1,2021-01-05,158.300507,161.169006,158.253006,160.925507,160.925507,53110000
2,2021-01-06,157.324005,159.875504,156.557999,156.919006,156.919006,87896000
3,2021-01-07,157.850006,160.427002,157.75,158.108002,158.108002,70290000
4,2021-01-08,159.0,159.531998,157.110001,159.134995,159.134995,70754000


In [77]:
# Convert "Date" column in price_df to datetime64[ns]
price_df['Date'] = pd.to_datetime(price_df['Date'])

In [78]:
# Group news_df by Date and aggregate headlines into one row per date
news_grouped = news_df_sorted.groupby('Date')['Headline'].apply(lambda x: ' '.join(x)).reset_index()
print(news_grouped)

          Date                                           Headline
0   2021-07-10  dubai ruler launch big tech company national p...
1   2021-07-11  twitter appoints grievance officer india compl...
2   2021-07-12  exclusive french delivery company colis privé ...
3   2021-07-13  verizon business sign 5g contactless payment d...
4   2021-07-14  nike loses fight eu probe dutch tax deal amazo...
..         ...                                                ...
870 2024-05-13  choose france investment push bag record 16 bi...
871 2024-05-14  amazon cloud unit chief step three year invest...
872 2024-05-15  u canadian company kick 2024 layoff amazon web...
873 2024-05-16                  european regulator crack big tech
874 2024-05-17  shoprites checker extends on-demand delivery g...

[875 rows x 2 columns]


In [81]:
# Merge news_grouped and price_df based on the Date column
merged_df = pd.merge(price_df,news_grouped,  on="Date", how="left")


In [82]:
print(merged_df)

          Date        Open        High         Low       Close   Adj Close  \
0   2021-01-04  163.500000  163.600006  157.201004  159.331497  159.331497   
1   2021-01-05  158.300507  161.169006  158.253006  160.925507  160.925507   
2   2021-01-06  157.324005  159.875504  156.557999  156.919006  156.919006   
3   2021-01-07  157.850006  160.427002  157.750000  158.108002  158.108002   
4   2021-01-08  159.000000  159.531998  157.110001  159.134995  159.134995   
..         ...         ...         ...         ...         ...         ...   
844 2024-05-13  188.000000  188.309998  185.360001  186.570007  186.570007   
845 2024-05-14  183.820007  187.720001  183.449997  187.070007  187.070007   
846 2024-05-15  185.970001  186.720001  182.729996  185.990005  185.990005   
847 2024-05-16  185.600006  187.309998  183.460007  183.630005  183.630005   
848 2024-05-17  183.759995  185.300003  183.350006  184.699997  184.699997   

       Volume                                           Headlin

In [83]:
# Assuming df is your DataFrame containing the data
merged_df.dropna(subset=['Headline'], inplace=True)

In [84]:
print(merged_df)

          Date        Open        High         Low       Close   Adj Close  \
130 2021-07-12  187.199997  187.864502  184.839493  185.927505  185.927505   
131 2021-07-13  185.104996  188.654007  183.565994  183.867996  183.867996   
132 2021-07-14  185.442505  185.882996  183.041504  184.084000  184.084000   
133 2021-07-15  184.710007  184.770004  181.046005  181.559998  181.559998   
134 2021-07-16  181.665497  182.302994  178.522995  178.681503  178.681503   
..         ...         ...         ...         ...         ...         ...   
844 2024-05-13  188.000000  188.309998  185.360001  186.570007  186.570007   
845 2024-05-14  183.820007  187.720001  183.449997  187.070007  187.070007   
846 2024-05-15  185.970001  186.720001  182.729996  185.990005  185.990005   
847 2024-05-16  185.600006  187.309998  183.460007  183.630005  183.630005   
848 2024-05-17  183.759995  185.300003  183.350006  184.699997  184.699997   

       Volume                                           Headlin

In [85]:
merged_df.to_csv('merged_data.csv', index=False)


In [86]:
import os

# Get the current working directory
current_directory = os.getcwd()
print("Current Directory:", current_directory)

Current Directory: C:\Users\User
