In [104]:
# this is for PDF 1-10,14,15

import os
import fitz  # PyMuPDF
import pandas as pd
import re

def extract_date_and_text_from_article(article):
    # Extract the date using a regex pattern that matches the date format
    date_pattern = re.compile(r'\d{1,2} [A-Za-z]+ \d{4}')
    date_match = date_pattern.search(article)
    date = date_match.group(0) if date_match else "Date not found"

    # Extract the main text of the article
    # It starts after the copyright note and ends just before the "Document SCMCOM00..." pattern
    text_pattern = re.compile(r'\(c\) \d{4} (?:scmp\.com|South China Morning Post Publishers Limited, Hong Kong)\. All rights reserved\.(.*)', re.DOTALL)
    text_match = text_pattern.search(article)
    text = text_match.group(1).strip() if text_match else "Text not found"


    return date, text

def extract_data_from_pdf(pdf_path):
    # Open the PDF file and extract its text
    doc = fitz.open(pdf_path)
    pdf_text = "\n".join(page.get_text() for page in doc)

    # Split the PDF text into individual articles based on the "Document SCMCOM00..." pattern
    articles = re.split(r'Document SCMCOM\d+', pdf_text)

    # Extract the date and main text from each article
    data = []
    for article in articles:
        date, text = extract_date_and_text_from_article(article)
        data.append({"Date": date, "Text": text})

    return data

pdf_path = 'C:/Users/lekai/Downloads/15.pdf'

# Extract data from the specified PDF file
extracted_data = extract_data_from_pdf(pdf_path)



extracted_data=pd.DataFrame(extracted_data)


In [105]:
df_2=pd.concat([df_2,extracted_data],axis=0,ignore_index=True)
df_2.tail(30)

Unnamed: 0,Date,Text
848,29 March 2018,Anti-tumour drug researcher Ascentage Pharma h...
849,18 April 2014,Premier Li Keqiang's surprise announcement in ...
850,29 December 2017,In 2015 and 2016 investment banks in Hong Kong...
851,29 December 2017,The financial regulators of Hong Kong and Chin...
852,17 September 2017,A planned US$1.5 billion share offer in Hong K...
853,21 September 2018,Mainland food delivery service only one among ...
854,7 September 2016,The Securities and Futures Commission will be ...
855,5 June 2020,* NetEase prices its Hong Kong secondary listi...
856,28 November 2016,"Asia Pacific Investment Partners (APIP), the H..."
857,14 June 2018,"John Tsang Chun-wah, former Hong Kong financia..."


In [139]:
# drop columns with not found

df_2.drop(df_2[df_2['Date']=='Date not found'].index,axis=0,inplace=True)

In [140]:
#drop duplicates

df=df_2.drop_duplicates(subset=['Text'],keep='first')
 
df


Unnamed: 0,Date,Text
0,25 July 2018,Hong Kong's stock exchange operator has deferr...
1,30 August 2023,* Indebted property developers such as Evergra...
2,20 December 2017,"The Securities and Futures Commission, recogni..."
3,10 October 2018,Hong Kong's dual-class shares listing rules mu...
4,30 August 2014,Exchange is seeking views on whether listing r...
...,...,...
872,31 October 2020,Tech start-ups held by corporate entities may ...
873,24 June 2016,?utm_source=factiva&utm_medium=intranet&utm_ca...
874,17 March 2014,Stock exchange chief Charles Li Xiaojia says t...
875,28 January 2018,"On a sunny day earlier this month, egrets roam..."


In [141]:
mask=df.groupby('Date')['Date'].transform('count')>=2
mask

0      False
1      False
2       True
3      False
4      False
       ...  
872    False
873    False
874     True
875    False
876    False
Name: Date, Length: 868, dtype: bool

In [169]:
filtered_data= df[df['Text'].str.contains('dual Class Share|dual class|weighted voting|Class A|Class B|dual-class|voting right', case=False,na=False)]

In [181]:
filtered_data['length']=filtered_data['Text'].apply(lambda x:len(str(x).split()))
filtered_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['length']=filtered_data['Text'].apply(lambda x:len(str(x).split()))


Unnamed: 0,Date,Text,Processed_text,length
0,25 July 2018,Hong Kong's stock exchange operator has deferr...,Hong Kong 's stock exchange operator deferred ...,703
2,20 December 2017,"The Securities and Futures Commission, recogni...","Securities Futures Commission , recognising co...",958
3,10 October 2018,Hong Kong's dual-class shares listing rules mu...,Hong Kong 's dual-class share listing rule mus...,703
4,30 August 2014,Exchange is seeking views on whether listing r...,Exchange seeking view whether listing rule cha...,2658
5,9 June 2015,Hong Kong lost the Alibaba initial public offe...,Hong Kong lost Alibaba initial public offering...,1324
...,...,...,...,...
872,31 October 2020,Tech start-ups held by corporate entities may ...,Tech start-ups held corporate entity may allow...,2903
873,24 June 2016,?utm_source=factiva&utm_medium=intranet&utm_ca...,? utm_source=factiva & utm_medium=intranet & u...,903
874,17 March 2014,Stock exchange chief Charles Li Xiaojia says t...,Stock exchange chief Charles Li Xiaojia say ci...,503
875,28 January 2018,"On a sunny day earlier this month, egrets roam...","sunny day earlier month , egret roamed abandon...",2108


In [182]:
#start of nlp
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

lemmatizer= WordNetLemmatizer()
stop_words=set(stopwords.words('english'))
def preprocess_text(text):
    tokens=word_tokenize(text)
    lemmatized_tokens=[lemmatizer.lemmatize(token) for token in tokens if token.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)

filtered_data['Processed_text']= filtered_data['Text'].apply(preprocess_text)
filtered_data[['Text', 'Processed_text']].head()




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lekai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lekai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lekai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Processed_text']= filtered_data['Text'].apply(preprocess_text)


Unnamed: 0,Text,Processed_text
0,Hong Kong's stock exchange operator has deferr...,Hong Kong 's stock exchange operator deferred ...
2,"The Securities and Futures Commission, recogni...","Securities Futures Commission , recognising co..."
3,Hong Kong's dual-class shares listing rules mu...,Hong Kong 's dual-class share listing rule mus...
4,Exchange is seeking views on whether listing r...,Exchange seeking view whether listing rule cha...
5,Hong Kong lost the Alibaba initial public offe...,Hong Kong lost Alibaba initial public offering...


In [186]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()


X = vectorizer.fit_transform(filtered_data['Processed_text'])
X.shape
feature_names = vectorizer.get_feature_names()

#convert to matrix df
count_matrix= pd.DataFrame(X.toarray(),columns=feature_names)
count_matrix.head()


Unnamed: 0,00,000,0000,000003125,000025,000km,0011,002,004,007,...,zou,zte,zto,zuckerberg,zuo,zuraidah,zurich,zveglich,李克強,習近平
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [192]:
#remove the number columns and english/chinese names
common_english_names = [
    'james', 'john', 'robert', 'michael', 'william', 'david', 'richard', 'joseph', 'charles', 'thomas', 'mary', 
    'patricia', 'jennifer', 'linda', 'elizabeth', 'barbara', 'susan', 'jessica', 'sarah', 'karen','Liang','Zhang','Zhao','Mei','Su','Tian','Yip','Wang','Fang','Ying', 'Li','Chen','Wei','Na','Liu','Yan','Mok' 
]

# Drop columns matching common English and Chinese names
count_matrix = count_matrix.drop(columns=count_matrix.columns[count_matrix.columns.isin(common_english_names)], errors='ignore')

count_matrix = count_matrix.loc[:, ~count_matrix.columns.str.startswith(tuple('0123456789'))]

In [194]:
count_matrix = count_matrix.loc[:, ~count_matrix.columns.str.contains('\d',na=False)]

In [215]:
filtered_data= filtered_data.reset_index(drop=True)
filtered_data

Unnamed: 0,Date,Text,Processed_text,length
0,25 July 2018,Hong Kong's stock exchange operator has deferr...,Hong Kong 's stock exchange operator deferred ...,703
1,20 December 2017,"The Securities and Futures Commission, recogni...","Securities Futures Commission , recognising co...",958
2,10 October 2018,Hong Kong's dual-class shares listing rules mu...,Hong Kong 's dual-class share listing rule mus...,703
3,30 August 2014,Exchange is seeking views on whether listing r...,Exchange seeking view whether listing rule cha...,2658
4,9 June 2015,Hong Kong lost the Alibaba initial public offe...,Hong Kong lost Alibaba initial public offering...,1324
...,...,...,...,...
301,31 October 2020,Tech start-ups held by corporate entities may ...,Tech start-ups held corporate entity may allow...,2903
302,24 June 2016,?utm_source=factiva&utm_medium=intranet&utm_ca...,? utm_source=factiva & utm_medium=intranet & u...,903
303,17 March 2014,Stock exchange chief Charles Li Xiaojia says t...,Stock exchange chief Charles Li Xiaojia say ci...,503
304,28 January 2018,"On a sunny day earlier this month, egrets roam...","sunny day earlier month , egret roamed abandon...",2108


In [217]:
v=pd.concat([filtered_data,count_matrix],axis=1,ignore_index=False)

In [219]:
v.dropna()

Unnamed: 0,Date,Text,Processed_text,length,abacus,abating,abbreviation,ability,able,abnormal,...,zoom,zou,zte,zto,zuckerberg,zuo,zurich,zveglich,李克強,習近平
0,25 July 2018,Hong Kong's stock exchange operator has deferr...,Hong Kong 's stock exchange operator deferred ...,703,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,20 December 2017,"The Securities and Futures Commission, recogni...","Securities Futures Commission , recognising co...",958,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
2,10 October 2018,Hong Kong's dual-class shares listing rules mu...,Hong Kong 's dual-class share listing rule mus...,703,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,30 August 2014,Exchange is seeking views on whether listing r...,Exchange seeking view whether listing rule cha...,2658,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,9 June 2015,Hong Kong lost the Alibaba initial public offe...,Hong Kong lost Alibaba initial public offering...,1324,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,31 October 2020,Tech start-ups held by corporate entities may ...,Tech start-ups held corporate entity may allow...,2903,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
302,24 June 2016,?utm_source=factiva&utm_medium=intranet&utm_ca...,? utm_source=factiva & utm_medium=intranet & u...,903,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
303,17 March 2014,Stock exchange chief Charles Li Xiaojia says t...,Stock exchange chief Charles Li Xiaojia say ci...,503,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
304,28 January 2018,"On a sunny day earlier this month, egrets roam...","sunny day earlier month , egret roamed abandon...",2108,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [220]:
from datetime import datetime
def convertdate(date):
    dt = datetime.strptime(date, '%d %B %Y')
    return dt.yearv.to_excel('C:/Users/lekai/Downloads/finaldoc.xlsx')
v['Year']=v['Date'].apply(lambda x:convertdate(x))

In [234]:
v.to_excel('C:/Users/lekai/Downloads/finaldoc.xlsx')