# Web Scraping


In [103]:
#import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import spacy
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib import font_manager

In [104]:
# Load the small Chinese spaCy pipeline, downloading it first if it is not installed
model_name = "zh_core_web_sm"
try:
    nlp = spacy.load(model_name)
except OSError:
    # An OSError from spacy.load is treated here as "model package missing"
    print("Downloading Chinese language model...")
    spacy.cli.download(model_name)
    nlp = spacy.load(model_name)

# Region names searched for in each news segment.
# Order matters: scrape_news() tags a segment with the FIRST entry of this
# list that occurs in its text, so earlier names win when several appear.
regions = [
    "北京", "上海", "广东", "浙江", "江苏", "天津",
    "重庆", "四川", "山东", "河南", "河北", "湖南",
    "湖北", "陕西", "辽宁", "安徽", "福建", "江西",
    "黑龙江", "吉林", "广西", "云南", "贵州", "甘肃",
    "内蒙古", "新疆", "西藏", "青海", "宁夏", "海南",
    "香港", "澳门", "台湾",
]


In [105]:
# Root of the daily pages; scrape_news() appends "<YYYYMMDD>/" to it
base_url = 'https://cn.govopendata.com/xinwenlianbo/'

# Inclusive date window to scrape (calendar year 2022)
start_date = datetime(year=2022, month=1, day=1)
end_date = datetime(year=2022, month=12, day=31)

In [106]:
def scrape_news(formatted_date, timeout=30):
    """Fetch one day's page and return its news segments as a list of dicts.

    Parameters
    ----------
    formatted_date : str
        Date string appended to ``base_url`` to build the page URL (the
        calling loop passes ``YYYYMMDD``).
    timeout : float, optional
        Seconds ``requests`` waits on the server before raising. Without a
        timeout a single stalled connection would hang the whole scrape.

    Returns
    -------
    list of dict
        One entry per ``<h2>`` element on the page, with keys ``'Date'``,
        ``'Title'``, ``'Content'`` and ``'Region'``. ``'Content'`` is the text
        of the next sibling ``<p>`` (empty string if there is none).
        ``'Region'`` is the first name from ``regions`` found in the content,
        or ``"其他"`` when none matches. The title is not searched.
        An empty list is returned when the server answers with an error status.
    """
    url = base_url + formatted_date + "/"
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Do not parse an error page: its headings would be stored as news.
        print(f"Skipping {formatted_date}: HTTP {response.status_code} for {url}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    news_items = []
    for segment in soup.find_all('h2'):
        title = segment.text.strip()

        # Look the paragraph up once instead of once for the test and once for the value
        paragraph = segment.find_next_sibling('p')
        content = paragraph.text.strip() if paragraph is not None else ""

        # First region (in list order) mentioned in the body; "其他" if none
        found_region = next((region for region in regions if region in content), "其他")

        news_items.append({
            'Date': formatted_date,
            'Title': title,
            'Content': content,
            'Region': found_region
        })
    return news_items

In [107]:
# Scrape every day in [start_date, end_date] and collect all segments in one list
news = []
current_date = start_date
while current_date <= end_date:
    formatted_date = current_date.strftime('%Y%m%d')
    # extend() appends in place; `news = news + ...` re-copied the whole
    # accumulated list on every iteration
    news.extend(scrape_news(formatted_date))
    current_date += timedelta(days=1)

In [108]:
# One row per scraped segment; columns come from the dict keys built in scrape_news()
df_news = pd.DataFrame(data=news)

# Persist the raw scrape as an Excel workbook (row index omitted)
excel_path = 'china_nightly_news_2022.xlsx'
df_news.to_excel(excel_writer=excel_path, index=False)

# Data Calculation & Analysis

In [109]:
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib import font_manager

In [110]:
# Sector keywords whose mentions are counted in the news text
ten_sectors_names = [
    "信息技术",
    "机器人",
    "航天",
    "海洋工程",
    "轨道交通",
    "新能源汽车",
    "电力",
    "新材料",
    "生物医药",
    "农业机械",
]

In [111]:
# sector keyword -> month number -> number of segments mentioning the keyword
word_frequencies = defaultdict(lambda: defaultdict(int))

for _, record in df_news.iterrows():
    text = record['Title'] + " " + record['Content']
    month_no = int(record['Date'][4:6])  # characters 4-5 of the date string are the month
    for sector in ten_sectors_names:
        if sector not in text:
            continue
        # A segment counts once per keyword, however often the keyword occurs in it
        word_frequencies[sector][month_no] += 1

# Flatten the nested counts into long-format records, one per (keyword, month) seen
data_for_df = [
    {'Word': sector, 'Month': month_no, 'Frequency': hits}
    for sector, per_month in word_frequencies.items()
    for month_no, hits in per_month.items()
]

freq_df = pd.DataFrame(data_for_df)

In [112]:
# Wide table: one row per keyword, one column per month, 0 where a pair never occurred
table_df = (
    freq_df
    .pivot_table(
        index='Word',
        columns='Month',
        values='Frequency',
        aggfunc='sum',
        fill_value=0,
    )
    .reset_index()  # turn 'Word' back into an ordinary column
)

table_df

Month,Word,1,2,3,4,5,6,7,8,9,10,11,12
0,信息技术,14,3,2,5,4,3,5,4,1,2,4,4
1,农业机械,0,0,1,0,0,0,2,2,1,3,0,0
2,新材料,4,5,5,4,2,7,9,7,5,3,2,2
3,新能源汽车,3,3,3,6,6,6,8,8,4,1,7,6
4,机器人,12,4,2,2,1,3,3,5,2,1,2,2
5,生物医药,4,4,1,3,6,10,5,6,0,0,1,0
6,电力,18,4,11,12,12,10,15,16,16,5,8,9
7,航天,14,12,14,14,16,14,18,9,14,15,30,14
8,轨道交通,1,2,0,1,6,3,2,2,2,1,2,1


In [113]:
# Write the keyword-by-month table to its own workbook (row index omitted)
excel_path = 'word_frequency_2022.xlsx'
table_df.to_excel(excel_writer=excel_path, index=False)

In [114]:
# Create three-dimensional frequency analysis:
# sector keyword -> region -> month number -> number of segments
word_region_month_frequencies = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

for index, row in df_news.iterrows():
    content = row['Title'] + " " + row['Content']
    region = row['Region']
    month = int(row['Date'][4:6])

    for word in ten_sectors_names:
        if word in content:
            word_region_month_frequencies[word][region][month] += 1

# Temporary column name -> final column name for the twelve months
month_columns = {f'Month_{i}': f'{i}' for i in range(1, 13)}

# Create one wide record per (industry, region) pair.
# The inner mapping is deliberately NOT called `regions`: that name is the
# module-level list scrape_news() iterates over, and rebinding it here would
# corrupt any scrape run after this cell.
detailed_data = []
for word, region_counts in word_region_month_frequencies.items():
    for region, months in region_counts.items():
        row_data = {
            'Industry': word,
            'Region': region
        }
        # Add months 1-12, filling months without a hit with 0
        for month in range(1, 13):
            row_data[f'Month_{month}'] = months.get(month, 0)
        detailed_data.append(row_data)

# Create final DataFrame; declaring the columns up front keeps the header
# (and the sort below) valid even when no keyword matched at all
detailed_df = pd.DataFrame(detailed_data, columns=['Industry', 'Region', *month_columns])

# Rename month columns for clarity, then sort by Industry and Region
detailed_df = (
    detailed_df
    .rename(columns=month_columns)
    .sort_values(['Industry', 'Region'])
)

# Export to Excel
excel_path = 'industry_region_monthly_frequency_2022.xlsx'
detailed_df.to_excel(excel_path, index=False)

# Display first few rows
print(detailed_df.head())

    Industry Region  1  2  3  4  5  6  7  8  9  10  11  12
100     信息技术     上海  0  0  0  0  0  1  0  0  0   1   1   0
89      信息技术     其他  7  0  1  1  1  1  1  1  1   0   2   0
92      信息技术     北京  2  2  0  1  1  0  0  0  0   0   1   1
101     信息技术     天津  0  0  0  0  0  1  0  0  0   0   0   0
104     信息技术     安徽  0  0  0  0  0  0  1  0  0   0   0   0
