In [1]:
import pandas as pd
import numpy as np
import json
# Assumes patents_data (loaded in a later cell) holds each patent's ID and publication year.

# CAGR helper
def calculate_cagr(end_value, start_value, period):
    """Return the compound annual growth rate between two values, in percent.

    Computes ``((end_value / start_value) ** (1 / period) - 1) * 100``.

    Parameters
    ----------
    end_value : number
        Value at the end of the window.
    start_value : number
        Value at the start of the window.
    period : number
        Number of compounding periods separating the two values.

    Returns
    -------
    float or None
        The growth rate as a percentage, or ``None`` when the rate is
        undefined: ``start_value`` is zero, ``period`` is zero, or the root
        of a negative ratio would be a complex number.
    """
    # Both cases would otherwise raise ZeroDivisionError.
    if start_value == 0 or period == 0:
        return None
    growth_factor = (end_value / start_value) ** (1 / period)
    # Python returns a complex number for a fractional power of a negative
    # float; that is not a meaningful growth rate.
    if isinstance(growth_factor, complex):
        return None
    return (growth_factor - 1) * 100

In [8]:
# Load the community -> patent-id mapping and the citation table.
with open('D:/Py_and_ML/28/communities/agriculture_communities.json', 'r', encoding='utf-8') as file:
    chosen_community = json.load(file)
patents_data = pd.read_csv('D:/Py_and_ML/28/selected_patents/agriculture_citation.csv')

# One record per (community, start year). Keying the records in a dict gives
# constant-time lookup when the 5- and 10-period values for the same start
# year are merged, instead of rescanning the whole result list every time.
cagr_records = {}

for community_id, patent_ids in chosen_community.items():
    # Rows whose 'citation_patent_id' belongs to this community.
    # NOTE(review): ids loaded from JSON may be strings while the CSV column
    # may parse as integers; isin() would then match nothing - confirm dtypes.
    community_data = patents_data[patents_data['citation_patent_id'].isin(patent_ids)]

    # Number of rows per year, counted once per community rather than
    # re-filtering the frame for every (start year, period) pair.
    rows_per_year = community_data['Year'].value_counts().to_dict()

    for start_year in sorted(rows_per_year):
        for period in (5, 10):
            end_year = start_year + period
            # A window is only usable when the end year also has rows.
            if end_year not in rows_per_year:
                continue
            cagr = calculate_cagr(rows_per_year[end_year], rows_per_year[start_year], period)
            if cagr is None:  # skip windows for which no CAGR can be computed
                continue
            record = cagr_records.setdefault(
                (community_id, start_year),
                {'CommunityID': community_id, 'Year': start_year},
            )
            record[f'CAGR_{period}'] = cagr

# Dicts preserve insertion order, so records appear in first-computed order.
cagr_results = list(cagr_records.values())

# Convert to a DataFrame
cagr_df = pd.DataFrame(cagr_results)




In [ ]:
from collections import defaultdict

# NOTE(review): `communities` and `citations` are not defined in any cell
# visible here; presumably a community -> patent-id mapping and a citation
# DataFrame with 'patent_id', 'citation_patent_id' and 'Year' columns -
# confirm they are created before this cell runs on a fresh kernel.

# Group patent ids by community (communities without patents get no entry)
community_patents = defaultdict(list)
for community_id, patent_list in communities.items():
    patent_list = list(patent_list)
    if patent_list:
        community_patents[community_id].extend(patent_list)

# Per community: average relative age (latest Year in the table minus the mean
# Year of the community's rows) and the share of its rows whose cited patent
# also belongs to the same community.
average_patent_age = {}
internal_citation_ratios = {}
max_year = citations['Year'].max()
for community_id, patent_ids in community_patents.items():
    from_community = citations['patent_id'].isin(patent_ids)
    total_citations = citations[from_community]
    if total_citations.empty:
        # No rows for this community: leave it out of both result dicts
        continue
    average_patent_age[community_id] = max_year - total_citations['Year'].mean()
    internal_citations = citations[from_community & citations['citation_patent_id'].isin(patent_ids)]
    internal_citation_ratios[community_id] = len(internal_citations) / len(total_citations)

# Assemble the summary table; communities without a value get None
community_ids = list(community_patents)
summary_df = pd.DataFrame({
    "CommunityID": community_ids,
    "AveragePatentAge": [average_patent_age.get(cid) for cid in community_ids],
    "InternalCitationRatio": [internal_citation_ratios.get(cid) for cid in community_ids],
})

summary_df.head()


In [9]:
# Persist the CAGR table as CSV, without the row index
output_path = 'D:/Py_and_ML/28/calculation/agriculture.csv'
cagr_df.to_csv(output_path, index=False)

# Regression

In [ ]:
# 导入必要的库
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# Synthetic single-feature regression problem
X, y = make_regression(n_samples=100, n_features=1, noise=4.0, random_state=42)

# Hold out 20% of the samples as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features; the scaler is fitted on the training split only
# and then reused unchanged for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Support vector regressor with a radial basis function (RBF) kernel,
# fitted on the scaled training data
svr = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1).fit(X_train_scaled, y_train)

# Predict the held-out samples and report the mean squared error
y_pred = svr.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Optional: further model evaluation, hyperparameter tuning, etc.
