In [26]:
#导入数据处理所需的Python库
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
#导入数据
RawData = pd.read_csv('COVID.csv')
pd.set_option('max_row',400)

In [27]:
# Check for missing data: one row per column of RawData with its count of missing values
missing = (
    RawData
    .isna()
    .sum()
    .rename('missNum')
    .reset_index()
)
missing

Unnamed: 0,index,missNum
0,Id,0
1,Sex,0
2,Age,0
3,AgeGroup,0
4,Education,0
5,FinancialSituation_General,0
6,FinancialSituation_Pandemic,0
7,IncomeContinuity,391
8,HealthStatus,0
9,Unemployed,0


In [28]:
# Boolean mask of the rows that contain at least one missing value.
# `axis` is passed by keyword: the positional form any(1) is deprecated in
# pandas 1.x and rejected with a TypeError in pandas 2.x.
nanline = RawData.isnull().any(axis=1)
# Summary statistics of Unemployed restricted to those rows, shown as a one-row table.
# .loc selects rows and column in a single step instead of chained indexing.
pd.DataFrame(RawData.loc[nanline, 'Unemployed'].describe().rename(index='IncomeContinuity_Umemployed')).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
IncomeContinuity_Umemployed,391.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Process the mental-health scale data (PHQ9, GAD7) and grade it by the PHQ-ADS standard.
# The item-level answer columns are discarded; only the PHQ9_sum and GAD7_sum
# totals are used, added together into Score_sum.
phq9_item_cols = [f'Phq9_{k}' for k in range(1, 10)]
gad7_item_cols = [f'Gad7_{k}' for k in range(1, 8)]
RawData.drop(columns=phq9_item_cols, inplace=True)
RawData.drop(columns=gad7_item_cols, inplace=True)
RawData['Score_sum'] = RawData['PHQ9_sum'].add(RawData['GAD7_sum'])
def scoreclassification(score):
    """Map a combined score to a level label.

    Returns '0' when score <= 20, '1' when 20 < score <= 40, and '2' for
    everything else. A value that compares False against both cut-offs
    (for example NaN) therefore also ends up as '2'.

    NOTE(review): the notebook describes this as PHQ-ADS grading; confirm
    that 20 and 40 are the intended cut-offs for the score range used here.
    """
    # Cut-offs are checked in ascending order; the first one that holds wins.
    for upper_bound, level in ((20, '0'), (40, '1')):
        if score <= upper_bound:
            return level
    return '2'
def _item_mean(frame, stem, item_numbers):
    """Return the row-wise mean of the columns named ``stem + str(k)`` for each k in item_numbers.

    The columns are added one at a time in the given order and the total is
    divided by the number of items.
    """
    column_names = [f'{stem}{k}' for k in item_numbers]
    running_total = frame[column_names[0]]
    for column_name in column_names[1:]:
        running_total = running_total + frame[column_name]
    return running_total / len(column_names)


# Grade every respondent's combined score, then discard the totals it was built from.
RawData['Psy_Level'] = RawData['Score_sum'].apply(scoreclassification)
RawData.drop(columns=['PHQ9_sum', 'GAD7_sum', 'Score_sum'], inplace=True)

# Process the social-support scale and the COVID-19 perceived-risk scale data
# Covid19 risk scale
# (i) COVID-19 infection;
# (ii) serious adverse health effects and complications due to a coronavirus infection
# (iii) life-threat as a result of the infection.
social_support_items = range(1, 6)
covid_risk_items = range(1, 7)
RawData['SocialSupportmean'] = _item_mean(RawData, 'SocialSupport_', social_support_items)
RawData['Covid19riskmean'] = _item_mean(RawData, 'Covid19_risk_', covid_risk_items)
RawData.drop(
    columns=[f'SocialSupport_{k}' for k in social_support_items]
    + [f'Covid19_risk_{k}' for k in covid_risk_items],
    inplace=True,
)

# Change in financial situation: value during the pandemic minus the general value
RawData['FinancialSituation_Change'] = RawData['FinancialSituation_Pandemic'].sub(RawData['FinancialSituation_General'])
RawData.drop(columns=['FinancialSituation_General', 'FinancialSituation_Pandemic'], inplace=True)

# Process the pandemic-difficulties scale: one mean score per subscale.
# Keys are the new column names, values the item numbers averaged into them.
difficulty_subscales = {
    'Home_Trouble': (1, 2, 3, 4, 5, 8),                # trouble at home [1-5, 8]
    'Social_Restrictions': (6, 7, 9),                  # social restrictions [6, 7, 9]
    'Protection_and_Life_Change': (10, 11, 12, 13),    # protective measures and life changes [10-13]
    'Worry_and_Fear': (14, 15, 16),                    # inner worries [14-16]
}
for subscale_name, subscale_items in difficulty_subscales.items():
    RawData[subscale_name] = _item_mean(RawData, 'Pandemic_Difficulties_', subscale_items)
RawData.drop(columns=[f'Pandemic_Difficulties_{k}' for k in range(1, 17)], inplace=True)

# Remove the remaining variables that are not needed
RawData.drop(
    columns=[
        'Id',
        'Age',
        'SocialSupport_mean',
        'PercievedThreat_mean',
        'Fear_uncertainty',
        'External_restrictions',
        'Difficulties_in_relationships_and_at_home',
        'Lock_of_social_contacts',
    ],
    inplace=True,
)

In [30]:
# Repair the missing values of IncomeContinuity by filling them with 1.
# Only that column is imputed: a frame-wide fillna(1) would also overwrite any
# NaN in unrelated columns with 1 without notice.
RawData['IncomeContinuity'] = RawData['IncomeContinuity'].fillna(1).astype(int)
# Correct the dtypes: these variables are categorical
categorical_columns = ['Sex','AgeGroup','Education','IncomeContinuity','HealthStatus','Unemployed','Student','Psy_Level']
RawData[categorical_columns] = RawData[categorical_columns].astype('category')
# Reorder the columns to make the later analysis easier
RawData = RawData[['Psy_Level','Sex','AgeGroup','Education','IncomeContinuity','HealthStatus','Unemployed','Student','SocialSupportmean','Covid19riskmean','FinancialSituation_Change','Home_Trouble','Social_Restrictions','Protection_and_Life_Change','Worry_and_Fear']]
RawData.head()
# NOTE(review): absolute, machine-specific output path; consider a path relative
# to a configurable data directory so the notebook runs on other machines.
RawData.to_csv('D:/Python/PythonFile/HCC_Paper/Data.csv',index=False)


Unnamed: 0,Psy_Level,Sex,AgeGroup,Education,IncomeContinuity,HealthStatus,Unemployed,Student,SocialSupportmean,Covid19riskmean,FinancialSituation_Change,Home_Trouble,Social_Restrictions,Protection_and_Life_Change,Worry_and_Fear
0,0,0,2,3,0,3,1,0,4.0,2.0,-3,1.833333,1.333333,1.75,1.333333
1,1,1,1,6,1,3,0,0,3.6,3.0,-2,2.5,2.666667,2.75,2.333333
2,0,0,3,3,1,2,0,0,3.6,3.0,-1,2.166667,2.0,2.75,2.0
3,1,1,2,6,0,2,0,0,2.4,2.166667,1,2.0,2.333333,2.5,2.0
4,0,1,3,3,0,2,1,0,3.4,3.0,-1,2.666667,2.333333,2.0,2.0
