In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from scipy.stats import linregress
import datetime

In [2]:
def round_to_nearest(value):
    
    difference_ceil = math.ceil(value) - value
    if difference_ceil <= 0.5:
        return math.floor(value)
    else:
        return math.ceil(value)
    
def calculate_age_at_visit(row):
    date_of_birth = row['Data di nascita']
    date_of_visit = row['Date_of_visit']
    age = date_of_visit.year - date_of_birth.year - ((date_of_visit.month, date_of_visit.day) < (date_of_birth.month, date_of_birth.day))
    return age

def count_relapses(patient_id, date):
    
    relapses = Relapse_number[(Relapse_number['Paziente ID'] == patient_id)
                            & (Relapse_number['relapse'] <= date)]
    
    if len(relapses) < 1:
        return 0
    else:
        return relapses.iloc[-1]['Relapse Count']

In [3]:
# Necessary Datasets
FS_Scores_data = pd.read_csv('.../imputed_data_EWMA.csv')
Airquality_data = pd.read_csv(".../necessary_data.csv")
Age = pd.read_csv(".../Patient_CAP.csv")
Relapse_number = pd.read_csv(".../relapse.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Doing some preprocessing for both data
# Remove duplicate rows in clinical data
FS_Scores_data.drop_duplicates(subset=["Patient_ID", "Date_of_visit"],
                          keep = "first",
                          inplace=True, 
                          ignore_index=True)

# Convert object features to date feature preparing for merge
Airquality_data['start_date'] = pd.to_datetime(Airquality_data['start_date'], format='%d/%m/%Y')
Airquality_data['end_date'] = pd.to_datetime(Airquality_data['end_date'], format='%d/%m/%Y')
Airquality_data['patient_id']=Airquality_data['patient_id'].astype(object)

FS_Scores_data['Date_of_visit'] = pd.to_datetime(FS_Scores_data['Date_of_visit'], format ='%d/%m/%Y')
FS_Scores_data['Patient_ID']=FS_Scores_data['Patient_ID'].astype(object)

Age['Data di nascita'] = pd.to_datetime(Age['Data di nascita'])
Relapse_number['relapse'] = pd.to_datetime(Relapse_number['relapse'])

FS_Scores_data.iloc[:,2:10] = FS_Scores_data.iloc[:,2:10].applymap(round_to_nearest)

final_data = FS_Scores_data.merge(Age, left_on='Patient_ID', right_on = "Paziente ID", how='left')
final_data['Age'] = final_data.apply(calculate_age_at_visit, axis=1)

final_data['Relapse_number'] = final_data.apply(lambda row:count_relapses(row['Patient_ID'],row['Date_of_visit']),
                                                axis=1)
final_data.to_csv("Updated_Clinical_Data.csv", index = False) 

In [None]:
Data_after_2013 = final_data[final_data['Date_of_visit'] >= "2013-01-01"]
Data_after_2013.drop(["Paziente ID","CAP","Data di nascita"], axis = 1, inplace=True)  # size = (4491, 18)
Data_after_2013.to_csv("Updated_Clinical_Data.csv", index = False) 

In [13]:
# This celle try to select air pollution data from 4-24-48 weeks ago for each patient
selected_feature_mean = ['PM25_mean', 'PM10_mean', 'CO_mean', 'NO2_mean', 'O3_mean',
                         'SO2_mean', 'FG_mean', 'HU_mean', 'PP_mean', 'QQ_mean', 'RR_mean',
                         'TG_mean', 'TN_mean', 'TX_mean']

matched_data = []

for index, row in Data_after_2013.iterrows():
    patient_id = row['Patient_ID']
    visit_date = row['Date_of_visit']
    
    if patient_id in list(Airquality_data["patient_id"]):
        
        matching_rows = Airquality_data[(Airquality_data['patient_id'] == patient_id) & 
                                        (Airquality_data['start_date'] <= visit_date) & 
                                        (Airquality_data['end_date'] >= visit_date - pd.DateOffset(weeks=4))]
        if len(matching_rows) != 0:
            mean_air_quality = matching_rows[selected_feature_mean].mean(skipna=True).round(3)
            
            slopes = {}
            for pl in selected_feature_mean:
                model = linregress(matching_rows['start_date'].map(datetime.datetime.toordinal),matching_rows[pl])
                slopes["Slop_"+ pl] = round(model.slope,2)


            matched_row = Airquality_data[(Airquality_data['patient_id'] == patient_id) & 
                                          (Airquality_data['start_date'] <= visit_date) & 
                                          (Airquality_data['end_date'] >= visit_date)]
            if len(matched_row) != 1:

                    print(index, patient_id, visit_date, matched_row.shape)

            elif len(matched_row) >= 1:

                merged_row = {
                    'Patient_id': patient_id,
                    'Patient_postcode': matched_row.iloc[0]["patient_postcode"],
                    'Station_postcode': matched_row.iloc[0]["station_postcode"],
                    'Date_of_visit': visit_date,
                    "Start_date": matched_row.iloc[0]["start_date"],
                    "End_date": matched_row.iloc[0]["end_date"],
                    "Start_date_4WeeksAgo": matching_rows.iloc[0]["start_date"],
                    "end_data_4WeeksAgo": matching_rows.iloc[-1]["end_date"],
                    'PM25_mean':mean_air_quality[0],
                    'PM10_mean':mean_air_quality[1],
                    'CO_mean':mean_air_quality[2],
                    'NO2_mean':mean_air_quality[3],
                    'O3_mean':mean_air_quality[4],
                    'SO2_mean':mean_air_quality[5],
                    'FG_mean':mean_air_quality[6],
                    'HU_mean':mean_air_quality[7],
                    'PP_mean':mean_air_quality[8],
                    'QQ_mean':mean_air_quality[9],
                    'RR_mean':mean_air_quality[10],
                    'TG_mean':mean_air_quality[11],
                    'TN_mean':mean_air_quality[12],
                    'TX_mean':mean_air_quality[13],
                }
                merged_row.update(slopes)
                matched_data.append(merged_row)

            else:
                continue

        else:
            continue

matched_data = pd.DataFrame(matched_data)

  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


In [17]:
final_data = pd.merge(Data_after_2013, matched_data, 
                      left_on=['Patient_ID','Date_of_visit'],
                      right_on =['Patient_id', 'Date_of_visit'], 
                      how='inner')

final_data = final_data[['Patient_id','Patient_postcode','Station_postcode',
                         'Date_of_visit','Start_date', 'End_date',
                         "Start_date_4WeeksAgo","end_data_4WeeksAgo",
                         'EDSS_score_assessed_by_clinician','Sex','Age','MS.in.pediatric.age','Relapse_number',
                         'Pyramidal', 'Cerebellar','Thronchioencephalic', 'Sensitive', 'Sphincteric', 'Visual', 
                         'Mental','Deambulation','PM25_mean', 'PM10_mean', 'CO_mean', 'NO2_mean',
                         'O3_mean', 'SO2_mean', 'FG_mean', 'HU_mean', 'PP_mean', 'QQ_mean',
                         'RR_mean', 'TG_mean', 'TN_mean', 'TX_mean',
                         'Slop_PM25_mean','Slop_PM10_mean','Slop_CO_mean',
                         'Slop_NO2_mean','Slop_O3_mean','Slop_SO2_mean',
                         'Slop_FG_mean','Slop_HU_mean','Slop_PP_mean',
                         'Slop_QQ_mean','Slop_RR_mean','Slop_TG_mean',
                         'Slop_TN_mean','Slop_TX_mean']]

Path_Station = ".../postcode_of_station.csv"
postcode_station = pd.read_csv(Path_Station)
postcode_station = postcode_station[postcode_station["country"]==2]


final_data['Start_date'] = pd.to_datetime(final_data['Start_date'])
final_data["End_date"] = pd.to_datetime(final_data["End_date"])
final_data['Start_date_4WeeksAgo'] = pd.to_datetime(final_data['Start_date_4WeeksAgo'])
final_data["end_data_4WeeksAgo"] = pd.to_datetime(final_data["end_data_4WeeksAgo"])
final_data["Date_of_visit"] = pd.to_datetime(final_data["Date_of_visit"])


final_data["Station_postcode"] = final_data["Station_postcode"].astype("str")
final_data = pd.merge(postcode_station, final_data, 
                                   left_on='postcode', right_on = "Station_postcode", 
                                   how='inner')
FS_Pollutant_data = final_data[['Patient_id', 'Patient_postcode', 'Station_postcode',
                                'id_postcode_of_station', 'latitude','longitude',
                                'Date_of_visit', 'Start_date', 'End_date', 'Start_date_4WeeksAgo',
                                'end_data_4WeeksAgo', 'EDSS_score_assessed_by_clinician', 'Sex', 'Age',
                                'MS.in.pediatric.age', 'Relapse_number', 'Pyramidal', 'Cerebellar',
                                'Thronchioencephalic', 'Sensitive', 'Sphincteric', 'Visual', 'Mental',
                                'Deambulation', 'PM25_mean', 'PM10_mean', 'CO_mean', 'NO2_mean',
                                'O3_mean', 'SO2_mean', 'FG_mean', 'HU_mean', 'PP_mean', 'QQ_mean',
                                'RR_mean', 'TG_mean', 'TN_mean', 'TX_mean',
                                'Slop_PM25_mean','Slop_PM10_mean','Slop_CO_mean',
                                'Slop_NO2_mean','Slop_O3_mean','Slop_SO2_mean',
                                'Slop_FG_mean','Slop_HU_mean','Slop_PP_mean',
                                'Slop_QQ_mean','Slop_RR_mean','Slop_TG_mean',
                                'Slop_TN_mean','Slop_TX_mean']]

FS_Pollutant_data.rename(columns={"latitude": "Station_lat", "longitude": "Station_long"},inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [22]:
# impute any cell of slope features with Zero if 
# it is missing. It means that there is no pollutant values for all 5 weeks ago
features_to_impute = ['Slop_PM25_mean','Slop_PM10_mean','Slop_CO_mean',
                      'Slop_NO2_mean','Slop_O3_mean','Slop_SO2_mean',
                      'Slop_FG_mean','Slop_HU_mean','Slop_PP_mean',
                      'Slop_QQ_mean','Slop_RR_mean','Slop_TG_mean',
                      'Slop_TN_mean','Slop_TX_mean']

FS_Pollutant_data[features_to_impute] = FS_Pollutant_data[features_to_impute].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [25]:
FS_Pollutant_data.to_csv("Updated_4weeks_ago_with_Slope.csv", index= False)