The dataset was downloaded on 28 Nov 2024.

In [5]:
import pandas as pd

# Read the raw VAERS export. The file is decoded as Latin-1 instead of pandas'
# default UTF-8 ('ISO-8859-1' is another name for the same codec).
file_path = '2019VAERSDATA.csv'  # point this at your local copy of the CSV
df = pd.read_csv(
    file_path,
    encoding='latin1',
)

In [6]:
# Narrow the raw frame to the four columns this notebook keeps.
columns_to_keep = ['AGE_YRS', 'SEX', 'SYMPTOM_TEXT', 'SPLTTYPE']
filtered_df = df.loc[:, columns_to_keep]

# Persist the narrowed frame (without the index column) and confirm on stdout.
filtered_df.to_csv('Filtered_VAERS_Data.csv', index=False)
print("Filtered dataset saved as 'Filtered_VAERS_Data.csv'")


Filtered dataset saved as 'Filtered_VAERS_Data.csv'


In [7]:
# Reload the column-filtered dataset from disk.
filtered_df = pd.read_csv('Filtered_VAERS_Data.csv')

# Convert AGE_YRS to a numeric dtype; entries that cannot be parsed become NaN.
filtered_df['AGE_YRS'] = pd.to_numeric(filtered_df['AGE_YRS'], errors='coerce')

# Keep rows whose age is at least 18 and is a whole number of years.
# The whole-number test is `age % 1 == 0` rather than apply(float.is_integer):
# float.is_integer raises TypeError on Python ints, which is what apply() hands
# over when the column parses to an integer dtype. The modulo form works for
# both integer and float columns, and NaN fails both comparisons, so rows with
# a missing or unparseable age are dropped here as well.
filtered_df = filtered_df[
    filtered_df['AGE_YRS'].ge(18) & filtered_df['AGE_YRS'].mod(1).eq(0)
]

# Save the cleaned dataset to a new CSV file (index column omitted).
filtered_df.to_csv('Filtered_Cleaned_Valid_Age_VAERS_Data.csv', index=False)

print("Cleaned dataset saved as 'Filtered_Cleaned_Valid_Age_VAERS_Data.csv'")


Cleaned dataset saved as 'Filtered_Cleaned_Valid_Age_VAERS_Data.csv'


In [8]:
# Keep only the rows whose SPLTTYPE text contains 'PFIZER'. The match is
# case-sensitive, and na=False makes rows with a missing SPLTTYPE count as
# non-matching instead of propagating NaN into the mask.
filtered_df = filtered_df.loc[
    filtered_df['SPLTTYPE'].str.contains('PFIZER', na=False)
]

# Write the Pfizer-only subset out (no index column) and confirm on stdout.
filtered_df.to_csv('Filtered_Pfizer_VAERS_Data.csv', index=False)
print("Filtered dataset saved with rows containing 'PFIZER' in the 'SPLTTYPE' column.")


Filtered dataset saved with rows containing 'PFIZER' in the 'SPLTTYPE' column.


In [9]:
# Age-group edges and their display labels. Each label names an inclusive
# range ('18-25' means 18 through 25, '26-35' means 26 through 35, ...).
bins = [18, 25, 35, 50, 60, 80, 100]
labels = ['18-25', '26-35', '36-50', '51-60', '61-80', '81-100']

# Create the AGE_GROUP column.
# right=True closes every bin on its upper edge, so 25 falls in '18-25', 35 in
# '26-35', and 100 in '81-100', exactly as the labels read. include_lowest=True
# additionally closes the first bin on the left so that age 18 is not dropped.
# Ages above 100 fall outside every bin and get a missing AGE_GROUP.
# .assign() returns a new frame, which avoids writing a column into what may
# be a filtered view of another DataFrame (SettingWithCopyWarning).
filtered_df = filtered_df.assign(
    AGE_GROUP=pd.cut(
        filtered_df['AGE_YRS'],
        bins=bins,
        labels=labels,
        right=True,
        include_lowest=True,
    )
)

# Save the updated dataset to a new CSV file (index column omitted).
filtered_df.to_csv('Pfizer_VAERS_with_Age_Group.csv', index=False)

In [11]:
# Read the age-grouped Pfizer subset back in from its CSV file.
file_path = 'Pfizer_VAERS_with_Age_Group.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Keyword lookup tables for the symptom narrative.
# Each row is (lower-case keyword, adverse-effect label, organ/system label).
# Both dictionaries below are derived from this one table, so they always
# share the same keys in the same order.
_SYMPTOM_KEYWORD_TABLE = (
    ('fever', 'Fever', 'Systemic'),
    ('headache', 'Headache', 'Nervous System'),
    ('pain', 'Pain', 'Musculoskeletal System'),
    ('nausea', 'Nausea', 'Digestive System'),
    ('dizziness', 'Dizziness', 'Nervous System'),
    ('rash', 'Rash', 'Skin'),
    ('chills', 'Chills', 'Systemic'),
    ('swelling', 'Swelling', 'Skin'),
    ('vomiting', 'Vomiting', 'Digestive System'),
    ('fatigue', 'Fatigue', 'Systemic'),
)

# keyword -> adverse-effect label
adverse_effect_keywords = {
    keyword: effect for keyword, effect, _organ in _SYMPTOM_KEYWORD_TABLE
}

# keyword -> affected organ / body-system label
organs_affected_keywords = {
    keyword: organ for keyword, _effect, organ in _SYMPTOM_KEYWORD_TABLE
}

# Function to extract adverse effects
def extract_adverse_effects(text):
    """Return the adverse-effect labels whose keyword occurs in ``text``.

    Matching is a case-insensitive substring test against the keys of
    ``adverse_effect_keywords``, so a keyword also matches inside a longer
    word (for example 'rash' is found in 'crash').

    Args:
        text: Free-text symptom description, or a missing value.

    Returns:
        The matched labels joined with ', ' in the dictionary's key order,
        or the string '0' when ``text`` is missing or nothing matches.
    """
    if pd.isna(text):
        return '0'

    lowered = text.lower()
    found = []
    for keyword, effect in adverse_effect_keywords.items():
        if keyword in lowered:
            found.append(effect)

    if not found:
        return '0'
    return ', '.join(found)

# Function to extract organs affected
def extract_organs_affected(text):
    if pd.isna(text):
        return '0'
    organs = [organ for keyword, organ in organs_affected_keywords.items() if keyword in text.lower()]
    return ', '.join(organs) if organs else '0'

# Derive both keyword-based columns from the SYMPTOM_TEXT column in one step.
df = df.assign(**{
    'Adverse Effects': df['SYMPTOM_TEXT'].apply(extract_adverse_effects),
    'Organs Affected': df['SYMPTOM_TEXT'].apply(extract_organs_affected),
})

# Write the annotated dataset to disk (index column omitted).
output_path = 'Pfizer_VAERS_with_Adverse_Effects_and_Organs.csv'
df.to_csv(output_path, index=False)
