In [32]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
import os

### Load Data

In [33]:
# Load the three source datasets from the local Data/ folder, in the same
# order as the names on the left-hand side.
obesity, anxiety, panic_attacks = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/obesity.csv',
        'Data/anxiety_attacks.csv',
        'Data/panic_attacks.csv',
    )
)

# Quick import check: the last bare expression of the cell is what gets rendered.
# Move the leading # to a different line to inspect another dataset.
obesity
#anxiety
#panic_attacks

Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


### Clean the Data

In [67]:
# Missing-value audit: prints one True/False line per raw dataset
# (True means at least one cell in that dataset is NaN).
print(obesity.isna().values.any())
print(anxiety.isna().values.any())
print(panic_attacks.isna().values.any())

# Obesity data: keep every row whose 'Obesity' label is anything other than
# 'Normal_Weight' or 'Insufficient_Weight'.
o_dropped_values = obesity.loc[
    ~obesity['Obesity'].isin(['Normal_Weight', 'Insufficient_Weight'])
]

# Panic-attack data: discard every row that has a NaN in any column.
# NOTE(review): this runs before the column selection further down the cell, so
# a row is also removed when its only NaN sits in a column that is not kept --
# confirm that is intended.
p_dropped_values = panic_attacks.dropna(axis=0, how='any')

# Repeat the audit on the filtered panic-attack frame.
print(p_dropped_values.isna().values.any())

# Column names to remove from each dataset (one list per dataset).

# Obesity dataset
o_columns_dropped = [
    'Height', 'Weight', 'family_history',
    'FAVC', 'FCVC', 'NCP', 'CAEC', 'CH2O', 'SCC', 'TUE',
    'MTRANS', 'Obesity',
]

# Anxiety-attack dataset
a_columns_dropped = [
    'Occupation',
    'Sleep Hours',
    'Caffeine Intake (mg/day)',
    'Family History of Anxiety',
    'Stress Level (1-10)',
    'Heart Rate (bpm during attack)',
    'Breathing Rate (breaths/min)',
    'Sweating Level (1-5)',
    'Dizziness',
    'Medication',
    'Therapy Sessions (per month)',
    'Recent Major Life Event',
    'Diet Quality (1-10)',
    'Severity of Anxiety Attack (1-10)',
]

# Panic-attack dataset
p_columns_dropped = [
    'Panic_Attack_Frequency',
    'Duration_Minutes',
    'Trigger',
    'Heart_Rate',
    'Sweating',
    'Shortness_of_Breath',
    'Dizziness',
    'Chest_Pain',
    'Trembling',
    'Medical_History',
    'Medication',
    'Caffeine_Intake',
    'Sleep_Hours',
    'Therapy',
    'Panic_Score',
]

# Remove the listed columns from each dataset, then rename the lifestyle
# columns to the shared labels 'Smoker', 'Physical Activity' and
# 'Alcohol Consumption'. The resulting column index is printed after each step.

# Obesity: built from the row-filtered frame.
o_cleaned = (
    o_dropped_values
    .drop(columns=o_columns_dropped)
    .rename(columns={
        'SMOKE': 'Smoker',
        'FAF': 'Physical Activity',
        'CALC': 'Alcohol Consumption',
    })
)
print(o_cleaned.columns)

# Anxiety: built directly from the raw frame (no rows removed).
a_cleaned = (
    anxiety
    .drop(columns=a_columns_dropped)
    .rename(columns={
        'Physical Activity (hrs/week)': 'Physical Activity',
        'Alcohol Consumption (drinks/week)': 'Alcohol Consumption',
        'Smoking': 'Smoker',
    })
)
print(a_cleaned.columns)

# Panic attacks: built from the frame with NaN rows already removed.
p_cleaned = (
    p_dropped_values
    .drop(columns=p_columns_dropped)
    .rename(columns={
        'Exercise_Frequency': 'Physical Activity',
        'Alcohol_Consumption': 'Alcohol Consumption',
        'Smoking': 'Smoker',
    })
)
print(p_cleaned.columns)

False
False
True
False
Index(['Gender', 'Age', 'Smoker', 'Physical Activity', 'Alcohol Consumption'], dtype='object')
Index(['ID', 'Age', 'Gender', 'Physical Activity', 'Alcohol Consumption',
       'Smoker'],
      dtype='object')
Index(['ID', 'Age', 'Gender', 'Physical Activity', 'Alcohol Consumption',
       'Smoker'],
      dtype='object')


In [68]:
# Inspect the obesity rows that remain after the 'Normal_Weight' and
# 'Insufficient_Weight' classes were filtered out (this frame is the
# row-filtered one, before any columns are dropped or renamed).
o_dropped_values

Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
10,Male,26.000000,1.850000,105.000000,yes,yes,3.0,3.0,Frequently,no,3.000000,no,2.000000,2.000000,Sometimes,Public_Transportation,Obesity_Type_I
11,Female,21.000000,1.720000,80.000000,yes,yes,2.0,3.0,Frequently,no,2.000000,yes,2.000000,1.000000,Sometimes,Public_Transportation,Overweight_Level_II
13,Male,41.000000,1.800000,99.000000,no,yes,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Automobile,Obesity_Type_I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III
