# Cardiovascular Disease Data Analysis Notebook
This notebook analyzes cardiovascular disease data, including data cleaning, visualization, and prediction using machine learning models.

In [None]:
# Cell 1: Installing necessary packages (commented out)
#pip install matplotlib seaborn
#!pip install scikit-learn

In [None]:
# Cell 2: Importing necessary libraries for data manipulation, visualization, and machine learning
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Cell 3: Defining the file name for the dataset
# Relative path of the CSV file that Cell 4 passes to pd.read_csv.
file_name = 'Disease_Data.csv'

In [None]:
# Cell 4: Loading the semicolon-delimited dataset and previewing its first 20 rows
# Lift pandas' column display limit first so the preview shows every column.
pd.set_option('display.max_columns', None)

df = pd.read_csv(file_name, sep=';')
df.head(n=20)

In [None]:
# Cell 5: Listing the distinct values found in the 'Data_Value_Unit' column
df['Data_Value_Unit'].unique()

In [None]:
# Cell 6: Listing the distinct values found in the 'Break_Out' column
df['Break_Out'].unique()

In [None]:
# Cell 7: Listing the distinct values found in the 'BreakOutId' column
df['BreakOutId'].unique()

In [None]:
# Cell 8: Counting how many rows carry each 'Break_Out_Category' value
df['Break_Out_Category'].value_counts()

In [None]:
# Cell 9: Counting how many rows carry each 'YearStart' value
df['YearStart'].value_counts()

In [None]:
# Cell 10: Counting missing values in each column
# isna() flags NaN/None entries; sum() then totals the flags per column.
df.isna().sum()

In [None]:
# Cell 11: Displaying data types of each column
# Inspected before cleaning: Cell 13 later converts 'Data_Value' and
# 'Data_Value_Alt' to float.
df.dtypes

In [None]:
# Cell 12: Discarding rows whose 'Data_Value' or 'Data_Value_Alt' is an empty string,
# then discarding rows whose 'Data_Value' is NaN
for text_column in ('Data_Value', 'Data_Value_Alt'):
    # Index labels of the rows holding an empty string in this column.
    empty_row_labels = df.index[df[text_column] == '']
    df = df.drop(index=empty_row_labels)

# Only 'Data_Value' is checked for NaN; NaN in 'Data_Value_Alt' is left in place.
df.dropna(subset=['Data_Value'], inplace=True)

In [None]:
# Cell 13: Stripping commas from 'Data_Value' and 'Data_Value_Alt' and casting both to float
for numeric_column in ('Data_Value', 'Data_Value_Alt'):
    # The commas must be removed before the text can be cast to float.
    without_commas = df[numeric_column].str.replace(',', '')
    df[numeric_column] = without_commas.astype(float)

In [None]:
# Cell 14: Displaying the shape of the DataFrame (number of rows and columns)
# Evaluated after the row drops in Cell 12, so the row count reflects the
# cleaned data rather than the file as read.
df.shape

In [None]:
# Cell 15: Materialising the column labels as a plain Python list and showing it
columns = list(df.columns)
columns

In [None]:
# Cell 16: Counting missing values in each column again
# Re-check after cleaning: Cell 12 only dropped rows with NaN in 'Data_Value',
# so other columns may still report missing values here.
df.isna().sum()

In [None]:
# Cell 17: Removing duplicate rows from the DataFrame
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result must be assigned back for the removal to persist.
df = df.drop_duplicates()
# Keep the frame as the cell's last expression so the notebook still displays it.
df

In [None]:
# Cell 18: Displaying information about the DataFrame (including number of non-null entries and data types)
# info() prints its summary directly; the call itself returns None.
df.info()

In [None]:
# Cell 19: Displaying column names of the DataFrame
# Shown just before Cell 21 picks the columns to drop.
df.columns

In [None]:
# Cell 20: Saving a backup of the dataset to a CSV file
# NOTE(review): despite the file name, `df` is no longer the raw data at this
# point - Cell 12 has dropped rows and Cell 13 has converted the value columns
# to float. Move this cell directly after the read in Cell 4 if a true
# backup of the original is wanted.
# The file is written with to_csv's default ',' separator, whereas the input
# was read with sep=';'. index=False keeps the DataFrame index out of the file.
df.to_csv('original_dataset_backup.csv', index=False)

In [None]:
# Cell 21: Listing the columns that Cell 22 removes from the DataFrame
# One label per line so additions and removals show up cleanly in diffs.
columns_to_drop = [
    'RowId',
    'LocationDesc',
    'DataSource',
    'PriorityArea1',
    'PriorityArea2',
    'PriorityArea3',
    'PriorityArea4',
    'Question',
    'Data_Value_Type',
    'Data_Value_Unit',
    'Data_Value_Alt',
    'Data_Value_Footnote_Symbol',
    'Data_Value_Footnote',
    'Low_Confidence_Limit',
    'High_Confidence_Limit',
    'ClassId',
    'TopicId',
    'QuestionId',
    'BreakOutCategoryId',
    'BreakOutId',
    'Geolocation',
    'LocationId',
]

In [None]:
# Cell 22: Removing the columns named in columns_to_drop from the DataFrame
# drop() raises KeyError if any listed label is absent from df.
df = df.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Cell 23: Displaying column names of the modified DataFrame
# Confirms which columns remain after the drop in Cell 22.
df.columns

In [None]:
# Cell 24 (truncated): Defining a function to analyze and predict data using Linear Regression, and displaying accuracy results for different demographic categories
# NOTE(review): the definition of analyze_and_predict_linear and the creation
# of gender_filtered_df / age_filtered_df / race_filtered_df are not visible in
# this truncated cell - confirm they run before these calls.
# The three calls share the same topic and column arguments and differ only in
# the input frame and the final category label; each returned value is printed.
accuracy_gender_linear = analyze_and_predict_linear(gender_filtered_df, 'Major Cardiovascular Disease', 'Break_Out', 'Data_Value', 'Gender')
print(accuracy_gender_linear)

accuracy_age_linear = analyze_and_predict_linear(age_filtered_df, 'Major Cardiovascular Disease', 'Break_Out', 'Data_Value', 'Age')
print(accuracy_age_linear)

accuracy_race_linear = analyze_and_predict_linear(race_filtered_df, 'Major Cardiovascular Disease', 'Break_Out', 'Data_Value', 'Race')
print(accuracy_race_linear)

In [None]:
# Cell 25: Displaying the first few rows of the filtered DataFrame
# head() with no argument returns the first 5 rows.
# NOTE(review): overall_filtered_df is not created in any visible cell -
# confirm where it is defined.
overall_filtered_df.head()

In [None]:
# Cell 26: Counting occurrences of each year in the filtered DataFrame and displaying the result
# value_counts() returns one count per distinct 'YearStart', most frequent first.
# NOTE(review): filtered_df is not created in any visible cell - confirm where
# it is defined.
value_counts = filtered_df['YearStart'].value_counts()
print(value_counts)

In [None]:
# Cell 27: Slicing the DataFrame into consecutive column blocks, one per (state, year) pair
columns_per_state_year = 20  # width of each block, in columns
num_states = 50              # number of states
num_years = 22               # number of years

# Blocks are laid out state-major: all years of state 0, then all years of
# state 1, and so on. Block k covers column positions [k * 20, (k + 1) * 20),
# i.e. 50 * 22 * 20 = 22,000 column positions in total.
# iloc slices that run past the last column do not raise; they return a frame
# with no columns.
# NOTE(review): df was narrowed by the column drop in Cell 22 - confirm it
# really is this wide, otherwise most of these blocks will be empty.
blocks = [
    df.iloc[:, first_col:first_col + columns_per_state_year]
    for first_col in (
        (state_no * num_years + year_no) * columns_per_state_year
        for state_no in range(num_states)
        for year_no in range(num_years)
    )
]

In [None]:
# Cell 28: Drawing one 30-bin histogram per numeric column of the age-filtered DataFrame
numeric_column_names = age_filtered_df.select_dtypes(include='number').columns
for numeric_column in numeric_column_names:
    # A fresh figure per column keeps the histograms from being drawn over each other.
    _, axes = plt.subplots()
    axes.hist(age_filtered_df[numeric_column], bins=30)
    axes.set_xlabel(numeric_column)
    axes.set_ylabel('Frequency')
    axes.set_title(f'Histogram of {numeric_column}')
    plt.show()