#Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

In [2]:
!git --version

git version 2.34.1


In [3]:
cd /content

/content


In [4]:
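# SECURITY: a personal access token was hard-coded in this clone URL. Revoke that token and supply credentials through a credential helper or an environment variable instead of committing them.
!git clone https://github.com/GopiErla/Statistics-and-Trends-20-.git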


Cloning into 'Statistics-and-Trends-20-'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (3/3), done.


In [5]:
!git config --global credential.helper cache

In [6]:
%cd Statistics-and-Trends-20-

/content/Statistics-and-Trends-20-


In [7]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


# Load the dataset

In [8]:

# Relative path: resolved against the current working directory.
# NOTE(review): the recorded run of this cell raised FileNotFoundError, so the
# CSV was not present in the working directory at that point.
file_path = 'heart_disease_uci.csv'
heart_disease_data = pd.read_csv(file_path)
# Preview the first rows of the loaded DataFrame
heart_disease_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'heart_disease_uci.csv'

# Display basic information about the dataset

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame
heart_disease_data.info()

#Data Cleaning

In [None]:
# Count the missing values in each column before any imputation is applied.
# This cell only reports the counts; it does not modify the DataFrame.

# Per-column number of null entries
missing_values = heart_disease_data.isnull().sum()
print("missing_values before handling")
print(missing_values)

In [None]:
# Impute missing values column by column: the mode (most frequent value) for
# object-dtype columns and the median for every other column.
for column in heart_disease_data.columns:
    column_values = heart_disease_data[column]
    if column_values.dtype == 'object':
        modes = column_values.mode()
        if modes.empty:
            # A column with no non-null values has no mode; leave it untouched
            continue
        fill_value = modes.iloc[0]
    else:
        fill_value = column_values.median()
    # Assign the filled column back to the DataFrame. Calling
    # fillna(inplace=True) on a column selection operates on an intermediate
    # object and is not guaranteed to update the DataFrame itself.
    heart_disease_data[column] = column_values.fillna(fill_value)

# Check for missing values after imputation
new_missing_values = heart_disease_data.isnull().sum()
print("missing_values after handling")
print(new_missing_values)


#Descriptive Statistics

In [None]:
# Summary statistics for the DataFrame's columns
heart_disease_data.describe()

In [None]:
# Calculating the correlation matrix to understand relationships between variables
def correlation_matrix(data):
    """
    Plots a heatmap of the pairwise correlations between the numeric columns
    of the given DataFrame.

    Only columns with a numeric dtype are used, so text columns in `data` do
    not cause the correlation computation to fail.

    data: pandas DataFrame to analyse. It is not modified.
    Returns nothing; prints a heading and shows the figure.
    """
    # Use the DataFrame passed in (not the notebook-level global) and keep
    # only numeric columns before computing correlations.
    numerical_data = data.select_dtypes(include=[np.number])

    corr = numerical_data.corr()
    print("Correlation Analysis:")
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr, annot=True, cmap='coolwarm', cbar=True, linewidths=0.5)
    plt.title('Correlation Matrix of Scores', fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()

correlation_matrix(heart_disease_data)


#Visualization

##Scatter Plot

In [None]:
# Creating a scatter plot for Age vs Maximum Heart Rate
def scatter_plot_age_thalch(data):
    """
    Generates a scatter plot of age against maximum heart rate, with points
    coloured by the value of the 'num' column.

    data: pandas DataFrame containing the columns 'age', 'thalch' and 'num'.
    Returns nothing; shows the figure.
    """
    plt.figure(figsize=(10, 6))
    # Plot the DataFrame that was passed in rather than the notebook-level
    # global, so the function works on any compatible DataFrame.
    sns.scatterplot(x='age', y='thalch', hue='num', data=data, palette='viridis')
    plt.title('Age vs Maximum Heart Rate with Heart Disease Status')
    plt.xlabel('Age')
    plt.ylabel('Maximum Heart Rate')
    # NOTE(review): the legend is relabelled with exactly two entries; if
    # 'num' takes more than two distinct values these labels will not match
    # the hue levels - confirm against the data.
    plt.legend(title='Heart Disease Status', labels=['No', 'Yes'])
    plt.grid(True)
    plt.show()
scatter_plot_age_thalch(heart_disease_data)

##Bar Plot

In [None]:
# Creating a bar chart for Heart Disease Presence by Age Group
def bar_chart_age_group(data):
    """
    Generates a bar chart of row counts per age group, split by the value of
    the 'num' column.

    data: pandas DataFrame containing the columns 'age' and 'num'.
    Side effect: adds (or overwrites) an 'age_group' column on `data`.
    Returns nothing; shows the figure.
    """
    # Bin ages into labelled groups. Intervals are right-closed, so an age
    # equal to a bin edge falls into the lower group (e.g. 30 -> '<30').
    # The last bin is open-ended so that ages above 80 are counted under
    # '70+' instead of becoming NaN and being dropped from the chart.
    age_bins = [0, 30, 40, 50, 60, 70, float('inf')]
    age_labels = ['<30', '30-40', '40-50', '50-60', '60-70', '70+']
    # Work on the DataFrame that was passed in, not the notebook-level global
    data['age_group'] = pd.cut(data['age'], bins=age_bins, labels=age_labels)

    plt.figure(figsize=(10, 6))
    sns.countplot(x='age_group', hue='num', data=data, palette='coolwarm')
    plt.title('Heart Disease Presence by Age Group')
    plt.xlabel('Age Group')
    plt.ylabel('Count')
    # NOTE(review): the legend is relabelled with exactly two entries; if
    # 'num' takes more than two distinct values these labels will not match
    # the hue levels - confirm against the data.
    plt.legend(title='Heart Disease Status', labels=['No', 'Yes'])
    plt.show()

bar_chart_age_group(heart_disease_data)

##Box Plot

In [None]:
# Creating a box plot for Cholesterol Levels by Heart Disease Status
def box_plot_chol(data):
    """
    Generates a box plot of cholesterol values ('chol') for each value of the
    'num' column, with separate boxes per value of the 'sex' column.

    data: pandas DataFrame containing the columns 'num', 'chol' and 'sex'.
    Returns nothing; shows the figure.
    """
    plt.figure(figsize=(10, 6))
    # Plot the DataFrame that was passed in rather than the notebook-level
    # global, so the function works on any compatible DataFrame.
    sns.boxplot(x='num', y='chol', data=data, palette='autumn', hue='sex')
    plt.title('Cholesterol Levels by Heart Disease Status in diffrent Gender')
    plt.xlabel('Heart Disease Status')
    plt.ylabel('Cholesterol Levels')
    plt.grid(True)
    plt.show()

box_plot_chol(heart_disease_data)


#Advanced Statistics

In [None]:
def statistical_analysis(data):
    """
    Prints the skewness and kurtosis of every numeric column in `data`.

    Non-numeric columns are ignored. Returns nothing.
    """
    # Restrict to numeric dtypes; skew/kurt are undefined for text columns
    numeric_only = data.select_dtypes(include=[np.number])
    skew_per_column = numeric_only.skew()
    kurt_per_column = numeric_only.kurt()

    print("Skewness:\n", skew_per_column)
    print("------------------------")
    print("Kurtosis:\n", kurt_per_column)


statistical_analysis(heart_disease_data)


In [None]:
 def independent_t_test(heart_disease_data):
    """
    Performs an independent t-test between cholesterol levels of individuals with and without
    heart disease to determine if there are statistically significant differences between
    the two groups.
    """
    # Splitting the data into two groups
    group_with_disease = heart_disease_data[heart_disease_data['num'] == 1]['chol']
    group_without_disease = heart_disease_data[heart_disease_data['num'] == 0]['chol']

    # Performing an independent t-test
    t_stat, p_value = stats.ttest_ind(group_with_disease, group_without_disease, equal_var=False)
    print(f"T-test results -- T-statistic: {t_stat}, P-value: {p_value}")
independent_t_test(heart_disease_data)

In [None]:
def chi_square_test(heart_disease_data):
    """
    Runs a chi-square test of independence on the 'sex' and 'num' columns of
    the given DataFrame and prints the Chi2 statistic and P-value.

    The test is computed from the cross-tabulation of the two columns.
    Returns nothing.
    """
    # Rows: values of 'sex'; columns: values of 'num'; cells: row counts
    sex_by_status = pd.crosstab(
        heart_disease_data['sex'],
        heart_disease_data['num'],
    )

    # Only the first two result fields (statistic and p-value) are reported
    outcome = chi2_contingency(sex_by_status)
    statistic = outcome[0]
    p_value = outcome[1]
    print(f"Chi-square test results -- Chi2: {statistic}, P-value: {p_value}")
chi_square_test(heart_disease_data)