In [None]:
import numpy as np
import pandas as pd

In [None]:
import pandas as pd
import zipfile

# Load the Framingham workbook straight out of the downloaded zip archive.
ARCHIVE_PATH = '/content/archive (3).zip'
EXCEL_SUFFIX = 'framingham_subamostrado.xlsx'

with zipfile.ZipFile(ARCHIVE_PATH) as zf:
  # Every member path stored in the archive
  file_list = zf.namelist()

  # Take the first member whose path ends with the expected workbook name and
  # fail with a readable message (instead of a bare IndexError) when none does.
  excel_file = next((file for file in file_list if file.endswith(EXCEL_SUFFIX)), None)
  if excel_file is None:
    raise FileNotFoundError(
        f"No member ending with {EXCEL_SUFFIX!r} found in {ARCHIVE_PATH!r}; "
        f"archive contains: {file_list}"
    )

  # Read the workbook into a DataFrame; the inner `with` closes the member
  # handle once pandas has finished reading it.
  with zf.open(excel_file) as excel_handle:
    df = pd.read_excel(excel_handle)

# Preview the first rows of the loaded data
print(df.head())

In [None]:
# Display the loaded DataFrame (the notebook renders the cell's last expression).
df

# **01.Data Preprocessing**

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Forward-fill each gap from the previous row, then back-fill anything still
# missing at the top of the frame (a forward fill cannot fill rows that have
# no predecessor). The dedicated ffill()/bfill() methods replace the deprecated
# fillna(method=...) spelling, and rebinding `df` avoids in-place mutation.
df = df.ffill().bfill()

In [None]:
# Re-count missing values per column to confirm the fill worked.
df.isna().sum()

In [None]:
import seaborn as sns
# Side-by-side box plots of the three clinical measurements.
clinical_columns = ['clinica_colest_total', 'clinica_pressao_si', 'clinica_pressao_di']
clinical_values = df[clinical_columns]
sns.boxplot(data=clinical_values)


# **02.Exploratory Data Analysis (EDA)**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise correlations between the numeric columns. numeric_only=True keeps
# this cell runnable even when the frame holds non-numeric columns, e.g. if it
# is re-executed after demo_sexo has been relabelled as strings.
correlation_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')


In [None]:
# Age distribution, drawn as a 20-bin histogram
age_values = df['demo_idade']
age_values.hist(bins=20)


In [None]:
# Total cholesterol distribution for each value of the risk label.
sns.boxplot(
    data=df,
    x='risco_doenca',
    y='clinica_colest_total',
)


# **03.Feature Engineering**

In [None]:
# Relabel the numeric sex codes with readable names.
# NOTE(review): the 0 -> Female / 1 -> Male coding is taken from the original
# cell; confirm it against the dataset's documentation.
# .replace (unlike .map) leaves values that are not in the mapping untouched,
# so re-running this cell is a no-op instead of turning the column into NaN.
SEX_LABELS = {0: 'Female', 1: 'Male'}
df['demo_sexo'] = df['demo_sexo'].replace(SEX_LABELS)


In [None]:
# Display the relabelled sex column to check the mapping result.
df['demo_sexo']

In [None]:
from scipy.stats import chi2_contingency
# Cross-tabulate sex against the risk label and run a chi-square test on the table.
contingency_table = pd.crosstab(index=df['demo_sexo'], columns=df['risco_doenca'])
chi2_result = chi2_contingency(contingency_table)
chi2, p, dof, expected = chi2_result
print(f"Chi-square test p-value: {p}")


# **04.Statistical Analysis**

In [None]:
from scipy.stats import ttest_ind
# Split total cholesterol by risk label (0 vs 1) and run an independent
# two-sample t-test on the two groups.
cholesterol = df['clinica_colest_total']
risk_label = df['risco_doenca']
group1 = cholesterol[risk_label == 0]
group2 = cholesterol[risk_label == 1]
t_stat, p_value = ttest_ind(group1, group2)
print(f"T-test p-value: {p_value}")


# **05.Modeling: Predicting Disease Risk**

In [None]:
# The risk label is the target; every other column is used as a predictor.
y = df['risco_doenca']
X = df.drop('risco_doenca', axis=1)


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Display the four pieces of the train/test split as a tuple.
X_train, X_test, y_train, y_test

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


# Encode the sex column as integers. The encoder is fitted on the training
# split only and reused for the test split, so both share one mapping.
encoder = LabelEncoder()

# .assign builds new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning. The column
# keeps its original position.
X_train = X_train.assign(demo_sexo=encoder.fit_transform(X_train['demo_sexo']))
X_test = X_test.assign(demo_sexo=encoder.transform(X_test['demo_sexo']))


from sklearn.linear_model import LogisticRegression

# Baseline classifier. The default limit of 100 solver iterations may stop
# before convergence on unscaled features, so allow more; results are
# unchanged whenever the solver already converged within the default limit.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with a fixed seed so repeated runs produce the same model.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Refresh y_pred so any evaluation that follows describes this model rather
# than the predictions left over from the previously fitted classifier.
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Print accuracy, the confusion matrix and the per-class report, in that
# order, for whatever predictions y_pred currently holds.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred))


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier




# Encode the sex column on the full feature matrix so it can be fed to the
# cross-validation below.
encoder = LabelEncoder()
X['demo_sexo'] = encoder.fit_transform(X['demo_sexo'])

# 5-fold cross-validation of a random forest; the fixed seed makes the fold
# scores repeatable from run to run.
model = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation scores:", scores)

# **06.Visualization**

In [None]:

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt # Make sure to import pyplot




# Fit on the training split only. Fitting on all of X (which contains the
# test rows) and then scoring X_test would leak the test labels into the
# model and inflate the ROC curve. The encoding and cross-validation that were
# repeated here from the previous cell are not needed for this plot.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predicted probability of the second class (label 1 for a 0/1 target) for
# each held-out row.
test_scores = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line: the curve of a classifier that guesses at random
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Pair each feature name with the fitted model's importance score, sort from
# most to least important, and draw the ranking as a horizontal bar chart.
feature_names = X.columns
importances = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
