In [3]:
import pandas as pd

# Dataset 1: read the CSV from the working directory and print a univariate
# report. Each heading and its value are emitted by one print call,
# separated by a newline.
df1 = pd.read_csv("diabetes.csv")
print("First 5 rows of df1:", df1.head(), sep="\n")

print("\nColumns in df1:", df1.columns, sep="\n")

# Central tendency: per-column median, then the first row of the mode table.
print("\nMedian of df1:", df1.median(), sep="\n")

print("\nMode of df1:", df1.mode().iloc[0], sep="\n")

# One frequency table per column (distinct value -> number of rows).
print("\nFrequency counts for each column in df1:")
for col in df1.columns:
    print(f"{col}:", df1[col].value_counts(), sep="\n")

print("\nMean of df1:", df1.mean(), sep="\n")

# Spread and shape of each column's distribution.
print("\nVariance of df1:", df1.var(), sep="\n")

print("\nSkewness of df1:", df1.skew(), sep="\n")

print("\nKurtosis of df1:", df1.kurt(), sep="\n")

print("\nStandard Deviation of df1:", df1.std(), sep="\n")

# Univariate summary table for df1: one row per df1 column, one column per
# statistic. The (label, computation) pairs are evaluated top to bottom, which
# also fixes the column order of the resulting frame.
_summary_spec = (
    ('Mean', df1.mean),
    ('Median', df1.median),
    ('Mode', lambda: df1.mode().iloc[0]),  # first row of the mode table
    ('Variance', df1.var),
    ('Standard Deviation', df1.std),
    ('Skewness', df1.skew),
    ('Kurtosis', df1.kurt),
)
result_df = pd.DataFrame({label: compute() for label, compute in _summary_spec})
print("\nSummary statistics for df1:", result_df, sep="\n")

# Bivariate Analysis: Linear and Logistic Regression Modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Predictors are every column except the target; the target is the
# 'Diabetes_binary' column.
X = df1.drop(columns='Diabetes_binary')
y = df1['Diabetes_binary']

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
# Stratifying on the target keeps the proportion of each target value the same
# in the train and test partitions, so the held-out metrics are not distorted
# by an uneven draw of the classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Linear Regression
from sklearn.metrics import r2_score, mean_squared_error

# Fit on the training rows (fit returns the estimator itself), then predict
# the held-out rows.
linear = LinearRegression().fit(X_train, y_train)
linear_pred = linear.predict(X_test)

# Score the held-out predictions against the true target values.
r2 = r2_score(y_test, linear_pred)
mse = mean_squared_error(y_test, linear_pred)

print(
    f"\nLinear Regression - R2 Score: {r2:.4f}",
    f"Linear Regression - Mean Squared Error: {mse:.4f}",
    sep="\n",
)

# Standardize the data for Logistic Regression
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training rows only; the test rows are
# transformed with those same fitted parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression: train on the scaled training rows, classify the scaled
# held-out rows.
logistic = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)
logistic_pred = logistic.predict(X_test_scaled)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, logistic_pred)
print(f"\nLogistic Regression Accuracy: {accuracy:.4f}")

# Dataset 2 (Pima Indians Diabetes dataset): read the CSV from the working
# directory and preview the first rows.
df2 = pd.read_csv("pima_indian_diabetes.csv")
print("\nFirst 5 rows of df2:", df2.head(), sep="\n")

# Per-column count of null cells.
print("\nMissing values in df2:", df2.isnull().sum(), sep="\n")

# Univariate Analysis for df2: one row per df2 column, one column per
# statistic. The (label, computation) pairs are evaluated top to bottom, which
# also fixes the column order of the resulting frame.
_summary_spec_df2 = (
    ('Mean', df2.mean),
    ('Median', df2.median),
    ('Mode', lambda: df2.mode().iloc[0]),  # first row of the mode table
    ('Variance', df2.var),
    ('Standard Deviation', df2.std),
    ('Skewness', df2.skew),
    ('Kurtosis', df2.kurt),
)
result_df2 = pd.DataFrame({label: compute() for label, compute in _summary_spec_df2})
print("\nSummary statistics for df2:", result_df2, sep="\n")

# Bivariate Analysis: Linear and Logistic Regression Modeling for df2
# NOTE: X, y and the four split variables are rebound here, replacing the
# values that were derived from df1 earlier in this cell.
X = df2.drop(columns='Outcome')
y = df2['Outcome']

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
# Stratifying on the target keeps the proportion of each target value the same
# in the train and test partitions, so the held-out metrics are not distorted
# by an uneven draw of the classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Linear Regression for df2
# Fit on the training rows (fit returns the estimator itself), then predict
# the held-out rows.
linear_df2 = LinearRegression().fit(X_train, y_train)
linear_pred_df2 = linear_df2.predict(X_test)

# Score the held-out predictions against the true target values.
r2_df2 = r2_score(y_test, linear_pred_df2)
mse_df2 = mean_squared_error(y_test, linear_pred_df2)

print(
    f"\nLinear Regression - R2 Score (df2): {r2_df2:.4f}",
    f"Linear Regression - Mean Squared Error (df2): {mse_df2:.4f}",
    sep="\n",
)

# Standardize the data for Logistic Regression
# A fresh scaler is bound to `scaler` (replacing the earlier one); it is
# fitted on the df2 training rows only and then applied to the df2 test rows.
scaler = StandardScaler()
X_train_scaled_df2 = scaler.fit_transform(X_train)
X_test_scaled_df2 = scaler.transform(X_test)

# Logistic Regression for df2: train on the scaled training rows, classify
# the scaled held-out rows.
logistic_df2 = LogisticRegression(max_iter=500).fit(X_train_scaled_df2, y_train)
logistic_pred_df2 = logistic_df2.predict(X_test_scaled_df2)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy_df2 = accuracy_score(y_test, logistic_pred_df2)
print(f"\nLogistic Regression Accuracy (df2): {accuracy_df2:.4f}")


First 5 rows of df1:
   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0   