In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, r2_score, confusion_matrix

# Load dataset
pima_df = pd.read_csv('./datasets/diabetes.csv')

# Quick overview of dataset
print(pima_df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only append a stray "None" line to the output.
pima_df.info()
# sample() is unseeded, so these five rows differ from run to run.
print(pima_df.sample(5))
print(pima_df.describe())

# Function to compute univariate statistics
def univariate_analysis(df):
    """Compute per-column descriptive statistics.

    Only numeric and boolean columns are summarised. Other columns (for
    example strings) are skipped rather than making the whole call fail,
    because mean, variance, skewness and kurtosis are not defined for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per summarised column, with the columns 'Mean', 'Median',
        'Mode', 'Variance', 'Standard Deviation', 'Skewness' and 'Kurtosis'.
        'Mode' is the first value returned by ``Series.mode()``, or NaN when
        the column has no non-missing values.
    """
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    def _first_mode(series):
        # mode() returns an empty Series for an all-missing column; indexing
        # it directly would raise, so fall back to NaN in that case.
        modes = series.mode()
        return modes.iloc[0] if not modes.empty else float('nan')

    stats_dict = {col: {
        'Mean': numeric_df[col].mean(),
        'Median': numeric_df[col].median(),
        'Mode': _first_mode(numeric_df[col]),
        'Variance': numeric_df[col].var(),
        'Standard Deviation': numeric_df[col].std(),
        'Skewness': numeric_df[col].skew(),
        'Kurtosis': numeric_df[col].kurtosis()
    } for col in numeric_df.columns}

    # The dict is keyed by column name, so transpose to get one row per column.
    return pd.DataFrame(stats_dict).T

# Build the univariate statistics table for the dataset and display it
univariate_results = univariate_analysis(df=pima_df)
print(univariate_results)

# Bivariate Analysis - Linear Regression
def linear_regression(df, predictors, target):
    """Fit an ordinary least squares model and print its summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the predictor and target columns.
    predictors : list of str
        Names of the columns used as regressors.
    target : str
        Name of the response column.

    Returns
    -------
    None
        The model summary is printed to stdout; nothing is returned.
    """
    regressors = df[predictors]
    response = df[target]

    # statsmodels' OLS does not add an intercept on its own, so a constant
    # column is added to the design matrix explicitly.
    design = sm.add_constant(regressors)
    fitted = sm.OLS(response, design).fit()

    print("\nBivariate Analysis - Linear Regression:")
    print(fitted.summary())

# Bivariate Analysis - Logistic Regression
def logistic_regression(df, predictors, target, *, test_size=0.2, random_state=42, max_iter=200):
    """Fit a logistic regression on a train split and report test accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the predictor and target columns.
    predictors : list of str
        Names of the feature columns.
    target : str
        Name of the class-label column.
    test_size : float or int, optional
        Passed to ``train_test_split`` as ``test_size``. Defaults to 0.2.
    random_state : int or None, optional
        Passed to ``train_test_split`` as ``random_state``. Defaults to 42.
    max_iter : int, optional
        Passed to ``LogisticRegression`` as ``max_iter``. Defaults to 200.

    Returns
    -------
    None
        The coefficients and the test-set accuracy are printed to stdout.
    """
    X = df[predictors]
    y = df[target]

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit on the training split only; the test split is used solely for scoring
    log_model = LogisticRegression(max_iter=max_iter)
    log_model.fit(X_train, y_train)
    predictions = log_model.predict(X_test)

    # Output coefficients and performance
    print("\nBivariate Analysis - Logistic Regression:")
    print("Logistic Regression Coefficients:\n", log_model.coef_)
    print("Accuracy on Test Set: ", accuracy_score(y_test, predictions))

# Multiple Regression Analysis
def multiple_regression(df, target):
    """Regress ``target`` on every other column of ``df`` with OLS.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns, apart from ``target``, are all used as regressors.
    target : str
        Name of the response column.

    Returns
    -------
    None
        The model summary is printed to stdout; nothing is returned.
    """
    regressors = df.drop(columns=[target])
    response = df[target]

    # Add a constant column so the fitted model includes an intercept term.
    design = sm.add_constant(regressors)
    fitted = sm.OLS(response, design).fit()

    print("\nMultiple Regression Analysis:")
    print(fitted.summary())

# Run analyses: the linear and logistic models use a fixed subset of
# predictors, while the multiple regression uses every non-target column.
target = 'Outcome'
predictors = ['Glucose', 'BMI', 'Age']

linear_regression(df=pima_df, predictors=predictors, target=target)
logistic_regression(df=pima_df, predictors=predictors, target=target)
multiple_regression(df=pima_df, target=target)


(768, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
239            0      104             76              0        0  18.4   
473            7      136             90              0        0  29.9   
28  