In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import spearmanr, pointbiserialr, chi2_contingency
from sklearn.impute import SimpleImputer
from google.colab import files  # Needed for manual file upload in Google Colab

# Step 1: Upload CSV file manually
uploaded = files.upload()  # Opens file uploader; the result is keyed by file name
if not uploaded:
    # Nothing was selected (e.g. the dialog was cancelled): stop with a clear
    # message instead of the IndexError the lookup below would raise.
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")
file_path = list(uploaded.keys())[0]  # Only the first uploaded file is analysed

# Step 2: Read the dataset
data = pd.read_csv(file_path)

# Step 3: Display first few rows
print("First 5 rows of the dataset:")
print(data.head())

# Step 4: Ask for target column
target_column = input("Enter the target column name: ")
if target_column not in data.columns:
    # CSV headers may carry stray leading/trailing whitespace, so a name typed
    # without it would not match exactly. Retry while ignoring surrounding
    # whitespace on both sides, and accept the match only if it is unambiguous.
    typed_name = target_column.strip()
    matching_columns = [name for name in data.columns if str(name).strip() == typed_name]
    if len(matching_columns) != 1:
        raise KeyError(
            f"Target column {target_column!r} not found. "
            f"Available columns: {list(data.columns)}"
        )
    target_column = matching_columns[0]  # Use the exact header as stored in `data`

# Step 5: Convert binary categorical columns to numeric
for col in data.columns:
    column_values = data[col]
    if column_values.nunique() != 2:  # Only two-valued (binary) columns are converted
        continue

    as_numeric = pd.to_numeric(column_values, errors='coerce')
    if as_numeric.notna().sum() < column_values.notna().sum():
        # At least one label is not a number (e.g. "Y"/"N"), so coercion would
        # erase real observations. Encode the two labels as 0.0/1.0 instead;
        # category codes use -1 for missing entries, which is mapped back to NaN.
        label_codes = column_values.astype('category').cat.codes.astype('float64')
        as_numeric = label_codes.where(label_codes >= 0)

    # Fill remaining gaps with the most frequent value. The result is assigned
    # back to the frame: a chained `data[col].fillna(..., inplace=True)` acts on
    # a temporary object and is deprecated by pandas.
    data[col] = as_numeric.fillna(as_numeric.mode().iloc[0])

# Step 6: Handle missing values for numeric and categorical features
# Columns without a single observed value cannot be imputed; SimpleImputer
# silently drops them from its output, so the assignment back into the frame
# would fail on a column-count mismatch. Remove them up front.
data = data.dropna(axis=1, how='all')

numeric_cols = data.select_dtypes(include=['number']).columns
categorical_cols = data.select_dtypes(exclude=['number']).columns

if len(numeric_cols) > 0:
    # Numeric gaps are replaced by the column mean.
    numeric_imputer = SimpleImputer(strategy='mean')
    data[numeric_cols] = numeric_imputer.fit_transform(data[numeric_cols])

if len(categorical_cols) > 0:
    # Non-numeric gaps are replaced by the most frequent value of the column.
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

# Step 7: Compute correlations
# Each feature is compared with the target using a measure chosen by its type:
#   * numeric with exactly two distinct values -> point-biserial correlation
#   * any other numeric feature                -> Spearman rank correlation
#   * non-numeric feature                      -> chi-square statistic of the
#                                                 feature-by-target contingency table
if target_column not in data.columns:
    raise KeyError(f"Target column {target_column!r} not found in the dataset")

target_values = data[target_column]
correlation_results = {}  # feature name -> (method label, statistic)

for col in data.columns:
    if col == target_column:
        continue

    feature = data[col]
    # Accept every numeric dtype, not only float64/int64.
    feature_is_numeric = pd.api.types.is_numeric_dtype(feature)

    # The binary test must come before the generic numeric one: binary columns
    # are numeric as well and would otherwise never reach this branch.
    if feature_is_numeric and feature.nunique() == 2:  # Binary feature
        # Cast to float so boolean columns are handled like 0/1 columns.
        corr, _ = pointbiserialr(feature.astype('float64'), target_values)
        correlation_results[col] = ("Point Biserial", corr)

    elif feature_is_numeric:  # Continuous feature
        corr, _ = spearmanr(feature, target_values)
        correlation_results[col] = ("Spearman", corr)

    else:  # Categorical feature
        contingency_table = pd.crosstab(feature, target_values)
        chi2, _, _, _ = chi2_contingency(contingency_table)
        # NOTE: this is the raw chi-square statistic, not a value in [-1, 1].
        correlation_results[col] = ("Chi-Square", chi2)

# Step 8: Display correlation results
print("\nCorrelation Results:")
for col, (method, corr_value) in correlation_results.items():
    print(f"{col} -> {method} correlation: {corr_value:.4f}")


Saving PCOS_data.csv to PCOS_data (3).csv
First 5 rows of the dataset:
   PCOS (Y/N)   Age (yrs)  Weight (Kg)  Height(Cm)    BMI  Hb(g/dl)  \
0           0          28         44.6        152.0  19.3     10.48   
1           0          36         65.0        161.5  24.9     11.70   
2           1          33         68.8        165.0  25.3     11.80   
3           0          37         65.0        148.0  29.7     12.00   
4           0          25         52.0        161.0  20.1     10.00   

   Cycle(R/I)  Pregnant(Y/N)  No. of abortions    I   beta-HCG(mIU/mL)  ...  \
0           2              0                 0                    1.99  ...   
1           2              1                 0                   60.80  ...   
2           2              1                 0                  494.08  ...   
3           2              0                 0                    1.99  ...   
4           2              1                 0                  801.45  ...   

  Endometrium (mm)  Unnamed

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)  # Replace NaN with mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)  # Replace NaN with mode
The behavior will change in pandas 3.0. This inplace method will never work be

ValueError: Columns must be same length as key

In [None]:
# prompt: Spearman: between continuous data features and the target

import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import spearmanr, pointbiserialr, chi2_contingency
from sklearn.impute import SimpleImputer

# Assuming data is already loaded as 'data' and target column is 'target_column'

# ... (previous code remains the same)


# Step 7: Compute Spearman correlations for continuous features
# Every numeric column except the target is rank-correlated with the target.
correlation_results = {}  # feature name -> {'method', 'correlation', 'p-value'}

for col in data.columns:
    if col == target_column or not pd.api.types.is_numeric_dtype(data[col]):
        continue

    # Keep only rows where both the feature and the target are present, so a
    # few missing entries do not turn the whole coefficient into NaN.
    observed = data[[col, target_column]].dropna()

    if (
        len(observed) < 2
        or observed[col].nunique() < 2
        or observed[target_column].nunique() < 2
    ):
        # The coefficient is undefined for constant or near-empty input;
        # record NaN directly instead of triggering a scipy warning.
        corr, pval = float("nan"), float("nan")
    else:
        corr, pval = spearmanr(observed[col], observed[target_column])

    correlation_results[col] = {'method': 'Spearman', 'correlation': corr, 'p-value': pval}

# Step 8: Display correlation results
print("\nCorrelation Results:")
for col, result in correlation_results.items():
    print(f"{col}: {result['method']} correlation = {result['correlation']:.4f}, p-value = {result['p-value']:.4f}")



Correlation Results:
 Age (yrs): Spearman correlation = -0.1853, p-value = 0.0000
Weight (Kg): Spearman correlation = 0.1950, p-value = 0.0000
Height(Cm) : Spearman correlation = 0.0605, p-value = 0.1603
BMI: Spearman correlation = 0.1964, p-value = 0.0000
Hb(g/dl): Spearman correlation = 0.0967, p-value = 0.0245
Cycle(R/I): Spearman correlation = 0.4012, p-value = 0.0000
Pregnant(Y/N): Spearman correlation = -0.0276, p-value = 0.5223
No. of abortions: Spearman correlation = -0.0344, p-value = 0.4247
  I   beta-HCG(mIU/mL): Spearman correlation = 0.0784, p-value = 0.0685
FSH(mIU/mL): Spearman correlation = -0.1152, p-value = 0.0073
LH(mIU/mL): Spearman correlation = 0.0399, p-value = 0.3539
FSH/LH: Spearman correlation = -0.1183, p-value = 0.0059
Hip(inch): Spearman correlation = 0.1626, p-value = 0.0001
Waist(inch): Spearman correlation = 0.1749, p-value = 0.0000
TSH (mIU/L): Spearman correlation = 0.0157, p-value = 0.7149
PRL(ng/mL): Spearman correlation = 0.0230, p-value = 0.5928
V

  corr, pval = spearmanr(data[col], data[target_column])
