## **PCOS Analysis**

In [2]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Read the cleaned dataset from disk and preview its first two rows
df = pd.read_csv(filepath_or_buffer='Dataset/Cleaned-Data.csv')
df.head(n=2)


Unnamed: 0,Age,Weight_kg,Height_ft,Marital_Status,PCOS,Family_History_PCOS,Menstrual_Irregularity,Hormonal_Imbalance,Hyperandrogenism,Hirsutism,...,Diet_Multivitamin,Vegetarian,Exercise_Frequency,Exercise_Type,Exercise_Duration,Sleep_Hours,Stress_Level,Smoking,Exercise_Benefit,PCOS_Medication
0,20-25,66.0,157.48,Unmarried,No,No,Yes,No,No,No,...,0,No,Rarely,"Cardio (e.g., running, cycling, swimming)",30 minutes,Less than 6 hours,No,No,Somewhat,No.
1,Less than 20,56.0,165.1,Unmarried,No,No,No,No,No,No,...,0,No,Daily,No Exercise,Less than 30 minutes,6-8 hours,No,No,Somewhat,No.


In [4]:
# Shape of the dataset as a (rows, columns) tuple
df.shape


(173, 36)

In [5]:
# Count the missing (NaN/None) entries in each column
df.isna().sum()

Age                           0
Weight_kg                     0
Height_ft                     0
Marital_Status                0
PCOS                          0
Family_History_PCOS           0
Menstrual_Irregularity        0
Hormonal_Imbalance            0
Hyperandrogenism              0
Hirsutism                     0
Mental_Health                 0
Conception_Difficulty         0
Insulin_Resistance            0
Diabetes                      0
Childhood_Trauma              0
Cardiovascular_Disease        0
Diet_Bread_Cereals            0
Diet_Milk_Products            0
Diet_Fruits                   0
Diet_Vegetables               0
Diet_Starchy_Vegetables       0
Diet_NonStarchy_Vegetables    0
Diet_Fats                     0
Diet_Sweets                   0
Diet_Fried_Food               0
Diet_Tea_Coffee               0
Diet_Multivitamin             0
Vegetarian                    0
Exercise_Frequency            0
Exercise_Type                 0
Exercise_Duration             0
Sleep_Ho

In [6]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep='first').sum()

np.int64(0)

In [7]:
# Print the column names, non-null counts and dtypes of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         173 non-null    object 
 1   Weight_kg                   173 non-null    float64
 2   Height_ft                   173 non-null    float64
 3   Marital_Status              173 non-null    object 
 4   PCOS                        173 non-null    object 
 5   Family_History_PCOS         173 non-null    object 
 6   Menstrual_Irregularity      173 non-null    object 
 7   Hormonal_Imbalance          173 non-null    object 
 8   Hyperandrogenism            173 non-null    object 
 9   Hirsutism                   173 non-null    object 
 10  Mental_Health               173 non-null    object 
 11  Conception_Difficulty       173 non-null    object 
 12  Insulin_Resistance          173 non-null    object 
 13  Diabetes                    173 non

In [8]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Weight_kg,Height_ft,Diet_Bread_Cereals,Diet_Milk_Products,Diet_Fruits,Diet_Vegetables,Diet_Starchy_Vegetables,Diet_NonStarchy_Vegetables,Diet_Fats,Diet_Sweets,Diet_Fried_Food,Diet_Tea_Coffee,Diet_Multivitamin
count,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0,173.0
mean,56.549711,160.636647,5.445087,3.647399,3.034682,3.439306,3.069364,2.450867,4.531792,3.618497,3.179191,4.549133,1.393064
std,12.025369,7.287373,2.116832,2.54893,2.048527,1.769596,1.857054,1.709808,2.293919,2.255253,2.019448,2.785795,2.32688
min,36.0,124.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49.0,157.48,4.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,0.0
50%,55.0,160.02,7.0,3.0,3.0,3.0,3.0,2.0,5.0,3.0,3.0,6.0,0.0
75%,64.0,165.1,7.0,7.0,4.0,5.0,4.0,3.0,7.0,6.0,5.0,7.0,2.0
max,115.0,182.88,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


### **Label Encoding of Categorical Variables**

In [9]:
# Label-encode every categorical (object-dtype) column into integer codes.
#
# A fresh LabelEncoder is fitted per column and stored in `label_encoders`, so
# the code -> label mapping of every column can be inspected (`.classes_`) or
# undone (`.inverse_transform`). Refitting one shared encoder inside the loop
# kept only the mapping of the last column processed.
#
# NOTE: LabelEncoder numbers the labels in sorted order, so the codes carry no
# ordinal meaning for ordered categories such as Age or Exercise_Frequency.
categorical_cols = df.select_dtypes(include=['object']).columns

# Encode into a copy so the raw frame `df` keeps its original string labels.
data_encoded = df.copy()

label_encoders = {}
for col in categorical_cols:
    # As before, `label_encoder` ends up bound to the last column's encoder.
    label_encoder = LabelEncoder()
    data_encoded[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

In [10]:
# Feature scaling: min-max scale the numeric columns of the encoded frame.
# The numeric columns are selected from the raw frame `df`, where the
# categorical columns are still strings and are therefore left out.
numerical_cols = df.select_dtypes(include='number').columns
scaler = MinMaxScaler()

data_encoded[numerical_cols] = scaler.fit_transform(data_encoded[numerical_cols])

In [11]:
# Show the first five rows of the encoded and scaled frame
data_encoded.head(n=5)

Unnamed: 0,Age,Weight_kg,Height_ft,Marital_Status,PCOS,Family_History_PCOS,Menstrual_Irregularity,Hormonal_Imbalance,Hyperandrogenism,Hirsutism,...,Diet_Multivitamin,Vegetarian,Exercise_Frequency,Exercise_Type,Exercise_Duration,Sleep_Hours,Stress_Level,Smoking,Exercise_Benefit,PCOS_Medication
0,0,0.379747,0.565217,1,0,0,1,0,0,0,...,0.0,0,4,0,0,2,0,0,2,5
1,5,0.253165,0.695652,1,0,0,0,0,0,0,...,0.0,0,2,9,2,0,0,0,2,5
2,5,0.670886,0.73913,1,0,1,0,0,0,2,...,0.0,0,4,0,2,0,1,0,2,5
3,0,0.240506,0.608696,1,0,1,0,2,0,2,...,0.142857,0,3,9,4,0,1,0,2,5
4,5,0.240506,0.608696,1,0,0,0,0,0,0,...,0.0,0,2,0,1,0,1,0,1,5


### **Feature Engineering**

In [13]:
# BMI column
#
# Despite its name, `Height_ft` holds centimetres (the summary statistics show
# values between roughly 124 and 183), so metres are obtained by dividing by
# 100. Multiplying by 0.3048 (the feet -> metres factor) produced heights of
# about 48 "metres".
CM_PER_M = 100
df['Height_m'] = df['Height_ft'] / CM_PER_M
df['BMI'] = df['Weight_kg'] / (df['Height_m'] ** 2)  # kg / m^2

# As before, the stored BMI column is the min-max normalised value, not the raw
# kg/m^2 figure. A dedicated scaler is used so the shared `scaler` fitted on the
# numeric feature columns is not refitted by this cell.
bmi_scaler = MinMaxScaler()
df['BMI'] = bmi_scaler.fit_transform(df[['BMI']]).ravel()

In [14]:
# Diet Score: weighted sum of the Diet_* intake columns. A positive weight
# rewards a food group, a negative weight penalises it.
#
# Every food group carries exactly one weight. `Diet_Fried_Food` used to be
# added twice (-2 and again -1), which silently gave it an effective weight of
# -3, heavier than any other group.
# NOTE(review): -2 (the same as Diet_Sweets) is assumed to be the intended
# weight for fried food; confirm.
diet_weights = {
    'Diet_Fruits': 2,
    'Diet_Vegetables': 2,
    'Diet_Bread_Cereals': 1,
    'Diet_Milk_Products': 1,
    'Diet_Sweets': -2,
    'Diet_Fried_Food': -2,
    'Diet_NonStarchy_Vegetables': 2,
    'Diet_Starchy_Vegetables': -1,
    'Diet_Fats': -1,
    'Diet_Tea_Coffee': -1,
    'Diet_Multivitamin': 2,
}
df['Diet_Score'] = sum(weight * df[column] for column, weight in diet_weights.items())

# The normalised score is written to the encoded frame; `df` keeps the raw score.
# Note that this refits the shared `scaler` on Diet_Score alone.
data_encoded['Normalized_Diet_Score'] = scaler.fit_transform(df[['Diet_Score']])


In [15]:
# Sleep Score: 3 for 6-8 hours, 2 for 9-12 hours, 1 for either extreme.
# Series.map leaves NaN for any label that is not a key of the mapping.
sleep_score_map = dict([
    ('6-8 hours', 3),
    ('9-12 hours', 2),
    ('Less than 6 hours', 1),
    ('More than 12 hours', 1),
])
df['Sleep_Score'] = df['Sleep_Hours'].map(sleep_score_map)


In [16]:
# Inspect the labels (and their counts) of the two exercise columns before scoring them
print(
    df['Exercise_Frequency'].value_counts(),
    df['Exercise_Duration'].value_counts(),
    sep='\n',
)

Exercise_Frequency
Rarely              80
1-2 Times a Week    28
Never               26
3-4 Times a Week    20
Daily               19
Name: count, dtype: int64
Exercise_Duration
Not Applicable          68
Less than 30 minutes    51
30 minutes              28
30 minutes to 1 hour    18
More than 30 minutes     8
Name: count, dtype: int64


In [17]:
# Exercise Frequency Score: 0 (Never) up to 4 (Daily)
exercise_frequency_map = {
    'Rarely': 1,
    'Never': 0,
    '1-2 Times a Week': 2,
    '3-4 Times a Week': 3,
    'Daily': 4,
}
df['Exercise_Frequency_Score'] = df['Exercise_Frequency'].map(exercise_frequency_map)


In [18]:
# Exercise Duration Score: 0 (Not Applicable) up to 4
# NOTE(review): 'More than 30 minutes' is scored above '30 minutes to 1 hour'
# although the two labels overlap; confirm the intended ordering.
exercise_duration_score = {
    'Not Applicable': 0,
    'Less than 30 minutes': 1,
    '30 minutes': 2,
    '30 minutes to 1 hour': 3,
    'More than 30 minutes': 4,
}
df['Exercise_Duration_Score'] = df['Exercise_Duration'].map(exercise_duration_score)

In [19]:
# Total Exercise Score: frequency score plus duration score
df['Exercise_Score'] = df['Exercise_Frequency_Score'].add(df['Exercise_Duration_Score'])

In [20]:
# Healthy Lifestyle Score: diet, exercise and sleep scores added on their raw
# (unnormalised) scales
df['Healthy_Lifestyle_Score'] = (
    df['Diet_Score']
    .add(df['Exercise_Score'])
    .add(df['Sleep_Score'])
)

In [21]:
# List all column labels, including the engineered feature columns added to `df`
df.columns

Index(['Age', 'Weight_kg', 'Height_ft', 'Marital_Status', 'PCOS',
       'Family_History_PCOS', 'Menstrual_Irregularity', 'Hormonal_Imbalance',
       'Hyperandrogenism', 'Hirsutism', 'Mental_Health',
       'Conception_Difficulty', 'Insulin_Resistance', 'Diabetes',
       'Childhood_Trauma', 'Cardiovascular_Disease', 'Diet_Bread_Cereals',
       'Diet_Milk_Products', 'Diet_Fruits', 'Diet_Vegetables',
       'Diet_Starchy_Vegetables', 'Diet_NonStarchy_Vegetables', 'Diet_Fats',
       'Diet_Sweets', 'Diet_Fried_Food', 'Diet_Tea_Coffee',
       'Diet_Multivitamin', 'Vegetarian', 'Exercise_Frequency',
       'Exercise_Type', 'Exercise_Duration', 'Sleep_Hours', 'Stress_Level',
       'Smoking', 'Exercise_Benefit', 'PCOS_Medication', 'Height_m', 'BMI',
       'Diet_Score', 'Sleep_Score', 'Exercise_Frequency_Score',
       'Exercise_Duration_Score', 'Exercise_Score', 'Healthy_Lifestyle_Score'],
      dtype='object')

In [22]:
# Write the frame, including the engineered columns, to a new CSV without the index column
df.to_csv(path_or_buf='new_data.csv', index=False)


In [24]:
# Reload the exported file to confirm that the CSV round-trips.
df2 = pd.read_csv('new_data.csv')

# Bare `df2.columns` / `df2.dtypes` expressions in the middle of a cell display
# nothing (only the last expression of a cell is rendered), so the column check
# is made explicit instead.
assert list(df2.columns) == list(df.columns), 'column mismatch after CSV round-trip'
df2.shape

(173, 44)