In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
# Step 1: Load the Dataset
file_path = r"pcos_dataset.csv"
students_df = pd.read_csv(file_path)

In [7]:
# Display the first few rows of the dataset to check the structure
print("Original Data:")
print(students_df)

Original Data:
     Age   BMI  Menstrual_Irregularity  Testosterone_Level(ng/dL)  \
0     24  34.7                       1                       25.2   
1     37  26.4                       0                       57.1   
2     32  23.6                       0                       92.7   
3     28  28.8                       0                       63.1   
4     25  22.1                       1                       59.8   
..   ...   ...                     ...                        ...   
995   34  18.4                       1                       95.7   
996   45  28.9                       1                       28.5   
997   37  28.3                       0                       32.4   
998   41  27.3                       0                       95.6   
999   22  21.9                       1                       78.9   

     Antral_Follicle_Count  PCOS_Diagnosis  
0                       20               0  
1                       25               0  
2                    

In [3]:
# Step 2: Create a new column "Pass/Fail"
students_df['Pass/Fail'] = students_df['Grade'].apply(lambda x: 'Pass' if x >= 50 else 'Fail')

In [4]:
# Step 3: Normalise the Attendance column to a 0–1 scale
students_df['Normalized_Attendance'] = students_df['Attendance'] / 100

In [5]:
# Step 4: Create a new column "Name_Length" with the length of each student's name
students_df['Name_Length'] = students_df['Name'].apply(len)

In [6]:
# Display the updated DataFrame with new columns
print("\nUpdated Data with New Columns:")
print(students_df.head())



Updated Data with New Columns:
    Name  Grade  Attendance  Gender Pass/Fail  Normalized_Attendance  \
0  Henry     78   59.886547    Male      Pass               0.598865   
1    Eve     91   69.478493  Female      Pass               0.694785   
2   Jane     68   88.046784  Female      Pass               0.880468   
3    Bob     54   62.435405    Male      Pass               0.624354   
4    Ivy     82   29.563369  Female      Pass               0.295634   

   Name_Length  
0            5  
1            3  
2            4  
3            3  
4            3  


In [7]:
# Step 5: Group data by Gender and calculate mean Grade and total Attendance
gender_grouped = students_df.groupby('Gender').agg({'Grade': 'mean', 'Attendance': 'sum'}).reset_index()
print("\nGrouped Data by Gender (Mean Grade and Total Attendance):")
print(gender_grouped)


Grouped Data by Gender (Mean Grade and Total Attendance):
   Gender      Grade   Attendance
0  Female  70.262626  4956.847837
1    Male  68.405941  5015.884293


In [8]:
# Step 6: Create a Pivot Table to show the average Grade and Attendance for each Gender
pivot_table = pd.pivot_table(students_df, values=['Grade', 'Attendance'], index='Gender', aggfunc={'Grade': 'mean', 'Attendance': 'mean'})
print("\nPivot Table (Average Grade and Attendance by Gender):")
print(pivot_table)


Pivot Table (Average Grade and Attendance by Gender):
        Attendance      Grade
Gender                       
Female   50.069170  70.262626
Male     49.662221  68.405941


In [9]:
# Optional: Save the transformed dataset to a new CSV file
output_file_path = r"transformed_students_data.csv"
students_df.to_csv(output_file_path, index=False)

In [10]:
print("\nTransformed dataset has been saved to:", output_file_path)


Transformed dataset has been saved to: transformed_students_data.csv


In [11]:
import pandas as pd

# Sample dataset
data = {
    'Name': ['John', 'Alice', 'Bob', 'Jane', 'Charlie'],
    'Attendance': [60, 80, 90, 50, 70]
}
df = pd.DataFrame(data)

# Normalize the 'Attendance' column to 0-1 range
df['Normalized_Attendance'] = (df['Attendance'] - df['Attendance'].min()) / (df['Attendance'].max() - df['Attendance'].min())

# Display the result
print(df)

      Name  Attendance  Normalized_Attendance
0     John          60                   0.25
1    Alice          80                   0.75
2      Bob          90                   1.00
3     Jane          50                   0.00
4  Charlie          70                   0.50
