In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Task 1

In [3]:
# Load the Iris dataset; the relative path means 'iris.csv' must sit in the
# notebook's current working directory.
df = pd.read_csv('iris.csv')
# Preview the first rows to sanity-check the column names and values.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


## Identifying and Imputing Missing Data 
     • Locate Missing Data: Examine the dataset to locate any missing values. Identify the columns with missing data and report how many missing values are present in each column. 
     • Handle Missing Data in Numerical Columns: Fill in missing values for numeric columns (i.e., sepal_length, sepal_width, petal_length, petal_width — named SepalLengthCm, SepalWidthCm, PetalLengthCm, and PetalWidthCm in this dataset) using the median value of each column. Justify why you chose this approach. 
     • Handle Missing Data in Categorical Columns: Identify if there are missing values in the species column. If so, impute them with the most frequent value (mode) in the column.

In [4]:
# Count the missing values in every column *before* imputing, so the report
# reflects the raw data.
missing_data = df.isnull().sum()
print("Missing values per column:\n",missing_data)

# Numeric features: fill gaps with the column median. The median is preferred
# over the mean because it is not pulled by outliers or skewed measurements,
# so the imputed value stays representative of a typical flower.
#
# The filled column is assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on a temporary
# Series (chained assignment), triggers a FutureWarning on pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is active.
for col in ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)

# Categorical label: only impute when something is actually missing, using
# the most frequent species (first entry of mode() if there is a tie).
if df['Species'].isnull().any():
    mode_value = df['Species'].mode()[0]
    df['Species'] = df['Species'].fillna(mode_value)

Missing values per column:
 Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


## Task 2

## Data Integrity and Transformation 
    • Remove Duplicate Records: Review the dataset for duplicate rows (where all values in a row are identical) and remove any duplicates found. Ensure that only one unique record per flower remains in the dataset. 
    • Feature Engineering: Create a new feature called total_area by adding the areas of both the sepal and petal. To do this, create separate columns for the sepal area and petal area and then add them to form the total_area column. 
    • Handling Missing Values Again: After imputing missing data, inspect the dataset again and drop any rows that still have missing values in any of the columns. 

In [6]:
# Remove fully identical rows, then derive the area features in one chain.
# Each lambda receives the frame built so far, so `total_area` can use the
# two area columns created just before it.
df = (
    df.drop_duplicates()
      .assign(
          sepal_area=lambda frame: frame['SepalLengthCm'] * frame['SepalWidthCm'],
          petal_area=lambda frame: frame['PetalLengthCm'] * frame['PetalWidthCm'],
          total_area=lambda frame: frame['sepal_area'] + frame['petal_area'],
      )
)

# Safety net after imputation: drop any row that still contains a missing value.
rows_still_incomplete = df.isnull().values.any()
if rows_still_incomplete:
    df = df.dropna()

In [7]:
# Bare expression: renders the current contents of `df` as the cell output.
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,sepal_area,petal_area,total_area
0,1,5.1,3.5,1.4,0.2,Iris-setosa,17.85,0.28,18.13
1,2,4.9,3.0,1.4,0.2,Iris-setosa,14.70,0.28,14.98
2,3,4.7,3.2,1.3,0.2,Iris-setosa,15.04,0.26,15.30
3,4,4.6,3.1,1.5,0.2,Iris-setosa,14.26,0.30,14.56
4,5,5.0,3.6,1.4,0.2,Iris-setosa,18.00,0.28,18.28
...,...,...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica,20.10,11.96,32.06
146,147,6.3,2.5,5.0,1.9,Iris-virginica,15.75,9.50,25.25
147,148,6.5,3.0,5.2,2.0,Iris-virginica,19.50,10.40,29.90
148,149,6.2,3.4,5.4,2.3,Iris-virginica,21.08,12.42,33.50


## Task 3

## Aggregation and Data Transformation 
    • Numerical Conversion of Categorical Data: Convert the species column, which is categorical, into a numerical format by assigning each species a unique number (e.g., 0, 1, 2). 
    • Apply Grouped Aggregation: Using the transformed data, group the flowers by species and calculate the total sum of the numeric columns (sepal_length, sepal_width, petal_length, petal_width). Present the results in a table that shows the sum of each feature per species.

In [9]:
# Encode each species label as an integer code via a pandas categorical.
species_categorical = df['Species'].astype('category')
df['species_numeric'] = species_categorical.cat.codes

# Sum the four raw measurements within each encoded species group.
measurement_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
grouped_by_species = df.groupby('species_numeric')
aggregation = grouped_by_species[measurement_columns].sum()

print("Grouped Aggregation (Sum of Features by Species):")
print(aggregation)

Grouped Aggregation (Sum of Features by Species):
                 SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
species_numeric                                                          
0                        250.3         170.9           73.2          12.2
1                        296.8         138.5          213.0          66.3
2                        329.4         148.7          277.6         101.3


## Task 4

## Data Reshaping 
    • Reshape the Dataset into a Long Format: Reshape the dataset so that each flower's attributes (sepal length, sepal width, etc.) are stacked in a single column, with a new column indicating the attribute type (e.g., sepal_length, sepal_width, etc.).

In [11]:
# Columns whose values get stacked into the single `value` column.
stacked_columns = [
    'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
    'sepal_area', 'petal_area', 'total_area',
]

# Wide -> long: one row per (flower, attribute) pair, keeping the species
# label and its numeric code as identifiers.
# NOTE(review): `Id` is not among id_vars, so a long-format row cannot be
# traced back to an individual flower — confirm this is intended.
df_long = df.melt(
    id_vars=['Species', 'species_numeric'],
    value_vars=stacked_columns,
    var_name='attribute',
    value_name='value',
)

print("Reshaped Dataset (Long Format):")
print(df_long.head())


# Persist the cleaned wide table, without the index column.
cleaned_filename = 'iris_cleaned.csv'
df.to_csv(cleaned_filename, index=False)
print(f"Cleaned dataset saved to {cleaned_filename}")

# Persist the long-format table the same way.
reshaped_filename = 'iris_long_format.csv'
df_long.to_csv(reshaped_filename, index=False)
print(f"Reshaped dataset saved to {reshaped_filename}")


Reshaped Dataset (Long Format):
       Species  species_numeric      attribute  value
0  Iris-setosa                0  SepalLengthCm    5.1
1  Iris-setosa                0  SepalLengthCm    4.9
2  Iris-setosa                0  SepalLengthCm    4.7
3  Iris-setosa                0  SepalLengthCm    4.6
4  Iris-setosa                0  SepalLengthCm    5.0
Cleaned dataset saved to iris_cleaned.csv
Reshaped dataset saved to iris_long_format.csv
