In [15]:
# Assignment 1 : 
#Implementation of the Data Science Lifecycle and 
#Data Type Classification Using the Titanic Dataset

In [16]:
# Importing necessary libraries:
# - pandas: for data manipulation and analysis
# - numpy: for numerical operations (used here if needed)



# Importing the pandas library and assigning it the alias 'pd'
# pandas is one of the most widely used libraries in data science for working with structured data.
# It provides two primary data structures:
# - Series: a one-dimensional array (like a column in Excel or a list with labels)
# - DataFrame: a two-dimensional table with labeled rows and columns (like a spreadsheet or SQL table)
# Common pandas features include:
# - Reading data from CSV, Excel, SQL, JSON, and more
# - Cleaning and preprocessing data (e.g., handling missing values, renaming columns)
# - Filtering, grouping, and aggregating data
# - Merging, joining, and reshaping datasets
# - Performing descriptive statistics and exporting clean data
import pandas as pd

# Importing the numpy library and assigning it the alias 'np'
# numpy is the foundational library for numerical and scientific computing in Python.
# It is especially useful for:
# - Creating and manipulating numerical arrays (1D, 2D, or multidimensional)
# - Performing fast element-wise operations and mathematical functions
# - Handling missing values using np.nan, np.isnan, np.nanmean, etc.
# - Generating random numbers (important in simulations, modeling, and ML)
# - Serving as a backend for other libraries like pandas, scikit-learn, TensorFlow, and more
# In data science, numpy is often used behind the scenes for speed and efficiency
import numpy as np

In [17]:
# ---------------------------------------------
# 1. DATA COLLECTION
# ---------------------------------------------

# Importing the Titanic dataset using pandas
# This step loads structured data from a CSV (Comma-Separated Values) file into a DataFrame.
# A DataFrame is similar to a table in Excel or a SQL database: it has rows (records) and columns (features).
# 'read_csv()' is one of pandas' most commonly used functions to read tabular data.
# NOTE:
# - Make sure the file path is correct and that the file exists at that location.
# - On Windows, use double backslashes (\\) or a raw string (prefix with 'r') to avoid errors with escape characters.
# - If the file is in the same directory as your script, you can simply use the filename (e.g., "Titanic.csv").
# The path is written as a raw string (r"...") so the backslashes of the Windows
# path are kept literally. In a normal string, sequences such as "\M" or "\T" are
# invalid escapes (a warning today, an error in future Python versions), and a
# folder name starting with 't' or 'n' would silently become a tab or newline.
df = pd.read_csv(r"C:\MITADT\ISDL\Lab 1\Titanic.csv")  # Load CSV into a DataFrame

# Display the first 5 rows of the dataset
# 'head()' is used to quickly inspect the data.
# This helps you:
# - Understand what kind of information is present (e.g., names, ages, ticket class, survival status)
# - Identify column names and types of data (numerical, categorical, text)
# - Spot any immediate issues like missing data, inconsistent formatting, or irrelevant columns
# It's a key step before proceeding with data cleaning or analysis.
print("First 5 rows of the dataset:")
print(df.head())  # Default shows 5 rows; you can pass a number (e.g., head(10)) to see more rows


First 5 rows of the dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

In [18]:
# ---------------------------------------------
# 2. DATA TYPE CLASSIFICATION
# ---------------------------------------------

# Displaying basic information about the dataset using df.info()
# This method provides a concise summary of the DataFrame, including:
# - Total number of entries (rows)
# - Number of non-null (non-missing) values in each column
# - Data type of each column (e.g., int64, float64, object)
# - Memory usage of the DataFrame

# WHY THIS IS IMPORTANT:
# - Helps identify which columns contain missing data
# - Reveals data types (very important for further processing):
#     • 'object' typically means text/categorical data
#     • 'int64' and 'float64' are numerical (used in calculations and modeling)
#     • 'bool' for binary values
#     • 'datetime64' if dates are present (can be parsed later if needed)
# - Useful for detecting unexpected types (e.g., a numeric-looking column loaded as object due to formatting issues)

print("\nData types and non-null counts:")
# df.info() writes its summary straight to stdout and returns None, so it is
# called on its own; wrapping it in print() would append a stray "None" line.
df.info()  # Outputs structure and metadata about the DataFrame



Data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [19]:
# ----------------------------------------------------
# Structure-based Classification:
# ----------------------------------------------------

# This is a structured dataset.
# Structured data is organized into rows and columns, much like a table in a database or a spreadsheet.
# Each column represents a feature (also called a variable or attribute),
# and each row represents an observation (or record, like a single passenger on the Titanic).
# Structured data is easily processed using pandas and is ideal for statistical analysis and machine learning models.

# Example of structured format:
# | PassengerId | Name        | Age | Sex   | Survived |
# |-------------|-------------|-----|-------|----------|
# | 1           | John Smith  | 22  | male  | 0        |

# ----------------------------------------------------
# Measurement Scale-based Classification (manual):
# ----------------------------------------------------

# Variables in datasets can also be classified by their *level of measurement* (or scale).
# There are four common types of scales:
# - Nominal: Categorical variables with no inherent order (e.g., Sex, Embarked)
# - Ordinal: Categorical variables with a clear order or ranking (e.g., Pclass = 1st, 2nd, 3rd)
# - Interval: Numeric data with meaningful differences, but no true zero (e.g., Temperature in Celsius)
# - Ratio: Numeric data with meaningful zero and ratios (e.g., Age, Fare)

# Heading for the manual measurement-scale classification of the columns.
scale_heading = "\nMeasurement Scale Classification of Each Column:"
print(scale_heading)




Measurement Scale Classification of Each Column:


In [20]:
# -------------------------------------------------------------
# Creating a Summary Table: Column Names, Data Types, and Scales
# -------------------------------------------------------------

# Step 1: Manually define the measurement scale for each column
# This uses domain knowledge of the Titanic dataset and data types.
# It's crucial for deciding how to preprocess each feature (e.g., encoding categorical variables).
# The code below defines a Python dictionary called measurement_scale, which manually classifies
# each column in the Titanic dataset by its statistical level of measurement.
# This classification helps in choosing the correct preprocessing, analysis, and modeling techniques.
# Mapping of column name -> manually assigned level of measurement.
# Written as (column, scale) pairs in the same order as the dataset's columns.
measurement_scale = dict([
    # Unique ID for each passenger: a label, not used for modeling.
    ('PassengerId', 'Nominal'),

    # Binary outcome (0 = No, 1 = Yes); treated as categorical for classification tasks.
    ('Survived', 'Nominal'),

    # Passenger class (1st > 2nd > 3rd): ranked categories with implied socioeconomic status.
    ('Pclass', 'Ordinal'),

    # Passenger names (text); not useful as-is for modeling, but titles may be extracted.
    ('Name', 'Nominal'),

    # male / female; needs encoding before modeling.
    ('Sex', 'Nominal'),

    # Continuous numeric value with a true zero; usable directly in math or scaling.
    ('Age', 'Ratio'),

    # Count of siblings/spouses aboard: whole numbers with a meaningful zero.
    ('SibSp', 'Ratio'),

    # Count of parents/children aboard: also a count with a true zero.
    ('Parch', 'Ratio'),

    # Ticket number/code; no meaningful numeric or ordered structure.
    ('Ticket', 'Nominal'),

    # Fare paid (continuous); usable for analysis, needs scaling for some models.
    ('Fare', 'Ratio'),

    # Cabin identifiers (e.g., C85, E46); textual and mostly missing, often dropped or simplified.
    ('Cabin', 'Nominal'),

    # Port of embarkation: C (Cherbourg), Q (Queenstown), S (Southampton); no natural order.
    ('Embarked', 'Nominal'),
])

# Step 2: Create a DataFrame that summarizes:
# - Column Name
# - Pandas-inferred Data Type (e.g., object, float64, int64)
# - Manually assigned Measurement Scale
# Create a Summary Table (column_info) of Titanic Dataset
# -------------------------------------------------------------
# This code creates a new DataFrame that summarizes key metadata
# for each column in your dataset: column name, data type, and
# measurement scale (Nominal, Ordinal, or Ratio).
# This helps in understanding how to process each column correctly.

# Gather the three pieces of per-column metadata first, then assemble the table.

# Column Name: every column label of the original DataFrame (df).
# Example: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', ...]
column_names = df.columns

# Data Type (Pandas): the dtype pandas inferred for each column, e.g.
# - int64: integers
# - float64: decimal numbers
# - object: strings (text)
# - category: optimized type for categorical variables
pandas_dtypes = df.dtypes.values

# Measurement Scale: looked up in the measurement_scale dictionary, one entry per column.
# - Nominal: categories without order (e.g., Sex, Embarked)
# - Ordinal: categories with order (e.g., Pclass)
# - Ratio: numeric data with a meaningful zero (e.g., Age, Fare)
# A column that is missing from the dictionary raises KeyError here.
scales = [measurement_scale[name] for name in column_names]

column_info = pd.DataFrame({
    'Column Name': column_names,
    'Data Type (Pandas)': pandas_dtypes,
    'Measurement Scale': scales,
})

# Why this table is useful:
# - Shows the structure of the data at a glance
# - Guides preprocessing (e.g., encoding, scaling, dropping)
# - Documents how each column is interpreted

# Display the summary table (heading and table are separated by a newline).
# It makes it easy to check which columns are numerical or categorical
# and to decide how to handle each one during preprocessing.
print("\n Summary of Column Types and Scales:", column_info, sep="\n")




 Summary of Column Types and Scales:
    Column Name Data Type (Pandas) Measurement Scale
0   PassengerId              int64           Nominal
1      Survived              int64           Nominal
2        Pclass              int64           Ordinal
3          Name             object           Nominal
4           Sex             object           Nominal
5           Age            float64             Ratio
6         SibSp              int64             Ratio
7         Parch              int64             Ratio
8        Ticket             object           Nominal
9          Fare            float64             Ratio
10        Cabin             object           Nominal
11     Embarked             object           Nominal


In [21]:
# ---------------------------------------------
# 3. DATA PREPROCESSING
# ---------------------------------------------

# a) Identifying Missing Values

# Missing values (NaNs) can cause errors or misleading results during analysis and modeling.
# It's important to find out which columns have missing data and how many entries are affected.
# The method 'isnull()' returns a DataFrame of the same shape as df,
# with True where values are missing (NaN) and False otherwise.
# By chaining 'sum()', we get the total count of missing values per column.

print("\nMissing values in each column:")
# isnull() flags every NaN cell; summing the flags per column gives the missing count.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Understanding missing data helps you decide:
# - Whether to remove rows or columns with missing data
# - Whether to fill missing values with statistics (mean, median, mode)
# - Whether to apply more advanced imputation methods or flag missingness




Missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [22]:
# ---------------------------------------------
# 3. DATA PREPROCESSING 
# ---------------------------------------------

# b) Handling Missing Values

# Fill missing values in 'Age' column with the median value of 'Age'
# Why median?
# - Age data is often skewed and may contain outliers.
# - Median is the middle value and is robust to outliers, unlike the mean.
# - Filling missing 'Age' values prevents losing rows during analysis and modeling.

# Clean column names first by stripping leading/trailing whitespace.
# Why first?
# - Every step below selects columns by name; an accidental space in a header
#   would otherwise raise a KeyError before the names were ever cleaned.
# - Ensures consistent column naming for reliable downstream processing.
df.columns = df.columns.str.strip()

# Fill missing 'Age' values with the median (robust to skew and outliers).
# The result is assigned back to the column instead of calling
# df['Age'].fillna(..., inplace=True): that form operates on a temporary Series,
# triggers a FutureWarning in pandas 2.x, and leaves df unchanged once
# Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing values in 'Embarked' column with the mode (most frequent category)
# Why mode for categorical data?
# - 'Embarked' represents categories (ports of embarkation: C, Q, S).
# - Replacing missing values with the most common category helps retain data without biasing much.
# mode() returns a Series of modes; [0] extracts the top mode value.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# The 'Cabin' column is mostly missing (687 of 891 rows, ~77%).
# Why drop it?
# - Imputing so many missing values may introduce noise or bias.
# - The column may not provide significant predictive power in its raw form.
# - Dropping reduces dimensionality and simplifies preprocessing.
# errors='ignore' keeps the cell re-runnable: a second run, when 'Cabin' is
# already gone, is a no-op instead of a KeyError.
df = df.drop(columns='Cabin', errors='ignore')

# c) Basic Data Cleaning and Formatting

# Convert 'Sex' and 'Embarked' columns to categorical data type.
# Benefits:
# - Saves memory compared to using 'object' dtype (strings).
# - Explicitly marks these columns as categorical, which helps some pandas functions and ML algorithms.
# - Some ML libraries detect 'category' dtype and optimize internally.
df['Sex'] = df['Sex'].astype('category')
df['Embarked'] = df['Embarked'].astype('category')

# Display the first few rows of the cleaned DataFrame to verify all preprocessing steps.
print("\nDataset after preprocessing:")
print(df.head())





Dataset after preprocessing:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Embarked  
0      0         A/5 21171   7.2500        S  
1      0          PC 17599  71.2833        C  
2      0  STON/O2. 3101282   7.9250        S  
3      0            113803  53.1000        S  
4      0            373450   8.0500        S  


In [23]:
# Sanity check: after preprocessing, the per-column count of missing values
# is printed under its heading (heading and counts separated by a newline).
print("\nRemaining missing values after preprocessing:", df.isnull().sum(), sep="\n")




Remaining missing values after preprocessing:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [24]:
# Show the dtype of every column as it stands after the cleaning steps.
print("\nUpdated Data Types after cleaning:")
updated_dtypes = df.dtypes
print(updated_dtypes)



Updated Data Types after cleaning:
PassengerId       int64
Survived          int64
Pclass            int64
Name             object
Sex            category
Age             float64
SibSp             int64
Parch             int64
Ticket           object
Fare            float64
Embarked       category
dtype: object
