# Practice

### Dataset for practice

In [1]:
import pandas as pd
import numpy as np

np.random.seed(123)

# Generate numerical columns with NaN values
num1 = np.random.randn(500)
num2 = np.random.uniform(10, 50, size=500)
num3 = np.random.randint(1, 6, size=500).astype('float')
num1[np.random.choice(500, 60, replace=False)] = np.nan
num2[np.random.choice(500, 60, replace=False)] = np.nan
num3[np.random.choice(500, 60, replace=False)] = np.nan

# Generate messy categorical/text columns
cat1 = np.random.choice(['apple', 'banana', 'grape', np.nan], size=500, p=[0.3, 0.3, 0.3, 0.1])
cat2 = np.random.choice(['A', 'B', 'C', 'D'], size=500)
cat3 = np.random.choice(['Yes', 'No', np.nan], size=500, p=[0.45, 0.45, 0.10])

# Combine into DataFrame and add untidiness
df_untidy = pd.DataFrame({
    'Score': num1,
    'Height_cm': num2,
    'Rating': num3,
    'Fruit': cat1,
    'Group': cat2,
    'IsActive': cat3
})

# Add untidy issues:
df_untidy.loc[df_untidy.sample(frac=0.15, random_state=1).index, 'Height_cm'] = \
    df_untidy['Height_cm'].dropna().astype(str) + 'cm'   # Mix data type in Height_cm

df_untidy.loc[df_untidy.sample(frac=0.15, random_state=2).index, 'Rating'] = \
    'Rating: ' + df_untidy['Rating'].dropna().astype(str) # Prefix string for some ratings

df_untidy.head()


 '47.7445897977863cm' '12.199264419087633cm' '28.255294989080216cm'
 '49.46319556401813cm' '24.62082473109778cm' '14.069842039264948cm'
 '30.047591231707173cm' '39.596908878071915cm' '47.903273405497885cm'
 '18.383678364591226cm' '20.573119172482315cm' '26.918929348570554cm'
 '49.60956000856316cm' '42.872544601068384cm' '47.28354972175643cm' nan
 nan '36.95711914375434cm' nan '25.124837252276944cm'
 '41.69467488795425cm' '28.67398840853555cm' '31.546329651552767cm'
 '14.629781437963878cm' '10.509502547356245cm' '18.213990989134608cm'
 '24.690622519316705cm' '23.881006122675878cm' '16.30134384410546cm' nan
 '29.66207244015815cm' nan '45.91010976816148cm' nan
 '31.599083317303908cm' '25.60501244185592cm' '43.06120649429532cm' nan
 '12.847775069640711cm' '12.664116985961948cm' '47.658679896221cm'
 '33.2449203552472cm' '39.08764020242252cm' '28.33330050691903cm'
 '22.962576174780793cm' '27.894375469792145cm' '32.038131009330215cm'
 '14.575393291450958cm' nan nan '11.712756967808463cm' nan


Unnamed: 0,Score,Height_cm,Rating,Fruit,Group,IsActive
0,-1.085631,,2.0,banana,D,Yes
1,0.997345,16.480034,Rating: 5.0,apple,A,No
2,0.282978,49.244711,,banana,B,No
3,-1.506295,,3.0,grape,D,
4,-0.5786,31.599083317303908cm,,banana,C,No


- Q1. Identify columns with missing values and demonstrate at least two methods for imputing or filling these missing values (e.g., mean for numerics, mode for categoricals).

In [2]:
# Count missing values in each column
print(df_untidy.isnull().sum())


Score        60
Height_cm    60
Rating       60
Fruit         0
Group         0
IsActive      0
dtype: int64


- Q2.  Identify columns with non-numeric (categorical) data and convert them into a numeric format using encoding techniques such as one-hot encoding or label encoding.

In [3]:
# Select categorical/object dtype columns
cat_cols = df_untidy.select_dtypes(include=['object']).columns
print("Categorical Columns:", cat_cols.tolist())


Categorical Columns: ['Height_cm', 'Rating', 'Fruit', 'Group', 'IsActive']


In [4]:
# Clean Height_cm -> remove "cm" and convert to float
df_untidy['Height_cm_clean'] = df_untidy['Height_cm'].replace('cm', '', regex=True).astype(float)

# Clean Rating -> remove "Rating: " and convert to float
df_untidy['Rating_clean'] = df_untidy['Rating'].replace('Rating: ', '', regex=True).astype(float)


In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_untidy['IsActive_encoded'] = le.fit_transform(df_untidy['IsActive'].fillna("Unknown"))
df_untidy[['IsActive', 'IsActive_encoded']].head()


Unnamed: 0,IsActive,IsActive_encoded
0,Yes,1
1,No,0
2,No,0
3,,2
4,No,0


- Q3. Detect any columns in the DataFrame that contain mixed data types (such as numbers stored as strings or strings with prefixes). Write code to clean and convert these columns to appropriate, consistent types.

In [6]:
# Check column dtypes
print(df_untidy.dtypes)

# Look for object columns that should be numeric
for col in df_untidy.columns:
    if df_untidy[col].dtype == 'object':
        print(f"\nUnique sample values in {col}:")
        print(df_untidy[col].dropna().unique()[:10])  # show first 10 unique values


Score               float64
Height_cm            object
Rating               object
Fruit                object
Group                object
IsActive             object
Height_cm_clean     float64
Rating_clean        float64
IsActive_encoded      int64
dtype: object

Unique sample values in Height_cm:
[16.480033632478424 49.244710931099114 '31.599083317303908cm'
 45.22428568706372 25.652659700690286 36.253727839177266
 23.078727432918583 17.175606986186637 28.672395098479146
 20.531241452266364]

Unique sample values in Rating:
[2.0 'Rating: 5.0' 3.0 1.0 5.0 'Rating: 2.0' 4.0 'Rating: 3.0'
 'Rating: 1.0' 'Rating: 4.0']

Unique sample values in Fruit:
['banana' 'apple' 'grape' 'nan']

Unique sample values in Group:
['D' 'A' 'B' 'C']

Unique sample values in IsActive:
['Yes' 'No' 'nan']


In [7]:
df_untidy['Height_cm_clean'] = (
    df_untidy['Height_cm']
    .astype(str)                 # ensure string for replace
    .str.replace('cm', '', regex=False)
    .replace('nan', np.nan)      # turn "nan" back into np.nan
    .astype(float)
)


In [8]:
df_untidy['Rating_clean'] = (
    df_untidy['Rating']
    .astype(str)
    .str.replace('Rating: ', '', regex=False)
    .replace('nan', np.nan)
    .astype(float)
)


In [9]:
print(df_untidy[['Height_cm', 'Height_cm_clean']].head(10))
print(df_untidy[['Rating', 'Rating_clean']].head(10))


              Height_cm  Height_cm_clean
0                   NaN              NaN
1             16.480034        16.480034
2             49.244711        49.244711
3                   NaN              NaN
4  31.599083317303908cm        31.599083
5                   NaN              NaN
6             45.224286        45.224286
7              25.65266        25.652660
8             36.253728        36.253728
9                   NaN              NaN
        Rating  Rating_clean
0          2.0           2.0
1  Rating: 5.0           5.0
2          NaN           NaN
3          3.0           3.0
4          NaN           NaN
5          1.0           1.0
6          3.0           3.0
7          3.0           3.0
8          5.0           5.0
9          2.0           2.0


- Q4. Apply scaling and/or normalization techniques (such as Min-Max Scaling and Standardization) to the numerical columns to prepare them for downstream machine learning tasks.

In [10]:
num_cols = ['Score', 'Height_cm_clean', 'Rating_clean']
print(df_untidy[num_cols].head())
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_minmax = pd.DataFrame(
    scaler.fit_transform(df_untidy[num_cols]),
    columns=[col + '_minmax' for col in num_cols]
)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_standard = pd.DataFrame(
    scaler.fit_transform(df_untidy[num_cols]),
    columns=[col + '_std' for col in num_cols]
)
df_scaled = pd.concat([df_untidy, df_minmax, df_standard], axis=1)
print(df_scaled.head())


      Score  Height_cm_clean  Rating_clean
0 -1.085631              NaN           2.0
1  0.997345        16.480034           5.0
2  0.282978        49.244711           NaN
3 -1.506295              NaN           3.0
4 -0.578600        31.599083           NaN
      Score             Height_cm       Rating   Fruit Group IsActive  \
0 -1.085631                   NaN          2.0  banana     D      Yes   
1  0.997345             16.480034  Rating: 5.0   apple     A       No   
2  0.282978             49.244711          NaN  banana     B       No   
3 -1.506295                   NaN          3.0   grape     D      nan   
4 -0.578600  31.599083317303908cm          NaN  banana     C       No   

   Height_cm_clean  Rating_clean  IsActive_encoded  Score_minmax  \
0              NaN           2.0                 1      0.346613   
1        16.480034           5.0                 0      0.683137   
2        49.244711           NaN                 0      0.567725   
3              NaN           3.

- Q5. Write a function to check for and report any remaining inconsistencies (missing values, mixed types, out-of-range values) in the cleaned DataFrame. Validate that the preprocessing steps have successfully prepared the data for analysis.

In [11]:
import numpy as np
import pandas as pd

def validate_dataframe(df, numeric_ranges=None):
    """
    Validate a cleaned DataFrame for analysis readiness.
    
    Parameters:
    -----------
    df : pd.DataFrame
        The DataFrame to check.
    numeric_ranges : dict, optional
        Dictionary of expected numeric ranges, e.g.,
        {"Rating_clean": (1, 5), "Height_cm_clean": (0, 300)}
    
    Returns:
    --------
    report : dict
        Summary of issues found.
    """
    report = {}

    # 1. Missing values
    missing = df.isnull().sum()
    report["missing_values"] = missing[missing > 0].to_dict()

    # 2. Mixed types
    mixed_types = {}
    for col in df.columns:
        types = df[col].dropna().map(type).unique()
        if len(types) > 1:
            mixed_types[col] = types
    report["mixed_types"] = mixed_types

    # 3. Out-of-range values
    out_of_range = {}
    if numeric_ranges:
        for col, (low, high) in numeric_ranges.items():
            if col in df.columns:
                bad_vals = df[(df[col] < low) | (df[col] > high)][col]
                if not bad_vals.empty:
                    out_of_range[col] = bad_vals.tolist()
    report["out_of_range"] = out_of_range

    return report


In [12]:
# Define expected ranges
ranges = {
    "Rating_clean": (1, 5),       # ratings must be between 1 and 5
    "Height_cm_clean": (0, 300),  # heights in cm must be reasonable
}

# Validate
report = validate_dataframe(df_untidy, numeric_ranges=ranges)
print(report)


{'missing_values': {'Score': 60, 'Height_cm': 60, 'Rating': 60, 'Height_cm_clean': 60, 'Rating_clean': 60}, 'mixed_types': {'Height_cm': array([<class 'float'>, <class 'str'>], dtype=object), 'Rating': array([<class 'float'>, <class 'str'>], dtype=object)}, 'out_of_range': {}}
