In [2]:
import pandas as  pd


# Adult dataset hosted by the UCI ML repository. Column names are supplied
# explicitly through `names=` rather than read from the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race",
    "sex", "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income"
]
# Fields are separated by a comma followed by optional whitespace. The pattern
# is a raw string so that "\s" reaches the regex engine intact instead of being
# treated as an (invalid) Python string escape. Regex separators need the
# Python parsing engine.
df = pd.read_csv(url, names=columns, sep=r',\s*', engine="python")

In [3]:
# Number of columns in the loaded frame.
len(df.columns)

15

In [4]:
# List the column labels currently set on the frame.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [17]:
# Title-case the column labels in place (e.g. "education-num" -> "Education-Num"),
# then display the frame. Only the labels change; cell values are untouched.
df.columns= df.columns.str.title()
df


Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-Per-Week,Native-Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [25]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Age                int64
Workclass         object
Fnlwgt             int64
Education         object
Education-Num      int64
Marital-Status    object
Occupation        object
Relationship      object
Race              object
Sex               object
Capital-Gain       int64
Capital-Loss       int64
Hours-Per-Week     int64
Native-Country    object
Income            object
dtype: object

In [None]:
def quick_data_check(df):
    """
    Build a one-row-per-column summary of common text/data quality issues.

    Every column is viewed as text (via ``astype(str)``) and checked for:
    - leading/trailing whitespace
    - runs of two or more whitespace characters
    - values containing both upper- and lower-case letters
    - characters outside letters, digits, space, and ``, . _ -``

    The number of unique values per column is reported as well.
    The input DataFrame is only read, never modified.

    Returns:
    - pandas DataFrame with columns: column, has_spaces_edges,
      has_multiple_spaces_inside, has_mixed_case, has_special_chars,
      unique_count
    """
    def _inspect(name):
        # Text view of the column; non-string values are stringified first.
        as_text = df[name].astype(str)
        trimmed = as_text.str.strip()
        return {
            "column": name,
            "has_spaces_edges": (trimmed != as_text).any(),
            "has_multiple_spaces_inside": as_text.str.contains(r'\s{2,}').any(),
            "has_mixed_case": as_text.str.contains(r'[A-Z].*[a-z]|[a-z].*[A-Z]').any(),
            "has_special_chars": as_text.str.contains(r'[^a-zA-Z0-9 ,._-]').any(),
            # Counted on the original values, not the text view.
            "unique_count": df[name].nunique(),
        }

    return pd.DataFrame([_inspect(name) for name in df.columns])


In [33]:
# Run the read-only scan on the loaded data and display the per-column report.
df_report = quick_data_check(df)
df_report


Unnamed: 0,column,has_spaces_edges,has_multiple_spaces_inside,has_mixed_case,has_special_chars,unique_count
0,Age,False,False,False,False,73
1,Workclass,False,False,True,True,9
2,Fnlwgt,False,False,False,False,21648
3,Education,False,False,True,False,16
4,Education-Num,False,False,False,False,16
5,Marital-Status,False,False,True,False,7
6,Occupation,False,False,True,True,15
7,Relationship,False,False,True,False,6
8,Race,False,False,True,False,5
9,Sex,False,False,True,False,2


In [35]:
import pandas as pd
import re

def scan_and_clean(df, case_style="title"):
    """
    Scans DataFrame for common text issues and automatically cleans them.

    Only values that are actual ``str`` objects are inspected and rewritten.
    Missing values and non-string values (numbers, booleans, timestamps, ...)
    pass through unchanged, so numeric/bool/datetime columns are never touched
    and missing values are never turned into the literal text "nan".

    Cleaning steps, applied in order to each column:
    1. strip leading/trailing whitespace
    2. collapse runs of two or more whitespace characters into one space
    3. if any value contains both upper- and lower-case letters, convert
       every string in the column to ``case_style``

    Each step's flag reflects the column's state after the previous steps,
    so a flag is True only when that step actually had something to act on.

    Parameters:
    - df: pandas DataFrame (not modified; a copy is cleaned)
    - case_style: 'lower', 'upper', or 'title' for standardizing text
      (matched case-insensitively)

    Returns:
    - cleaned_df: cleaned DataFrame
    - report_df: DataFrame summarizing what was fixed per column, with
      columns: column, stripped_spaces, fixed_multiple_spaces, fixed_case,
      has_special_chars, unique_count

    Raises:
    - ValueError: if case_style is not one of the supported styles
    """
    case_converters = {"title": str.title, "lower": str.lower, "upper": str.upper}
    style = case_style.lower()
    if style not in case_converters:
        # Fail loudly instead of skipping the conversion while reporting a fix.
        raise ValueError(
            f"case_style must be one of {sorted(case_converters)}, got {case_style!r}"
        )
    convert_case = case_converters[style]

    multi_space = re.compile(r'\s{2,}')
    mixed_case = re.compile(r'[A-Z].*[a-z]|[a-z].*[A-Z]')

    def _on_strings(series, func):
        # Apply func to str elements only; everything else passes through.
        mapped = series.map(
            lambda value: func(value) if isinstance(value, str) else value
        )
        # Keep the column's dtype (e.g. a dedicated string dtype) stable.
        return mapped.astype(series.dtype)

    def _any_string(series, predicate):
        # True if at least one str element satisfies predicate.
        return any(predicate(value) for value in series if isinstance(value, str))

    df_clean = df.copy()
    report = []

    for col in df.columns:
        original = df_clean[col]
        working = original
        if isinstance(working.dtype, pd.CategoricalDtype):
            # Work on the underlying values; rewritten text may not match
            # the existing categories.
            working = working.astype(object)

        col_report = {"column": col}
        modified = False

        # 1. Leading/trailing spaces
        has_edge_spaces = _any_string(working, lambda s: s != s.strip())
        if has_edge_spaces:
            working = _on_strings(working, str.strip)
            modified = True
        col_report["stripped_spaces"] = has_edge_spaces

        # 2. Multiple spaces inside
        has_multi_spaces = _any_string(
            working, lambda s: multi_space.search(s) is not None
        )
        if has_multi_spaces:
            working = _on_strings(working, lambda s: multi_space.sub(' ', s))
            modified = True
        col_report["fixed_multiple_spaces"] = has_multi_spaces

        # 3. Mixed casing
        has_mixed_case = _any_string(
            working, lambda s: mixed_case.search(s) is not None
        )
        if has_mixed_case:
            working = _on_strings(working, convert_case)
            modified = True
        col_report["fixed_case"] = has_mixed_case

        # 4. Unusual characters: flagged only, never removed. Checked on the
        #    stringified original values, like quick_data_check does.
        col_report["has_special_chars"] = bool(
            original.astype(str).str.contains(r'[^a-zA-Z0-9 ,._-]').any()
        )

        # Write back only when something changed so untouched columns keep
        # their original dtype.
        if modified:
            df_clean[col] = working

        # 5. Unique values (after cleaning)
        col_report["unique_count"] = df_clean[col].nunique()

        report.append(col_report)

    report_df = pd.DataFrame(report)
    return df_clean, report_df


In [36]:
# scan_and_clean works on a copy, so `df` keeps its original values and the
# cleaned data lives in `clean_df`.
clean_df, report = scan_and_clean(df, case_style="title")
# End the cell on the bare name so Jupyter renders the report as a table
# instead of the wrapped plain text that print() produces.
report


            column  stripped_spaces  fixed_multiple_spaces  fixed_case  \
0              Age            False                  False       False   
1        Workclass            False                  False        True   
2           Fnlwgt            False                  False       False   
3        Education            False                  False        True   
4    Education-Num            False                  False       False   
5   Marital-Status            False                  False        True   
6       Occupation            False                  False        True   
7     Relationship            False                  False        True   
8             Race            False                  False        True   
9              Sex            False                  False        True   
10    Capital-Gain            False                  False       False   
11    Capital-Loss            False                  False       False   
12  Hours-Per-Week            False   

In [None]:
# NOTE(review): this displays `df`, not the cleaned copy `clean_df` returned by
# scan_and_clean — confirm which frame is meant to be shown here.
df