# In [None]:
# Step 1: Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Load the Dataset
# 'adult.csv' must be in the current working directory.
# NOTE(review): assumes this is the UCI Adult census file, which pads fields with a
# space after each comma and marks unknown entries with '?' -- confirm against the file.
# Without these options the '?' placeholders are read as ordinary strings, so the
# missing-value report would show zero, and category labels would keep a leading space.
df = pd.read_csv(
    'adult.csv',
    skipinitialspace=True,  # drop the blank that follows each delimiter
    na_values='?',          # treat '?' placeholders as NaN (in addition to pandas' defaults)
)
df.columns = df.columns.str.strip()  # Clean column names by removing any remaining extra spaces

# Step 3: Basic Exploration
# Preview the first rows and list the available columns.
print("First 5 rows of the dataset:", df.head(5), sep="\n")
print("\nColumn Names:", df.columns, sep="\n")

# Frequency table for each of the two columns the analysis focuses on.
for column in ('sex', 'income'):
    print(f"\nValue counts for '{column}':")
    print(df[column].value_counts())

# Step 4: Check for Missing Values
# Count the NaN entries in every column.
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Step 5: Analyze Gender Representation Across Income Levels
# Rows are 'sex', columns are 'income'. normalize='columns' makes each income
# column sum to 1, so after scaling every cell is the percentage share of that
# gender within the income level.
gender_income_distribution = pd.crosstab(
    index=df['sex'],
    columns=df['income'],
    normalize='columns',
).mul(100)
print("\nGender distribution across income levels (%):", gender_income_distribution, sep="\n")

# Step 6: Visualize the Gender Distribution
# Grouped bar chart: one group per income level, one bar per gender (raw row counts).
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='income', hue='sex', ax=ax)
ax.set_title("Gender Distribution Across Income Levels")
ax.set_xlabel("Income Level")
ax.set_ylabel("Count")
ax.legend(title="Gender")  # relabel the legend seaborn titles after the hue column
fig.tight_layout()
plt.show()

# Step 7: Summary
# Closing notes on how to read the percentages and the chart above.
summary_lines = (
    "\nSummary:",
    "This analysis shows how men and women are represented in the income groups.",
    "If there is a significantly lower percentage of females in the >50K group compared to the <=50K group, this may indicate a representation bias.",
)
print("\n".join(summary_lines))